├── maxwell-solver
│   ├── __init__.py
│   ├── gce
│   │   ├── __init__.py
│   │   ├── const.py
│   │   ├── data.py
│   │   ├── out.py
│   │   ├── README.md
│   │   ├── kernel.cu
│   │   ├── space.py
│   │   ├── grid.py
│   │   └── kernel.py
│   ├── solvers
│   │   ├── __init__.py
│   │   └── test_bicg.py
│   ├── kernels
│   │   ├── alpha_allpre.cu
│   │   ├── fdfd_matrix_multiplication.cu
│   │   ├── fdfd_residual.cu
│   │   ├── alpha_biCGSTAB.cu
│   │   ├── omega_bloch_allpre.cu
│   │   ├── fdfd_matrix_multiplication_pec_pmc.cu
│   │   ├── fdfd_residual_pec_pmc.cu
│   │   ├── omega_bloch_pmc_pec.cu
│   │   └── alpha_bloch_pmc_pec.cu
│   ├── fdfd.py
│   └── maxwell_ops_lumped.py
├── .gitignore
├── maxwell-server
│   ├── maxwell-solver
│   ├── unbuffered.py
│   ├── maxwell_config.py
│   ├── webserver.py
│   └── simserver.py
├── start_maxwell_docker
├── maxwellfdfd.service
├── run_docker
├── start_maxwell
├── Dockerfile
├── maxwell_sweeper.py
├── README
└── LICENSE

/maxwell-solver/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
**/__pycache__/**
--------------------------------------------------------------------------------
/maxwell-solver/gce/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maxwell-solver/solvers/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maxwell-server/maxwell-solver:
--------------------------------------------------------------------------------
../maxwell-solver/
--------------------------------------------------------------------------------
/start_maxwell_docker:
--------------------------------------------------------------------------------
#!/bin/bash -e
source /pyenv/bin/activate
python3 maxwell-server/simserver.py $NGPUS &> simserver.log &
python3 maxwell-server/webserver.py $PORT &> webserver.log
--------------------------------------------------------------------------------
/maxwell-server/unbuffered.py:
--------------------------------------------------------------------------------
# Class for flushing a stream after every write.

class Unbuffered(object):
    def __init__(self, stream):
        self.stream = stream
    def write(self, data):
        self.stream.write(data)
        self.stream.flush()
    def __getattr__(self, attr):
        return getattr(self.stream, attr)

--------------------------------------------------------------------------------
/maxwellfdfd.service:
--------------------------------------------------------------------------------
[Unit]
Description=MaxwellFDFD webserver and simserver
After=network-online.target nss-lookup.target
Wants=network-online.target nss-lookup.target

[Service]
WorkingDirectory=/home/maxwell
Type=forking
ExecStart=/usr/bin/sudo -u maxwell /home/maxwell/start_maxwell

[Install]
WantedBy=multi-user.target
--------------------------------------------------------------------------------
/maxwell-server/maxwell_config.py:
--------------------------------------------------------------------------------
""" Configuration file for Maxwell.

Holds constants and such...
4 | """ 5 | 6 | import os 7 | 8 | path = os.environ["MAXWELL_SERVER_FILES"] 9 | 10 | if not os.path.exists(path): 11 | os.makedirs(path) 12 | 13 | def list_requests(): 14 | return [f for f in os.listdir(path) \ 15 | if f[-len('.request'):] == '.request'] 16 | -------------------------------------------------------------------------------- /run_docker: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Builds and runs maxwell in Docker. 3 | PORT=9041 4 | MAXWELL_DOCKER_VOL=maxwell-vol 5 | MAXWELL_SERVER_FILES=/mnt/maxwell-server-files 6 | 7 | docker build -t maxwell . 8 | docker run --runtime=nvidia \ 9 | --mount "source=$MAXWELL_DOCKER_VAL,target=$MAXWELL_SERVER_FILES" \ 10 | -d \ 11 | -p $PORT:$PORT \ 12 | -e PORT=$PORT \ 13 | -e MAXWELL_SERVER_FILES="$MAXWELL_SERVER_FILES" \ 14 | -e NGPUS=1 \ 15 | maxwell #tail -f /dev/null 16 | -------------------------------------------------------------------------------- /start_maxwell: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Simple script for starting up maxwell 4 | 5 | # Location to store temporary files. 6 | MAXWELL_SERVER_FILES=/tmp/maxwell-server-files 7 | # Port to use for webserver. 8 | PORT=9041 9 | # Number of GPUS per solve. 10 | NGPUS=1 11 | 12 | # Main directory for Maxwell source code. 13 | BASEDIR=. 14 | # Location of Python virtualenv containing Maxwell dependencies. 15 | PYENV=maxwell-solver-env 16 | SERVER_DIR=maxwell-server 17 | PYTHON=python3 18 | 19 | cd $BASEDIR 20 | 21 | source $PYENV/bin/activate 22 | $PYTHON $SERVER_DIR/webserver.py $PORT &> $BASEDIR/webserver.log & 23 | $PYTHON $SERVER_DIR/simserver.py $NGPUS &> $BASEDIR/simserver.log 24 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-devel 2 | 3 | # Do as much installation as possible to make use of caching as installing 4 | # is very slow. 5 | # A few comments: 6 | # 1) openmpi seems to give trouble so use mpich2. 7 | # 2) Use a virtualenv to avoid outdated system packages (i.e. six). 8 | RUN apt-get update && \ 9 | apt-get install -y python3-pip \ 10 | python3-setuptools \ 11 | libhdf5-serial-dev \ 12 | mpich 13 | 14 | RUN pip3 install virtualenv 15 | RUN virtualenv -p python3 pyenv 16 | 17 | RUN /pyenv/bin/pip3 install numpy 18 | RUN /pyenv/bin/pip3 install pycuda jinja2 h5py mpi4py 19 | RUN /pyenv/bin/pip3 install scipy 20 | 21 | WORKDIR /app 22 | COPY . /app 23 | 24 | EXPOSE 9041 25 | 26 | CMD ["./start_maxwell_docker"] 27 | -------------------------------------------------------------------------------- /maxwell-solver/gce/const.py: -------------------------------------------------------------------------------- 1 | """ Defines the Const class for GCE. """ 2 | 3 | from pycuda import gpuarray as ga 4 | from gce.data import Data 5 | import numpy as np 6 | 7 | class Const(Data): 8 | """ Const class for GCE. 9 | 10 | Used to store globally accessible, but unchangeable data. 11 | 12 | Derives from the Data class. 13 | 14 | New methods: 15 | __init__ -- Store an array as a Const on the GPU. 16 | """ 17 | 18 | def __init__(self, array): 19 | """ Create a Const. 20 | 21 | Input variables 22 | array -- a numpy array of valid dtype. 23 | """ 24 | 25 | self._set_gce_type('const') 26 | if type(array) is not np.ndarray: # Make sure we actually got an array. 
            raise TypeError('Array must be a numpy ndarray.')

        self._get_dtype(array.dtype.type) # Validate the array's dtype.
        self.to_gpu(array) # Load onto device.

--------------------------------------------------------------------------------
/maxwell-solver/solvers/test_bicg.py:
--------------------------------------------------------------------------------
import numpy as np
import unittest
import bicg

n = 10
A0 = np.random.randn(n, n)
b0 = np.random.randn(n)

class TestBicg(unittest.TestCase):
    def test_asymm(self):
        A = A0
        b = b0

        def multA(x, y):
            y[:] = np.dot(A, x)

        def multAT(x, y):
            y[:] = np.dot(A.T, x)

        ops = {'multA': multA, 'multAT': multAT}

        x, err, success = bicg.solve_asymm(b, **ops)
        self.assertTrue(success)

    def test_symm(self):
        A = np.dot(A0.T, A0) # Make A symmetric.
        b = b0

        def multA(x, y):
            y[:] = np.dot(A, x)

        ops = {'multA': multA}

        x, err, success = bicg.solve_symm(b, **ops)
        self.assertTrue(success)

    def test_zlumped(self):
        A = np.dot(A0.T, A0) # Make A symmetric.
        b = b0

        def alpha_step(rho_k, rho_k_1, p, r, v):
            p[:] = r + (rho_k / rho_k_1) * p
            v[:] = np.dot(A, p)
            return rho_k / np.dot(p, v) # Return alpha.

        def rho_step(alpha, p, r, v, x):
            x[:] = x + alpha * p
            r[:] = r - alpha * v

            # Return rho and err.
            return np.dot(r, r), np.sqrt(np.dot(np.conj(r), r))

        def zeros():
            return np.zeros_like(b)
        ops = {'rho_step': rho_step, 'alpha_step': alpha_step, 'zeros': zeros}

        x, err, success = bicg.solve_symm_lumped(b, **ops)
        self.assertTrue(success)

if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/maxwell-solver/gce/data.py:
--------------------------------------------------------------------------------
""" Defines the Data class for GCE. """

from gce.space import get_space_info
import numpy as np
from pycuda import gpuarray as ga

class Data:
    """ Generic data class for GCE.

    The Grid, Const, and Out classes are derived from this class.

    Only supports datatypes: np.int32, np.float32, np.float64, np.complex64,
    and np.complex128.

    Functions:
    to_gpu -- Load a numpy array on to the GPU.
    get -- Transfer data back to host memory.

    Variables:
    data -- GPUArray instance.
    dtype -- Numpy datatype of the data.
    cuda_type -- Corresponding cuda type of the data.
    """

    def _get_dtype(self, dtype):
        """ Certify that the dtype is valid, and find the cuda datatype. """
        if dtype not in (np.int32, np.float32, np.float64, np.complex64, np.complex128):
            raise TypeError('Array is of an unsupported dtype.')

        self.dtype = dtype # The numpy datatype.

        cuda_dict = {np.float32: 'float', np.float64: 'double', \
                     np.int32: 'int', \
                     np.complex64: 'pycuda::complex<float>', \
                     np.complex128: 'pycuda::complex<double>'}

        self.cuda_type = cuda_dict[self.dtype] # Corresponding cuda datatype.

    def _set_gce_type(self, type):
        """ Set whether we have a Grid, Const, or Out. """
        if type in ('grid', 'const', 'out'):
            self.gce_type = type
        else:
            raise TypeError('Invalid gce type.')

    def to_gpu(self, array):
        """ Load data to the gpu. """
""" 48 | self.data = ga.to_gpu(array) 49 | 50 | def get(self): 51 | """ Get data from the gpu. """ 52 | return self.data.get() 53 | -------------------------------------------------------------------------------- /maxwell-solver/gce/out.py: -------------------------------------------------------------------------------- 1 | """ Defines the Out class for GCE. """ 2 | 3 | from pycuda import gpuarray as ga 4 | from pycuda.reduction import ReductionKernel 5 | from gce.space import get_space_info 6 | from gce.data import Data 7 | import numpy as np 8 | from mpi4py.MPI import COMM_WORLD as comm 9 | 10 | 11 | class Out(Data): 12 | """ Out class for GCE. 13 | 14 | Outs store reduction operations. Outs allow for reduction operations in 15 | the GCE framework by storing intermediary (y,z) values during a kernel 16 | operation, which are then reduced into a single value. See the Kernel 17 | class for additional information. 18 | 19 | Currently only the "sum" operation is supported. 20 | 21 | Derives from the Data class. 22 | 23 | New methods: 24 | __init__ -- Create an Out of a particular dtype and operation. 25 | get -- Redefined to retrieve the result of the reduction. 26 | 27 | """ 28 | 29 | def __init__(self, dtype, op='sum'): 30 | """ Create an Out. 31 | 32 | Input variables 33 | dtype -- numpy dtype. 34 | 35 | Keyword variables 36 | op -- type of reduction operation to perform. Default='sum'. 37 | At this time, only the "sum" operation is supported. 38 | """ 39 | 40 | self._set_gce_type('out') 41 | self._get_dtype(dtype) # Validate dtype. 42 | 43 | if op not in ('sum','prod'): # Validate op. 44 | raise TypeError('Invalid op.') 45 | self.op = op 46 | 47 | # Obtain the neutral value and store it in the result variable. 48 | neutral_val = {'sum': 0, 'prod': 1} 49 | 50 | # Create the intermediary values. 51 | shape = get_space_info()['shape'] 52 | self.to_gpu((neutral_val[op] * \ 53 | np.ones((1, shape[1], shape[2]))).astype(self.dtype)) 54 | 55 | def reduce(self): 56 | """ Compute the result. """ 57 | self.result = comm.allreduce(ga.sum(self.data).get()) 58 | 59 | def get(self): 60 | """ Redefine get() to return the result of the operation. """ 61 | return self.result 62 | 63 | 64 | def batch_reduce(*outs): 65 | """ Optimal (compared to self.reduce) when communication cost is latency bound. """ 66 | results = comm.allreduce(np.array([ga.sum(out.data).get() for out in outs])) 67 | for k in range(len(outs)): 68 | outs[k].result = results[k] 69 | 70 | -------------------------------------------------------------------------------- /maxwell_sweeper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """ Removes the temporary files for Maxwell automatically. 3 | 4 | The temporary files in Maxwell tend to build up over time and these files 5 | eventually take up enough space to warrant removing them. Rather than manually 6 | performing this task, this script automatically removes that are older than 7 | a certain date (default: 7 days). This script is intended to be placed in 8 | cron folder (be sure to give the script executable permissions). 9 | """ 10 | import datetime 11 | import logging 12 | import os 13 | 14 | # Temporary maxwell server files directory to sweep. 15 | MAXWELL_SERVER_FILES_DIR = os.environ["MAXWELL_SERVER_FILES"] 16 | # Number of days to retain temporary files. 17 | DELETE_THRESHOLD_DAY = 7 18 | # Logging format to use. 
LOG_FORMAT = '[%(asctime)-15s][%(levelname)s][%(module)s][%(funcName)s] %(message)s'
# Place to store logs.
LOG_LOCATION = '/home/maxwell/maxwell-sweeper.log'

# Append to log file so that script can be run multiple times.
logging.basicConfig(filename=LOG_LOCATION, filemode='a',
                    format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

def main():
    logger.info('Beginning sweep...')
    # Keep track of the number of deleted files.
    deleted_files = 0
    # List all the files.
    for filename in os.listdir(MAXWELL_SERVER_FILES_DIR):
        fullpath = os.path.join(MAXWELL_SERVER_FILES_DIR, filename)
        try:
            # Get the last modified time and compare against current time.
            # Note that it is safer to retrieve the last modified timestamp
            # before obtaining the current timestamp to avoid the situation
            # where `last_modified > now`.
            last_modified = datetime.datetime.fromtimestamp(
                os.path.getmtime(fullpath))
            now = datetime.datetime.today()

            delta_time = now - last_modified
            if delta_time.days > DELETE_THRESHOLD_DAY:
                logger.debug('Removing {0} ({1} days old)...'.format(
                    fullpath, delta_time.days))
                os.remove(fullpath)
                deleted_files += 1
        except:
            logger.exception('Error handling {0}'.format(fullpath))
    logger.info('Sweep finished. Removed {0} files.'.format(deleted_files))

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/maxwell-solver/gce/README.md:
--------------------------------------------------------------------------------
TODO
====

* Clean up and update documentation.


What is GCE?
============
GCE stands for Grid Compute Engine, written by Jesse Lu in early 2012.


What does GCE do?
=================
GCE makes it easy to write fast 3D finite-difference applications
for CUDA.


How does GCE work?
==================
GCE provides a simple interface for manipulating gridded 3D data
on the GPU.
GCE is based on simple memory and execution objects
which hide non-essential features and details,
allowing applications to be defined in a simple, abstract way.


What is GCE built on?
=====================
GCE is heavily dependent on PyCUDA.


Interface overview
==================

For a simple example of GCE at work, see test_example.py.

Space
-----
The space forms the context for colocating grids and kernels.
For example, creating two grids on the same space tells GCE that
these two grids should be overlaid on top of each other.
In the same way, defining a kernel on the space defines which grid elements
will be updated.
As such, the space contains all the intra-process communication elements
needed to synchronize grids and to execute kernels in parallel.
Also, the space contains all the device information needed to run kernels
on the GPU devices.

Currently, the creation of only one global space is supported.

Grid
----
Grids represent three-dimensional fields.
To efficiently operate on Grids, every element in a Grid has limited visibility.
This means that when operating on a Grid (with a Kernel),
only certain adjacent neighboring elements may be accessed.
Specifically, only elements within a cube of length 2n+1
(where n, the stencil size, is specified by the user) may be accessed.

Const
-----
Consts are global constant arrays that can be accessed by any element of any grid.
However, the values of Const elements may not be reliably changed,
since such changes are not synchronized across devices.

Out
---
Outs are global scalars that are used to store the result of reduce operations
on a space.
Currently only sum operations are supported.

Kernel
------
Kernels perform operations on Grids and
accept Grids, Consts, and Outs as input parameters.
Kernels perform both update and sum functions.
Additionally, Kernels automatically provide self-optimization features
such as block_size optimization and loop-unrolling.

Writing code for Kernels
------------------------
GCE provides a number of simple conventions to make writing kernel code
simpler:

* Relative addressing of Grid elements.
  Computations that must be performed at every point on a Grid can be
  described using relative indexing.
  For example, copying from one Grid to another can be performed in the
  following way:
      x(0,0,0) = y(0,0,0);

Optimization tips
=================
* Remember to turn ECC off via 'nvidia-smi -e 0';
  this can later be checked using the test_pycuda_speed module.
--------------------------------------------------------------------------------
/maxwell-server/webserver.py:
--------------------------------------------------------------------------------
""" Web server for Maxwell.

Allows users to upload simulations and download results through HTTP.

Performs just three operations:
    1. Receive job as a file from client (POST).
    2. Return job status or simulation result to client (GET).
    3. Return queue status to client (HEAD).

Defaults to port 9041.
"""

import http.server
from io import StringIO
import cgi, shutil, tempfile, sys, os
from socketserver import ThreadingMixIn
import maxwell_config
from unbuffered import Unbuffered


class MaxwellHandler(http.server.BaseHTTPRequestHandler):
    """ Handler for the server. """

    def do_POST(self):
        """ Accepts files from client. """
        form = cgi.FieldStorage( \
            fp=self.rfile, \
            headers=self.headers, \
            environ={'REQUEST_METHOD':'POST', \
                     'CONTENT_TYPE':self.headers['Content-Type']})

        # "{ip}:" prefix added in front of the file name.
        try:
            filename = self.my_prefix() + form['key'].value
            f = open(filename, 'wb')
        except:
            self.send_error(400, "Upload failed.")
            return

        # Save file.
        shutil.copyfileobj(form['file'].file, f)
        f.close()

        # # Open permissions on file.
        # os.chmod(filename, 0777)

        self.send_response(200)
        self.send_header('Content-type', 'maxwell!')
        self.end_headers()

    def do_GET(self):
        """ Return file to client. """

        if self.path == '/': # No file specified, treat as HEAD request.
            self.do_HEAD()
            return

        fname = self.my_prefix() + self.path.lstrip('/')
        try:
            f = open(fname, 'rb')
        except:
            self.send_error(404, "File not found.")
            return

        self.send_response(200)
        self.send_header('Content-type', 'maxwell!')
        self.end_headers()
        shutil.copyfileobj(f, self.wfile)
        f.close()

        # If it ends with something like .E_xr then delete the file.
        ending = fname.split('.')[-1]
        if len(ending) == 4 and \
                ending[0] in 'EH' and \
                ending[1] == '_' and \
                ending[2] in 'xyz' and \
                ending[3] in 'ri':
            os.remove(fname)

        # print self.client_address

    def my_prefix(self):
        """ Produce the user-specific prefix for files. """
        return os.path.join(maxwell_config.path, self.client_address[0] + ':')

    def do_HEAD(self):
        """ Returns the number of jobs in queue. """
        self.send_response(200)
        self.send_header('Content-type', 'maxwell!')
        self.end_headers()

        num_requests = len(maxwell_config.list_requests())
        shutil.copyfileobj(StringIO("%d jobs pending (maxwell-server)" \
            % num_requests), self.wfile)


class ThreadingHTTPServer(ThreadingMixIn, http.server.HTTPServer):
    """ We use a multi-threaded version of HTTPServer. """
    pass


if __name__ == '__main__':
    sys.stdout = Unbuffered(sys.stdout)

    # Determine the port to use.
    if len(sys.argv) == 2:
        port = int(sys.argv[1])
    else:
        port = 9041

    server_address = ("", port)
    print("Serving at", server_address)

    httpd = ThreadingHTTPServer(server_address, MaxwellHandler)
    httpd.serve_forever()
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
Maxwell is a multi-GPU implementation of a finite-difference frequency-domain (FDFD) solver.
This code is intended to be used in conjunction with SPINS (github.com/stanfordnqp/spins-b).

Overview
========
Maxwell is implemented as a server to which SPINS can send simulations to be run.
This allows the actual simulation server (i.e. where the GPUs are) to be located separately
from where the rest of the optimization code is running, though it is recommended to keep
SPINS and Maxwell on the same machine if possible.

At its core, running Maxwell involves running two separate services:
1. A webserver `maxwell-server/webserver.py` that manages sending and receiving simulation data over HTTP.
2. A simserver `maxwell-server/simserver.py` that manages running the simulations.
Both services must be up and running for Maxwell to function properly.

Maxwell can be run in the following ways:
1) Use the Dockerfile provided. This is the preferred mechanism as it creates an isolated environment for Maxwell.
2) Manually launch the webserver and simserver. This allows for the most fine-grained control.


Docker
======
We have Dockerized the Maxwell solver to make solver maintenance easier.
In addition to the CUDA Toolkit, all other Maxwell dependencies are listed in the Dockerfile.

Installation
------------
1. Install Docker (http://docker.com).
2. Install the CUDA 10.0 Toolkit (https://developer.nvidia.com/cuda-10.0-download-archive).
3. Install NVIDIA-Docker (https://github.com/NVIDIA/nvidia-docker).
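
A quick way to verify a new installation is to talk to the webserver directly
over HTTP. The snippet below is an illustrative client sketch (not part of this
repository); it assumes the `requests` Python package and a server listening on
localhost port 9041. In normal operation SPINS performs these steps for you.

    import requests

    server = "http://localhost:9041"

    # Queue status: a GET on "/" is treated like a HEAD request and
    # returns an "N jobs pending (maxwell-server)" message.
    print(requests.get(server + "/").text)

    # Upload a job file (POST). The server saves it under
    # "<client-ip>:<key>" inside $MAXWELL_SERVER_FILES.
    with open("sim.request", "rb") as f:  # "sim.request" is a made-up name.
        requests.post(server, data={"key": "sim.request"}, files={"file": f})

    # Download a result file by name (GET).
    result = requests.get(server + "/sim.E_xr")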

Usage
-----
The Dockerfile is contained in the root Maxwell directory.
We have also provided a script to build and launch the Docker container:

    $ ./run_docker

To change the number of GPUs used per simulation, edit the `run_docker` script and set `NGPUS` to the desired value.

Docker Quick Reference
----------------------
To list all running containers,

    $ docker ps

To kill a container,

    $ docker kill [container-name-or-id]

To clean up all containers (dead containers still take up disk space),

    $ docker system prune --volumes

To examine the container state, launch an interactive bash session:

    $ docker exec -it [container-name-or-id] bash


Manual Installation
===================
Maxwell can also be installed manually. Follow the installation procedure
listed in the `Dockerfile`. See `./start_maxwell` for an example of how to
launch Maxwell manually.


Options
=======

1. Work directory: Maxwell requires a folder to store temporary data.
   This location is specified by the environment variable `MAXWELL_SERVER_FILES`,
   which must be set for Maxwell to run. Maxwell must have permissions to read
   and write to this directory.
2. Port number: The webserver by default runs on port 9041. This can be changed
   by changing the argument passed to `webserver.py`
   (e.g. `python webserver.py 9042`). If using Docker, change the `PORT` variable
   in `./run_docker`. Note that the port number change must be reflected in
   `spins` as well in order for this to work.
3. Number of GPUs per solve: By default, Maxwell will attempt to use 1 GPU
   per simulation. For larger simulations, it may be beneficial to use multiple
   GPUs per simulation. This can be done by changing the `NGPUS` value.


Troubleshooting
===============
1. Check the Maxwell server log files.

   Maxwell consists of two separate servers: `webserver.py` manages sending and receiving data
   and `simserver.py` manages running the actual simulations. Both servers save logs named `webserver.log` and
   `simserver.log` respectively.

2. Check the individual simulation log files.

   Every simulation maintains its own log file. This can be found under the `$MAXWELL_SERVER_FILES` directory.
   By default, Docker sets this location to be `/mnt/maxwell-server-files` in the container.


Acknowledgements
================
This code is primarily based off of https://github.com/JesseLu/maxwell-solver.
--------------------------------------------------------------------------------
/maxwell-solver/gce/kernel.cu:
--------------------------------------------------------------------------------
// These macros redefine the CUDA blocks and grids to be row-major,
// instead of column-major.
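// Note: threadIdx.x, the fastest-varying thread index, is mapped to _tz
// below, and the z-axis has unit stride in _MY_OFFSET -- so consecutive
// threads in a warp touch consecutive global-memory addresses, which
// keeps accesses coalesced.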

#define _tx threadIdx.z
#define _ty (signed int)(threadIdx.y - {{ padding[0] }})
#define _tz (signed int)(threadIdx.x - {{ padding[2] }})

#define _bx blockIdx.z
#define _by blockIdx.y
#define _bz blockIdx.x

#define _txx blockDim.z
#define _tyy (blockDim.y - {{ padding[0] + padding[1] }})
#define _tzz (blockDim.x - {{ padding[2] + padding[3] }})

#define _bxx gridDim.z
#define _byy gridDim.y
#define _bzz gridDim.x

// Use the complex-value definition and operators included with pycuda.
// This allows us to work with pycuda's GPUArray class.
#include <pycuda-complex.hpp>

// Defines row-major access to a 3D array.
// dx, dy, dz are shifts from the present location of the field.
// Note that there is an offset in the x-index.
#define _MY_OFFSET(dx,dy,dz) ((_X - {{ x_range[0] }} + dx) * {{ dims[1] }} * {{ dims[2] }} + \
                              (_Y + dy) * {{ dims[2] }} + (_Z + dz))

// Macros to access fields using the field(i,j,k) format,
// where dx, dy, and dz are RELATIVE offsets in the x, y, and z directions
// respectively.
{%- for p in params if p.gce_type == 'grid' %}
#define {{ p.name }}(dx,dy,dz) {{ p.name }}[_MY_OFFSET(dx,dy,dz)]
{%- endfor %}

// Constants. We have to have a crude work-around to avoid problems with
// trying to use pycuda::complex types in constant memory.
{# Commented out for now.
{%- for p in params if p.gce_type == 'const' %}
__constant__ {{ p.alt_type }} _{{ p.name }}_temp[{{ p.num_elems }}];
{%- endfor %}
{%- for p in params if p.gce_type == 'const' %}
{%- if p.cuda_type in ('pycuda::complex<float>', 'pycuda::complex<double>') %}
#define {{ p.name }}(i) {{ p.cuda_type }}(_{{ p.name }}_temp[i].x, _{{ p.name }}_temp[i].y)
{%- else %}
#define {{ p.name }}(i) _{{ p.name }}_temp[i]
{%- endif %}
{%- endfor %}
#}
{%- for p in params if p.gce_type == 'const' %}
#define {{ p.name }}(i) {{ p.name }}[i]
{%- endfor %}

// Dynamic allocation of shared memory.
extern __shared__ pycuda::complex<double> _gce_smem[];

__global__ void _gce_kernel(const int _x_start, const int _x_end,
{#- Add the fields as input parameters to the function. -#}
{#{%- for p in params if p.gce_type != 'const' -%}#}
{%- for p in params -%}
{% if p.gce_type == 'const' -%}
{{ p.cuda_type }}* {{ p.name }}
{% elif p.gce_type == 'number' -%}
{{ p.cuda_type }} {{ p.name }}
{% elif p.gce_type == 'out' -%}
{{ p.cuda_type }} *_{{ p.name }}_out
{% else -%}
{{ p.cuda_type }}* {{ p.name }}
{% endif -%}
{%- if not loop.last -%},
{%- else -%}) {% endif %} {% endfor %}
{
    // Global index variables which determine where you are in the space,
    // and subsequently which grid point you will access.
    int _X = _tx + _txx * _bx + _x_start;
    int _Y = _ty + _tyy * _by;
    int _Z = _tz + _tzz * _bz;

    // Threads that are responsible for a grid point.
    const bool _in_global = (((_Y >= 0) && (_Y < {{ dims[1] }})) && \
                             ((_Z >= 0) && (_Z < {{ dims[2] }})));

    // Threads that are not part of the thread block padding.
    const bool _in_local = (_ty >= 0) && (_ty < _tyy) && \
                           (_tz >= 0) && (_tz < _tzz);

    // Initialize the local variables for the Outs.
{%- for p in params if p.gce_type == 'out' %}
    // {{ p.cuda_type }} {{ p.name }} = {{ p.cuda_type }}(0);
    {{ p.cuda_type }} {{ p.name }} = _{{ p.name }}_out[_Y * {{ dims[2] }} + _Z];
{%- endfor %}

    // User-defined "pre-loop" code.
    {{ preloop_code }}

    for (; _X < _x_end ; _X += _txx) {
        // User-defined "loop" code.
        {{ loop_code }}
    }

    // Save outs of non-padding threads to global memory.
    if (_in_local && _in_global) {
{%- for p in params if p.gce_type == 'out' %}
        _{{ p.name }}_out[_Y * {{ dims[2] }} + _Z] = {{ p.name }};
{%- endfor %}
    }

    return;
}
--------------------------------------------------------------------------------
/maxwell-server/simserver.py:
--------------------------------------------------------------------------------
""" Simulation server for Maxwell.

Executes uploaded jobs.

Consists of an infinite loop which does the following:
    0. Check that GCE is not running.
    1. Find the oldest job.
    2. Run the solver on it.
    3. Repeat.

"""

import logging
import os
import os.path
import shlex
import subprocess
import sys
import time

import maxwell_config
import pycuda.driver
import unbuffered

LOG_FORMAT = '[%(asctime)-15s][%(levelname)s][%(module)s][%(funcName)s] %(message)s'
logging.basicConfig(format=LOG_FORMAT)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

def get_num_gpus():
    pycuda.driver.init()
    return pycuda.driver.Device.count()

def check_process_running(pname):
    # Check if a process with name 'pname' is running.
    p1 = subprocess.Popen(['ps', 'ax'],
                          stdout=subprocess.PIPE)
    p2 = subprocess.Popen(['grep', pname],
                          stdin=p1.stdout, stdout=subprocess.PIPE)
    p3 = subprocess.Popen(['grep', '-v', 'grep'],
                          stdin=p2.stdout, stdout=subprocess.PIPE)
    return p3.communicate()[0].find(b'\n') > -1

def find_oldest_job():
    req = maxwell_config.list_requests() # Get the requests.
    if not req:
        return None

    req_with_time = {}
    for r in req:
        req_with_time[r] = os.stat(os.path.join(maxwell_config.path, r)).st_ctime

    # Run the job with the earliest creation time.
    oldest_req = min(req_with_time, key=req_with_time.get)
    os.remove(os.path.join(maxwell_config.path, oldest_req))
    return oldest_req[:-len('.request')]

def main():
    sys.stdout = unbuffered.Unbuffered(sys.stdout)

    path_to_solver_dir = os.path.abspath(__file__).replace(
        __file__.split('/')[-1], 'maxwell-solver') + '/'
    logger.info('Solver directory set to {0}'.format(path_to_solver_dir))

    # Determine the number of GPUs on the system.
    num_gpus = get_num_gpus()
    logger.info('Number of GPUs detected on system: {0}'.format(num_gpus))

    # Determine the number of GPUs to use per solve (user input).
    if len(sys.argv) > 1:
        gpus_per_solve = int(sys.argv[1])
    else:
        gpus_per_solve = 2
    logger.info('Number of GPUs used per solve: {0}'.format(gpus_per_solve))

    if gpus_per_solve > num_gpus:
        raise ValueError('Number of GPUs is {0}, but number of GPUs requested '
                         'per solve is {1}, exceeding the number of available '
                         'GPUs.'.format(num_gpus, gpus_per_solve))

    # Generate GPU groupings.
    # Groupings take the form gpu1,gpu2,gpu3...
    # e.g. with 8 GPUs and 3 GPUs per solve we have:
    # solve_gpus = ['0,1,2','3,4,5']
    num_solves = num_gpus // gpus_per_solve
    solve_gpus = []
    for i in range(0, num_solves):
        start_num = i * gpus_per_solve
        solve_gpus.append(','.join(
            str(j) for j in range(start_num, start_num + gpus_per_solve)))

    # Managing loop.
    solve_obj = [None]*num_solves
    solve_paths = ['']*num_solves
    out_files = [None]*num_solves

    logger.info('Ready to accept simulations.')

    while True:
        time.sleep(1)

        # Check for solve completion.
        for i in range(len(solve_obj)):
            if solve_obj[i] and solve_obj[i].poll() is not None:
                logger.info('Simulation {0} ended with code {1}'.format(
                    i, solve_obj[i].returncode))

                # Close the output log file.
                out_files[i].close()

                # Used to let the user know that files can be downloaded.
                time.sleep(0.5)
                filepath = os.path.join(maxwell_config.path, solve_paths[i]
                                        + '.finished')
                f = open(filepath, 'w')
                logger.debug('Writing finished file at {0}'.format(filepath))
                f.write('{}'.format(solve_obj[i].returncode))
                f.close()

                # Delete the old job.
                solve_obj[i] = None
                out_files[i] = None
                solve_paths[i] = ''

        # Ensure that GCE is not running.
        if check_process_running('job_manager'):
            continue

        # Check for and start new solves.
        for i in range(len(solve_obj)):
            if solve_obj[i]:
                continue
            solve_paths[i] = find_oldest_job()
            if solve_paths[i]:
                logger.info('Solving {0} as simulation {1}'.format(
                    solve_paths[i], i))

                tmp_env = os.environ.copy()
                tmp_env['CUDA_VISIBLE_DEVICES'] = solve_gpus[i]
                #logger.debug('Environment provided: {0}'.format(tmp_env))

                out_file_log = os.path.join(maxwell_config.path,
                                            solve_paths[i] + '.log')
                out_files[i] = open(out_file_log, 'w')
                logger.debug('Outputting to log file: {0}'.format(out_file_log))

                command = ('mpirun -n ' + str(gpus_per_solve) + ' python ' +
                           path_to_solver_dir + 'fdfd.py ' +
                           os.path.join(maxwell_config.path, solve_paths[i]))
                logger.debug('Running command {0}'.format(command))

                solve_obj[i] = subprocess.Popen(shlex.split(command),
                                                stdout=out_files[i],
                                                stderr=subprocess.STDOUT,
                                                env=tmp_env)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/maxwell-solver/gce/space.py:
--------------------------------------------------------------------------------
""" Used to set up the global space for GCE. """

from mpi4py import MPI
from mpi4py.MPI import COMM_WORLD as comm
from pycuda import driver


def _init_gpu(comm):
    """ Chooses a gpu and creates a context on it. """
    # Find out how many GPUs are available to us on this node.
    driver.init()
    num_gpus = driver.Device.count()

    # Figure out the names of the other hosts.
    rank = comm.Get_rank() # Find out which process I am.
    name = MPI.Get_processor_name() # The name of my node.
    hosts = comm.allgather(name) # Get the names of all the other hosts.

    # Find out which GPU to take (by precedence).
    gpu_id = hosts[0:rank].count(name)
    if gpu_id >= num_gpus:
        raise TypeError('No GPU available.')


    # Create a context on the appropriate device.
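    # Worked example (illustrative, not from the source): if this rank
    # computed gpu_id == 2 on a node with num_gpus == 4, the loop below
    # tries devices 2, 3, 0, 1 in turn and keeps the first one for which
    # make_context() succeeds.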
    for k in range(num_gpus):
        try:
            device = driver.Device((gpu_id + k) % num_gpus)
            context = device.make_context()
        except:
            continue
        else:
            # print "On %s: process %d taking gpu %d of %d.\n" % \
            #     (name, rank, gpu_id+k, num_gpus)
            break

    return device, context # Return device and context.

# Global variable for the global space.
# The leading double underscore should prevent outside modules from accessing
# this variable.
__GLOBAL_SPACE = None

# Upon module initialization, claim a GPU and create a context on it.
__DEVICE, __CONTEXT = _init_gpu(comm)

import atexit
atexit.register(__CONTEXT.pop)

def initialize_space(shape):
    """ Form the space. """
    global __GLOBAL_SPACE, __DEVICE, __CONTEXT
    __GLOBAL_SPACE = __Space(shape, __DEVICE, __CONTEXT)

def get_space_info():
    """ Returns all the info needed about a space. """
    if __GLOBAL_SPACE is None: # Global space not yet initialized.
        raise TypeError('The global space is not initialized.')
    else:
        return __GLOBAL_SPACE.get_info()

def print_space_info():
    """ Prints out information about the space. """
    if __GLOBAL_SPACE is None: # Global space not yet initialized.
        raise TypeError('The global space is not initialized.')
    info = __GLOBAL_SPACE.get_info()
    for name, val in info.items():
        print(name, val)

# def destroy_space():
#     """ Set global space to none. """
#     global __GLOBAL_SPACE
#     __GLOBAL_SPACE.__del__()
#     __GLOBAL_SPACE = None

class __Space():
    """ Space forms the 3D context for Grid and Kernel objects.

    As of the current implementation, it is assumed that only one space
    will be created, and that all Const, Grid, and Kernel objects will
    operate on that space.

    """

    def __init__(self, shape, device, context):
        """ Constructor for the Space class.

        Input variables
        shape -- Three-element tuple of positive integers defining the size of
            the space in the x-, y-, and z-directions.

        """

        # Make sure shape has exactly three elements.
        if len(shape) != 3:
            raise TypeError('Shape must have exactly three elements.')

        # Make sure they are all integers.
        if any([type(s) is not int for s in shape]):
            raise TypeError('Shape must have only integer elements.')

        # Make sure all elements are positive.
        if any([s < 1 for s in shape]):
            raise TypeError('Shape must have only positive elements.')

        # # Make sure stencil is a single, non-negative integer.
        # if (type(stencil) is not int) or (stencil < 0):
        #     raise TypeError('Stencil must be a non-negative scalar integer.')
        #
        # Initialize the space.
        self.shape = shape

        # Get MPI information.
        rank = comm.Get_rank()
        size = comm.Get_size()

        # Nodes to pass forward and backward (along x) to.
        self.mpi_adj = {'forw': (rank+1)%size, 'back': (rank-1)%size}

        # Grid is too small to be partitioned.
        if (size > self.shape[0]):
            raise TypeError('Shape is too short along x to be partitioned.')

        # Create the context on the appropriate GPU.
        # self.device, self.context = self._init_gpu(comm)
        self.device = device
        self.context = context

        # Partition the space.
        # Each space is responsible for field[x_range[0]:x_range[1],:,:].
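        # Worked example (illustrative): shape[0] == 10 split across
        # size == 4 ranks gives x_range values (0, 2), (2, 5), (5, 7),
        # and (7, 10) for ranks 0-3 -- contiguous slabs that exactly
        # cover the x-axis.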
        get_x_range = lambda r: (int(self.shape[0] * (float(r) / size)), \
                                 int(self.shape[0] * (float(r+1) / size)))
        self.x_range = get_x_range(rank)

        self.all_x_ranges = [get_x_range(r) for r in range(size)]


    # def __del__(self):
    #     """ Pop the cuda context on cleanup. """
    #     # Make sure the space was actually initialized.
    #     if hasattr(self, 'context'):
    #         self.context.pop()

    def get_info(self):
        """ Return information about the space as a dict. """
        return {'shape': self.shape, \
                'x_range': self.x_range, \
                'all_x_ranges': self.all_x_ranges, \
                'mpi_adj': self.mpi_adj, \
                'max_shared_mem': self.device.max_shared_memory_per_block, \
                'max_block_z': self.device.max_block_dim_x, \
                'max_block_y': self.device.max_block_dim_y, \
                'max_threads': self.device.max_threads_per_block, \
                'mem_bandwidth': 1000 * self.device.memory_clock_rate/8 * \
                                 self.device.global_memory_bus_width * 2, \
                'max_registers': self.device.max_registers_per_block, \
                'async_engine_count': self.device.async_engine_count, \
                'ecc_enabled': self.device.ecc_enabled, \
               }
--------------------------------------------------------------------------------
/maxwell-solver/kernels/alpha_allpre.cu:
--------------------------------------------------------------------------------
// Mark the threads that need to load from global memory.
const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \
                       ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \
                       ((_Z >= -1) && (_Z <= {{ dims[2] }})));

// Set relevant field pointers to create wrap-around periodic grid.
if (_Y == -1) {
    _Y = {{ dims[1]-1 }};
}
if (_Y == {{ dims[1] }}) {
    _Y = 0;
}
if (_Z == -1) {
    _Z = {{ dims[2]-1 }};
}
if (_Z == {{ dims[2] }}) {
    _Z = 0;
}

// Some definitions for shared memory.
// Used to get unpadded thread indices.
#define s_ty (_ty + 1)
#define s_tz (_tz + 1)
#define s_tyy (_tyy + 2)
#define s_tzz (_tzz + 2)

// Helper definitions.
#define s_next_field (s_tyy * s_tzz)
#define s_to_local (s_ty * s_tzz + (s_tz))
#define s_zp +1
#define s_zn -1
#define s_yp +s_tzz
#define s_yn -s_tzz

{{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;

// Local memory.
{{ type }} Ey_p, Ez_p, Hy_n, Hz_n;
{{ type }} vx, vy, vz;
{{ type }} px, py, pz, py_p, pz_p;

int xn, xp;
if (_X == 0)
    xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction.
else
    xn = -1;


// Load E-fields into shared memory.
if (adj_dims) {
    // Load in p = r + beta * p.
    Ex_0[0] = (Rx(-1,0,0) + beta * Px(-1,0,0)) *
              (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z));
    Ey_0[0] = (Ry(-1,0,0) + beta * Py(-1,0,0)) *
              (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z));
    Ez_0[0] = (Rz(-1,0,0) + beta * Pz(-1,0,0)) *
              (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z));

    // Ey_p = Ry(0,0,0) + beta * Py(0,0,0);
    py_p = Ry(0,0,0) + beta * Py(0,0,0);
    Ey_p = (py_p) * (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z));

    // Ez_p = Rz(0,0,0) + beta * Pz(0,0,0);
    pz_p = Rz(0,0,0) + beta * Pz(0,0,0);
    Ez_p = (pz_p) * (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z));
}
__syncthreads();

// Calculate H-fields and store in shared memory.
// Hy.
if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
    Hy_0[0] = my(-1,0,0) * (sx1(_X+xn) * (Ez_0[0] - Ez_p) -
                            sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
}

// Hz.
if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
    Hz_0[0] = mz(-1,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) -
                            sx1(_X+xn) * (Ey_0[0] - Ey_p));
}
__syncthreads();

for (; _X < _x_end ; _X += _txx) {
    // We've moved ahead in X, so transfer appropriate field values.
    Ey_0[0] = Ey_p;
    Ez_0[0] = Ez_p;
    Hy_n = Hy_0[0];
    Hz_n = Hz_0[0];

    py = py_p;
    pz = pz_p;

    // Load E-fields into shared memory.
    if (_X == {{ dims[0]-1 }})
        xp = {{ -(dims[0]-1) }};
    else
        xp = +1;

    if (adj_dims) {
        px = Rx(0,0,0) + beta * Px(0,0,0);
        Ex_0[0] = (px) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z));

        py_p = Ry(+1,0,0) + beta * Py(+1,0,0);
        Ey_p = (py_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z));

        pz_p = Rz(+1,0,0) + beta * Pz(+1,0,0);
        Ez_p = (pz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z));
    }

    __syncthreads();

    // Calculate H-fields and store in shared memory.
    {% if mu_equals_1 == True %}
    // Hx.
    if ((_ty != _tyy) && (_tz != _tzz)) {
        Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) -
                   sy1(_Y) * (Ez_0[0] - Ez_0[s_yp]));
    }

    // Hy.
    if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
        Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) -
                   sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
    }

    // Hz.
    if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
        Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) -
                   sx1(_X) * (Ey_0[0] - Ey_p));
    }
    {% else %}
    // Hx.
    if ((_ty != _tyy) && (_tz != _tzz)) {
        Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) -
                               sy1(_Y) * (Ez_0[0] - Ez_0[s_yp]));
    }

    // Hy.
    if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
        Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) -
                               sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
    }

    // Hz.
    if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
        Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) -
                               sx1(_X) * (Ey_0[0] - Ey_p));
    }
    {% endif %}
    __syncthreads();

    // Write out the results.
    if (_in_global && _in_local) {
        {% if full_operator %}
        P1x(0,0,0) = px;
        P1y(0,0,0) = py;
        P1z(0,0,0) = pz;

        vx = ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) *
              (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn])
               - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn])
               - ex(0,0,0) * Ex_0[0]));
        vy = ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) *
              (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn])
               - sx0(_X) * (Hz_0[0] - Hz_n)
               - ey(0,0,0) * Ey_0[0]));
        vz = ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) *
              (sx0(_X) * (Hy_0[0] - Hy_n)
               - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn])
               - ez(0,0,0) * Ez_0[0]));

        Vx(0,0,0) = vx;
        Vy(0,0,0) = vy;
        Vz(0,0,0) = vz;

        alpha_denom += (px * vx) + (py * vy) + (pz * vz);

        {% else %}
        Vx(0,0,0) = Hx_0[0];
        Vy(0,0,0) = Hy_0[0];
        Vz(0,0,0) = Hz_0[0];

        {% endif %}
    }
    __syncthreads();
}
--------------------------------------------------------------------------------
/maxwell-solver/kernels/fdfd_matrix_multiplication.cu:
--------------------------------------------------------------------------------
// Mark the threads that need to load from global memory.
const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \
                       ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \
                       ((_Z >= -1) && (_Z <= {{ dims[2] }})));

// Bloch phase factors for the wrap-around periodic boundaries.
{{ type }} bloch_phaseYZ_x = 1.0;
{{ type }} bloch_phaseYZ_y = 1.0;
{{ type }} bloch_phaseYZ_z = 1.0;

// Set relevant field pointers to create wrap-around periodic grid.
if (_Y == -1) {
    _Y = {{ dims[1]-1 }};
    bloch_phaseYZ_x *= conj(bloch_y(0));
    bloch_phaseYZ_y *= conj(bloch_y(1));
    bloch_phaseYZ_z *= conj(bloch_y(2));
}
if (_Y == {{ dims[1] }}) {
    _Y = 0;
    bloch_phaseYZ_x *= bloch_y(0);
    bloch_phaseYZ_y *= bloch_y(1);
    bloch_phaseYZ_z *= bloch_y(2);
}
if (_Z == -1) {
    _Z = {{ dims[2]-1 }};
    bloch_phaseYZ_x *= conj(bloch_z(0));
    bloch_phaseYZ_y *= conj(bloch_z(1));
    bloch_phaseYZ_z *= conj(bloch_z(2));
}
if (_Z == {{ dims[2] }}) {
    _Z = 0;
    bloch_phaseYZ_x *= bloch_z(0);
    bloch_phaseYZ_y *= bloch_z(1);
    bloch_phaseYZ_z *= bloch_z(2);
}

// Some definitions for shared memory.
// Used to get unpadded thread indices.
#define s_ty (_ty + 1)
#define s_tz (_tz + 1)
#define s_tyy (_tyy + 2)
#define s_tzz (_tzz + 2)

// Helper definitions.
#define s_next_field (s_tyy * s_tzz)
#define s_to_local (s_ty * s_tzz + (s_tz))
#define s_zp +1
#define s_zn -1
#define s_yp +s_tzz
#define s_yn -s_tzz

{{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;

// Local memory.
{{ type }} Ey_p, Ez_p, Hy_n, Hz_n;
{{ type }} vx, vy, vz;
{{ type }} px, py, pz, py_p, pz_p;

int xn, xp;
{{ type }} bloch_phaseX_x = 1;
{{ type }} bloch_phaseX_y = 1;
{{ type }} bloch_phaseX_z = 1;
if (_X == 0) {
    bloch_phaseX_x = conj(bloch_x(0));
    bloch_phaseX_y = conj(bloch_x(1));
    bloch_phaseX_z = conj(bloch_x(2));
    xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction.
} else {
    xn = -1;
}

// Load E-fields into shared memory.
if (adj_dims) {
    // Load in the X field values (with Bloch phases and diagonal scaling applied).
    Ex_0[0] = bloch_phaseX_x * bloch_phaseYZ_x * (Xx(-1,0,0)) *
              (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z));
    Ey_0[0] = bloch_phaseX_y * bloch_phaseYZ_y * (Xy(-1,0,0)) *
              (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z));
    Ez_0[0] = bloch_phaseX_z * bloch_phaseYZ_z * (Xz(-1,0,0)) *
              (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z));

    // Ey_p = Xy(0,0,0), scaled.
    py_p = Xy(0,0,0);
    Ey_p = bloch_phaseYZ_y * (py_p) * (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z));

    // Ez_p = Xz(0,0,0), scaled.
    pz_p = Xz(0,0,0);
    Ez_p = bloch_phaseYZ_z * (pz_p) * (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z));
}
__syncthreads();

// Calculate H-fields and store in shared memory.
// Hy.
if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
    Hy_0[0] = my(-1,0,0) * (sx1(_X+xn) * (Ez_0[0] - Ez_p) -
                            sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
}

// Hz.
if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
    Hz_0[0] = mz(-1,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) -
                            sx1(_X+xn) * (Ey_0[0] - Ey_p));
}
__syncthreads();

for (; _X < _x_end ; _X += _txx) {
    // We've moved ahead in X, so transfer appropriate field values.
    Ey_0[0] = Ey_p;
    Ez_0[0] = Ez_p;
    Hy_n = Hy_0[0];
    Hz_n = Hz_0[0];

    py = py_p;
    pz = pz_p;

    // Load E-fields into shared memory.
    if (_X == {{ dims[0]-1 }}) {
        bloch_phaseX_x = bloch_x(0);
        bloch_phaseX_y = bloch_x(1);
        bloch_phaseX_z = bloch_x(2);
        xp = {{ -(dims[0]-1) }};
    } else {
        xp = +1;
        bloch_phaseX_x = 1;
        bloch_phaseX_y = 1;
        bloch_phaseX_z = 1;
    }
    if (adj_dims) {
        px = Xx(0,0,0);
        Ex_0[0] = bloch_phaseYZ_x * (px) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z));

        py_p = Xy(+1,0,0);
        Ey_p = bloch_phaseX_y * bloch_phaseYZ_y * (py_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z));

        pz_p = Xz(+1,0,0);
        Ez_p = bloch_phaseX_z * bloch_phaseYZ_z * (pz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z));
    }

    __syncthreads();

    // Calculate H-fields and store in shared memory.
    {% if mu_equals_1 == True %}
    // Hx.
    if ((_ty != _tyy) && (_tz != _tzz)) {
        Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) -
                   sy1(_Y) * (Ez_0[0] - Ez_0[s_yp]));
    }

    // Hy.
    if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
        Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) -
                   sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
    }

    // Hz.
    if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
        Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) -
                   sx1(_X) * (Ey_0[0] - Ey_p));
    }
    {% else %}
    // Hx.
    if ((_ty != _tyy) && (_tz != _tzz)) {
        Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) -
                               sy1(_Y) * (Ez_0[0] - Ez_0[s_yp]));
    }

    // Hy.
    if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
        Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) -
                               sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
    }

    // Hz.
    if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
        Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) -
                               sx1(_X) * (Ey_0[0] - Ey_p));
    }
    {% endif %}
    __syncthreads();

    // Write out the results.
    if (_in_global && _in_local) {
        {% if full_operator %}

        vx = ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) *
              (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn])
               - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn])
               - ex(0,0,0) * Ex_0[0]));
        vy = ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) *
              (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn])
               - sx0(_X) * (Hz_0[0] - Hz_n)
               - ey(0,0,0) * Ey_0[0]));
        vz = ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) *
              (sx0(_X) * (Hy_0[0] - Hy_n)
               - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn])
               - ez(0,0,0) * Ez_0[0]));

        Bx(0,0,0) = vx;
        By(0,0,0) = vy;
        Bz(0,0,0) = vz;

        {% else %}
        Bx(0,0,0) = Hx_0[0];
        By(0,0,0) = Hy_0[0];
        Bz(0,0,0) = Hz_0[0];

        {% endif %}
    }
    __syncthreads();
}
--------------------------------------------------------------------------------
/maxwell-solver/kernels/fdfd_residual.cu:
--------------------------------------------------------------------------------
// Mark the threads that need to load from global memory.
const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \
                       ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \
                       ((_Z >= -1) && (_Z <= {{ dims[2] }})));

// Bloch phase factors for the wrap-around periodic boundaries.
{{ type }} bloch_phaseYZ_x = 1.0;
{{ type }} bloch_phaseYZ_y = 1.0;
{{ type }} bloch_phaseYZ_z = 1.0;

// Set relevant field pointers to create wrap-around periodic grid.
if (_Y == -1) {
    _Y = {{ dims[1]-1 }};
    bloch_phaseYZ_x *= bloch_y(0);
    bloch_phaseYZ_y *= bloch_y(1);
    bloch_phaseYZ_z *= bloch_y(2);
}
if (_Y == {{ dims[1] }}) {
    _Y = 0;
    bloch_phaseYZ_x *= conj(bloch_y(0));
    bloch_phaseYZ_y *= conj(bloch_y(1));
    bloch_phaseYZ_z *= conj(bloch_y(2));
}
if (_Z == -1) {
    _Z = {{ dims[2]-1 }};
    bloch_phaseYZ_x *= bloch_z(0);
    bloch_phaseYZ_y *= bloch_z(1);
    bloch_phaseYZ_z *= bloch_z(2);
}
if (_Z == {{ dims[2] }}) {
    _Z = 0;
    bloch_phaseYZ_x *= conj(bloch_z(0));
    bloch_phaseYZ_y *= conj(bloch_z(1));
    bloch_phaseYZ_z *= conj(bloch_z(2));
}

// Some definitions for shared memory.
// Used to get unpadded thread indices.
#define s_ty (_ty + 1)
#define s_tz (_tz + 1)
#define s_tyy (_tyy + 2)
#define s_tzz (_tzz + 2)

// Helper definitions.
45 | #define s_next_field (s_tyy * s_tzz) 46 | #define s_to_local (s_ty * s_tzz + (s_tz)) 47 | #define s_zp +1 48 | #define s_zn -1 49 | #define s_yp +s_tzz 50 | #define s_yn -s_tzz 51 | 52 | {{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 53 | {{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 54 | {{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 55 | {{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 56 | {{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 57 | {{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 58 | 59 | // Local memory. 60 | {{ type }} Ey_p, Ez_p, Hy_n, Hz_n; 61 | {{ type }} vx, vy, vz; 62 | {{ type }} px, py, pz, py_p, pz_p; 63 | 64 | int xn, xp; 65 | {{ type }} bloch_phaseX_x = 1; 66 | {{ type }} bloch_phaseX_y = 1; 67 | {{ type }} bloch_phaseX_z = 1; 68 | if (_X == 0) { 69 | bloch_phaseX_x = bloch_x(0); 70 | bloch_phaseX_y = bloch_x(1); 71 | bloch_phaseX_z = bloch_x(2); 72 | xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction. 73 | } else { 74 | xn = -1; 75 | } 76 | 77 | // Load E-fields into shared memory. 78 | if (adj_dims) { 79 | // Load x, scaled by the Bloch phases and the sqrt(s) weights. 80 | Ex_0[0] = bloch_phaseX_x * bloch_phaseYZ_x * (Xx(-1,0,0)) * 81 | (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 82 | Ey_0[0] = bloch_phaseX_y * bloch_phaseYZ_y * (Xy(-1,0,0)) * 83 | (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 84 | Ez_0[0] = bloch_phaseX_z * bloch_phaseYZ_z * (Xz(-1,0,0)) * 85 | (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 86 | 87 | // Ey_p holds the y-component of x at the current x-slice. 88 | py_p = Xy(0,0,0); 89 | Ey_p = bloch_phaseYZ_y * (py_p) * (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 90 | 91 | // Ez_p holds the z-component of x at the current x-slice. 92 | pz_p = Xz(0,0,0); 93 | Ez_p = bloch_phaseYZ_z * (pz_p) * (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 94 | } 95 | __syncthreads(); 96 | 97 | // Calculate H-fields and store in shared_memory. 98 | // Hy. 99 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 100 | Hy_0[0] = my(-1,0,0) * (sx1(_X+xn) * (Ez_0[0] - Ez_p) - 101 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 102 | } 103 | 104 | // Hz. 105 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 106 | Hz_0[0] = mz(-1,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 107 | sx1(_X+xn) * (Ey_0[0] - Ey_p)); 108 | } 109 | __syncthreads(); 110 | 111 | for (; _X < _x_end ; _X += _txx) { 112 | // We've moved ahead in X, so transfer appropriate field values. 113 | Ey_0[0] = Ey_p; 114 | Ez_0[0] = Ez_p; 115 | Hy_n = Hy_0[0]; 116 | Hz_n = Hz_0[0]; 117 | 118 | py = py_p; 119 | pz = pz_p; 120 | 121 | // Load E-fields into shared memory. 122 | if (_X == {{ dims[0]-1 }}) { 123 | bloch_phaseX_x = conj(bloch_x(0)); 124 | bloch_phaseX_y = conj(bloch_x(1)); 125 | bloch_phaseX_z = conj(bloch_x(2)); 126 | xp = {{ -(dims[0]-1) }}; 127 | } else { 128 | xp = +1; 129 | bloch_phaseX_x = 1; 130 | bloch_phaseX_y = 1; 131 | bloch_phaseX_z = 1; 132 | } 133 | if (adj_dims) { 134 | px = Xx(0,0,0); 135 | Ex_0[0] = bloch_phaseYZ_x * (px) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 136 | 137 | py_p = Xy(+1,0,0); 138 | Ey_p = bloch_phaseX_y * bloch_phaseYZ_y * (py_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 139 | 140 | pz_p = Xz(+1,0,0); 141 | Ez_p = bloch_phaseX_z * bloch_phaseYZ_z * (pz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 142 | } 143 | 144 | __syncthreads(); 145 | 146 | // Calculate H-fields and store in shared_memory.
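The template branch that follows computes H as the discrete curl of E on the staggered Yee grid, scaled by 1/mu unless `mu_equals_1` removes the multiply; `sy1`/`sz1` carry the (PML-stretched) inverse spacings. A NumPy sketch of the Hx stencil using the kernel's field-minus-its-plus-one-neighbor differencing (a 2-D y-z patch stands in for one x-slice):

    import numpy as np

    def hx_slice(ey, ez, sy1, sz1, mx_inv=1.0):
        # Matches Hx = mx * (sz1*(Ey[0] - Ey[z+1]) - sy1*(Ez[0] - Ez[y+1])).
        return mx_inv * (sz1 * (ey[:-1, :-1] - ey[:-1, 1:])
                         - sy1 * (ez[:-1, :-1] - ez[1:, :-1]))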
147 | {% if mu_equals_1 == True %} 148 | // Hx. 149 | if ((_ty != _tyy) && (_tz != _tzz)) { 150 | Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 151 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 152 | } 153 | 154 | // Hy. 155 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 156 | Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) - 157 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 158 | } 159 | 160 | // Hz. 161 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 162 | Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 163 | sx1(_X) * (Ey_0[0] - Ey_p)); 164 | } 165 | {% else %} 166 | // Hx. 167 | if ((_ty != _tyy) && (_tz != _tzz)) { 168 | Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 169 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 170 | } 171 | 172 | // Hy. 173 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 174 | Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) - 175 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 176 | } 177 | 178 | // Hz. 179 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 180 | Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 181 | sx1(_X) * (Ey_0[0] - Ey_p)); 182 | } 183 | {% endif %} 184 | __syncthreads(); 185 | 186 | // Write out the results. 187 | if (_in_global && _in_local) { 188 | {% if full_operator %} 189 | 190 | vx = Bx(0,0,0) - ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) * 191 | (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn]) 192 | - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn]) 193 | - ex(0,0,0) * Ex_0[0])); 194 | vy = By(0,0,0) - ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) * 195 | (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn]) 196 | - sx0(_X) * (Hz_0[0] - Hz_n) 197 | - ey(0,0,0) * Ey_0[0])); 198 | vz = Bz(0,0,0) - ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) * 199 | (sx0(_X) * (Hy_0[0] - Hy_n) 200 | - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn]) 201 | - ez(0,0,0) * Ez_0[0])); 202 | 203 | Rx(0,0,0) = vx; 204 | Ry(0,0,0) = vy; 205 | Rz(0,0,0) = vz; 206 | 207 | {% else %} 208 | Bx(0,0,0) = Hx_0[0]; 209 | By(0,0,0) = Hy_0[0]; 210 | Bz(0,0,0) = Hz_0[0]; 211 | 212 | {% endif %} 213 | } 214 | __syncthreads(); 215 | } 216 | -------------------------------------------------------------------------------- /maxwell-solver/kernels/alpha_biCGSTAB.cu: -------------------------------------------------------------------------------- 1 | // Mark the threads that need to load from global memory. 2 | const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \ 3 | ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \ 4 | ((_Z >= -1) && (_Z <= {{ dims[2] }}))); 5 | 6 | // Set relevant field pointers to create wrap-around periodic grid. 7 | {{ type }} bloch_phaseYZ_x = 1.0; 8 | {{ type }} bloch_phaseYZ_y = 1.0; 9 | {{ type }} bloch_phaseYZ_z = 1.0; 10 | 11 | // Set relevant field pointers to create wrap-around periodic grid. 12 | if (_Y == -1) { 13 | _Y = {{ dims[1]-1 }}; 14 | bloch_phaseYZ_x *= bloch_y(0); 15 | bloch_phaseYZ_y *= bloch_y(1); 16 | bloch_phaseYZ_z *= bloch_y(2); 17 | } 18 | if (_Y == {{ dims[1] }}) { 19 | _Y = 0; 20 | bloch_phaseYZ_x *= conj(bloch_y(0)); 21 | bloch_phaseYZ_y *= conj(bloch_y(1)); 22 | bloch_phaseYZ_z *= conj(bloch_y(2)); 23 | } 24 | if (_Z == -1) { 25 | _Z = {{ dims[2]-1 }}; 26 | bloch_phaseYZ_x *= bloch_z(0); 27 | bloch_phaseYZ_y *= bloch_z(1); 28 | bloch_phaseYZ_z *= bloch_z(2); 29 | } 30 | if (_Z == {{ dims[2] }}) { 31 | _Z = 0; 32 | bloch_phaseYZ_x *= conj(bloch_z(0)); 33 | bloch_phaseYZ_y *= conj(bloch_z(1)); 34 | bloch_phaseYZ_z *= conj(bloch_z(2)); 35 | } 36 | 37 | // Some definitions for shared memory. 38 | // Used to get unpadded thread indices. 
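alpha_biCGSTAB.cu below fuses three BiCGSTAB sub-steps into one pass over the grid: the direction update p = r + beta * (p - omega * v), one application of the operator to get v = A p, and an on-the-fly reduction of the alpha denominator. The same arithmetic in NumPy, as a sketch (`A` stands in for the FDFD operator; the kernel accumulates `R_hatH* * v` without an explicit conj, so any conjugation is assumed to be folded into those precomputed fields):

    import numpy as np

    def alpha_step(A, r, r_hat, p, v, beta, omega, rho):
        p_new = r + beta * (p - omega * v)    # matches Rx + beta*(Px - omega*Vx)
        v_new = A @ p_new                     # the in-kernel operator apply
        alpha = rho / np.vdot(r_hat, v_new)   # vdot conjugates its first factor
        return p_new, v_new, alpha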
39 | #define s_ty (_ty + 1) 40 | #define s_tz (_tz + 1) 41 | #define s_tyy (_tyy + 2) 42 | #define s_tzz (_tzz + 2) 43 | 44 | // Helper definitions. 45 | #define s_next_field (s_tyy * s_tzz) 46 | #define s_to_local (s_ty * s_tzz + (s_tz)) 47 | #define s_zp +1 48 | #define s_zn -1 49 | #define s_yp +s_tzz 50 | #define s_yn -s_tzz 51 | 52 | {{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 53 | {{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 54 | {{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 55 | {{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 56 | {{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 57 | {{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 58 | 59 | // Local memory. 60 | {{ type }} Ey_p, Ez_p, Hy_n, Hz_n; 61 | {{ type }} vx, vy, vz; 62 | {{ type }} px, py, pz, py_p, pz_p; 63 | 64 | int xn, xp; 65 | {{ type }} bloch_phaseX_x = 1; 66 | {{ type }} bloch_phaseX_y = 1; 67 | {{ type }} bloch_phaseX_z = 1; 68 | if (_X == 0) { 69 | bloch_phaseX_x = bloch_x(0); 70 | bloch_phaseX_y = bloch_x(1); 71 | bloch_phaseX_z = bloch_x(2); 72 | xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction. 73 | } else { 74 | xn = -1; 75 | } 76 | 77 | // Load E-fields into shared memory. 78 | if (adj_dims) { 79 | // Load in p = r + beta * p. 80 | Ex_0[0] = bloch_phaseX_x * bloch_phaseYZ_x * (Rx(-1,0,0) + beta * (Px(-1,0,0) - omega * Vx(-1,0,0))) * 81 | (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 82 | Ey_0[0] = bloch_phaseX_y * bloch_phaseYZ_y * (Ry(-1,0,0) + beta * (Py(-1,0,0) - omega * Vy(-1,0,0))) * 83 | (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 84 | Ez_0[0] = bloch_phaseX_z * bloch_phaseYZ_z * (Rz(-1,0,0) + beta * (Pz(-1,0,0) - omega * Vz(-1,0,0))) * 85 | (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 86 | 87 | // Ey_p = Ry(0,0,0) + beta * Ey(0,0,0); 88 | py_p = Ry(0,0,0) + beta * (Py(0,0,0) - omega * Vy(0,0,0)); 89 | Ey_p = bloch_phaseYZ_y * (py_p) * (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 90 | 91 | // Ez_p = Rz(0,0,0) + beta * Ez(0,0,0); 92 | pz_p = Rz(0,0,0) + beta * (Pz(0,0,0) - omega * Vz(0,0,0)); 93 | Ez_p = bloch_phaseYZ_z * (pz_p) * (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 94 | } 95 | __syncthreads(); 96 | 97 | // Calculate H-fields and store in shared_memory. 98 | // Hy. 99 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 100 | Hy_0[0] = my(-1,0,0) * (sx1(_X+xn) * (Ez_0[0] - Ez_p) - 101 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 102 | } 103 | 104 | // Hz. 105 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 106 | Hz_0[0] = mz(-1,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 107 | sx1(_X+xn) * (Ey_0[0] - Ey_p)); 108 | } 109 | __syncthreads(); 110 | 111 | for (; _X < _x_end ; _X += _txx) { 112 | // We've moved ahead in X, so transfer appropriate field values. 113 | Ey_0[0] = Ey_p; 114 | Ez_0[0] = Ez_p; 115 | Hy_n = Hy_0[0]; 116 | Hz_n = Hz_0[0]; 117 | 118 | py = py_p; 119 | pz = pz_p; 120 | 121 | // Load E-fields into shared memory. 
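The `_X == {{ dims[0]-1 }}` block below (and the matching `_X == 0` block above it) implements Bloch-periodic wrap-around: stepping one cell past either x-boundary re-reads the opposite edge of the domain multiplied by a phase factor, with the conjugate phase applied at the opposite wall. The rule in isolation (the sign convention here is illustrative only):

    import numpy as np

    def bloch_read(field, i, k, period):
        # Out-of-range reads fold back into the domain and pick up the
        # Bloch phase exp(+/- 1j * k * period); interior reads are direct.
        n = len(field)
        if i < 0:
            return field[i + n] * np.exp(-1j * k * period)
        if i >= n:
            return field[i - n] * np.exp(+1j * k * period)
        return field[i]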
122 | if (_X == {{ dims[0]-1 }}) { 123 | bloch_phaseX_x = conj(bloch_x(0)); 124 | bloch_phaseX_y = conj(bloch_x(1)); 125 | bloch_phaseX_z = conj(bloch_x(2)); 126 | xp = {{ -(dims[0]-1) }}; 127 | } else { 128 | xp = +1; 129 | bloch_phaseX_x = 1; 130 | bloch_phaseX_y = 1; 131 | bloch_phaseX_z = 1; 132 | } 133 | if (adj_dims) { 134 | px = Rx(0,0,0) + beta * (Px(0,0,0) - omega * Vx(0,0,0)); 135 | Ex_0[0] = bloch_phaseYZ_x * (px) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 136 | 137 | py_p = Ry(+1,0,0) + beta * (Py(+1,0,0) - omega * Vy(+1,0,0)); 138 | Ey_p = bloch_phaseX_y * bloch_phaseYZ_y * (py_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 139 | 140 | pz_p = Rz(+1,0,0) + beta * (Pz(+1,0,0) - omega * Vz(+1,0,0)); 141 | Ez_p = bloch_phaseX_z * bloch_phaseYZ_z * (pz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 142 | } 143 | 144 | __syncthreads(); 145 | 146 | // Calculate H-fields and store in shared_memory. 147 | {% if mu_equals_1 == True %} 148 | // Hx. 149 | if ((_ty != _tyy) && (_tz != _tzz)) { 150 | Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 151 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 152 | } 153 | 154 | // Hy. 155 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 156 | Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) - 157 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 158 | } 159 | 160 | // Hz. 161 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 162 | Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 163 | sx1(_X) * (Ey_0[0] - Ey_p)); 164 | } 165 | {% else %} 166 | // Hx. 167 | if ((_ty != _tyy) && (_tz != _tzz)) { 168 | Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 169 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 170 | } 171 | 172 | // Hy. 173 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 174 | Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) - 175 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 176 | } 177 | 178 | // Hz. 179 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 180 | Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 181 | sx1(_X) * (Ey_0[0] - Ey_p)); 182 | } 183 | {% endif %} 184 | __syncthreads(); 185 | 186 | // Write out the results. 187 | if (_in_global && _in_local) { 188 | {% if full_operator %} 189 | P1x(0,0,0) = px; 190 | P1y(0,0,0) = py; 191 | P1z(0,0,0) = pz; 192 | 193 | vx = ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) * 194 | (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn]) 195 | - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn]) 196 | - ex(0,0,0) * Ex_0[0])); 197 | vy = ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) * 198 | (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn]) 199 | - sx0(_X) * (Hz_0[0] - Hz_n) 200 | - ey(0,0,0) * Ey_0[0])); 201 | vz = ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) * 202 | (sx0(_X) * (Hy_0[0] - Hy_n) 203 | - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn]) 204 | - ez(0,0,0) * Ez_0[0])); 205 | 206 | V1x(0,0,0) = vx; 207 | V1y(0,0,0) = vy; 208 | V1z(0,0,0) = vz; 209 | 210 | alpha_denom += (R_hatHx(0,0,0) * vx) + (R_hatHy(0,0,0) * vy) + (R_hatHz(0,0,0) * vz); 211 | 212 | {% else %} 213 | V1x(0,0,0) = Hx_0[0]; 214 | V1y(0,0,0) = Hy_0[0]; 215 | V1z(0,0,0) = Hz_0[0]; 216 | 217 | {% endif %} 218 | } 219 | __syncthreads(); 220 | } 221 | -------------------------------------------------------------------------------- /maxwell-solver/kernels/omega_bloch_allpre.cu: -------------------------------------------------------------------------------- 1 | // Mark the threads that need to load from global memory. 
2 | const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \ 3 | ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \ 4 | ((_Z >= -1) && (_Z <= {{ dims[2] }}))); 5 | 6 | // Set relevant field pointers to create wrap-around periodic grid. 7 | {{ type }} bloch_phaseYZ_x = 1.0; 8 | {{ type }} bloch_phaseYZ_y = 1.0; 9 | {{ type }} bloch_phaseYZ_z = 1.0; 10 | if (_Y == -1) { 11 | _Y = {{ dims[1]-1 }}; 12 | bloch_phaseYZ_x *= conj(bloch_y(0)); 13 | bloch_phaseYZ_y *= conj(bloch_y(1)); 14 | bloch_phaseYZ_z *= conj(bloch_y(2)); 15 | } 16 | if (_Y == {{ dims[1] }}) { 17 | _Y = 0; 18 | bloch_phaseYZ_x *= bloch_y(0); 19 | bloch_phaseYZ_y *= bloch_y(1); 20 | bloch_phaseYZ_z *= bloch_y(2); 21 | } 22 | if (_Z == -1) { 23 | _Z = {{ dims[2]-1 }}; 24 | bloch_phaseYZ_x *= conj(bloch_z(0)); 25 | bloch_phaseYZ_y *= conj(bloch_z(1)); 26 | bloch_phaseYZ_z *= conj(bloch_z(2)); 27 | } 28 | if (_Z == {{ dims[2] }}) { 29 | _Z = 0; 30 | bloch_phaseYZ_x *= bloch_z(0); 31 | bloch_phaseYZ_y *= bloch_z(1); 32 | bloch_phaseYZ_z *= bloch_z(2); 33 | } 34 | 35 | // Some definitions for shared memory. 36 | // Used to get unpadded thread indices. 37 | #define s_ty (_ty + 1) 38 | #define s_tz (_tz + 1) 39 | #define s_tyy (_tyy + 2) 40 | #define s_tzz (_tzz + 2) 41 | 42 | // Helper definitions. 43 | #define s_next_field (s_tyy * s_tzz) 44 | #define s_to_local (s_ty * s_tzz + (s_tz)) 45 | #define s_zp +1 46 | #define s_zn -1 47 | #define s_yp +s_tzz 48 | #define s_yn -s_tzz 49 | 50 | {{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 51 | {{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 52 | {{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 53 | {{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 54 | {{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 55 | {{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 56 | 57 | // Local memory. 58 | {{ type }} Ey_p, Ez_p, Hy_n, Hz_n; 59 | {{ type }} tx, ty, tz; 60 | {{ type }} sx, sy, sz, sy_p, sz_p; 61 | 62 | int xn, xp; 63 | {{ type }} bloch_phaseX_x = 1; 64 | {{ type }} bloch_phaseX_y = 1; 65 | {{ type }} bloch_phaseX_z = 1; 66 | if (_X == 0) { 67 | xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction. 68 | bloch_phaseX_x = conj(bloch_x(0)); 69 | bloch_phaseX_y = conj(bloch_x(1)); 70 | bloch_phaseX_z = conj(bloch_x(2));} 71 | else { 72 | xn = -1;} 73 | 74 | // Load E-fields into shared memory. 75 | if (adj_dims) { 76 | // Load in s = r - alpha * v. 77 | Ex_0[0] = bloch_phaseX_x*bloch_phaseYZ_x*(Rx(-1,0,0) - alpha * Vx(-1,0,0)) * 78 | (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 79 | Ey_0[0] = bloch_phaseX_y*bloch_phaseYZ_y*(Ry(-1,0,0) - alpha * Vy(-1,0,0)) * 80 | (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 81 | Ez_0[0] = bloch_phaseX_z*bloch_phaseYZ_z*(Rz(-1,0,0) - alpha * Vz(-1,0,0)) * 82 | (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 83 | 84 | // Ey_p = Ry(0,0,0) - alpha * Vy(0,0,0); 85 | sy_p = Ry(0,0,0) - alpha * Vy(0,0,0); 86 | Ey_p = bloch_phaseYZ_y * (sy_p) * (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 87 | 88 | // Ez_p = Rz(0,0,0) - alpha * Vz(0,0,0); 89 | sz_p = Rz(0,0,0) - alpha * Vz(0,0,0); 90 | Ez_p = bloch_phaseYZ_z * (sz_p) * (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 91 | } 92 | __syncthreads(); 93 | 94 | // Calculate H-fields and store in shared_memory. 95 | // Hy. 
96 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 97 | Hy_0[0] = my(-1,0,0) * (sx1(_X+xn) * (Ez_0[0] - Ez_p) - 98 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 99 | } 100 | 101 | // Hz. 102 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 103 | Hz_0[0] = mz(-1,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 104 | sx1(_X+xn) * (Ey_0[0] - Ey_p)); 105 | } 106 | __syncthreads(); 107 | 108 | for (; _X < _x_end ; _X += _txx) { 109 | // We've moved ahead in X, so transfer appropriate field values. 110 | Ey_0[0] = Ey_p; 111 | Ez_0[0] = Ez_p; 112 | Hy_n = Hy_0[0]; 113 | Hz_n = Hz_0[0]; 114 | 115 | sy = sy_p; 116 | sz = sz_p; 117 | 118 | // Load E-fields into shared memory. 119 | if (_X == {{ dims[0]-1 }}){ 120 | xp = {{ -(dims[0]-1) }}; 121 | bloch_phaseX_x = bloch_x(0); 122 | bloch_phaseX_y = bloch_x(1); 123 | bloch_phaseX_z = bloch_x(2);} 124 | else { 125 | xp = +1; 126 | bloch_phaseX_x = 1; 127 | bloch_phaseX_y = 1; 128 | bloch_phaseX_z = 1;} 129 | 130 | if (adj_dims) { 131 | sx = Rx(0,0,0) - alpha * Vx(0,0,0); 132 | Ex_0[0] = bloch_phaseYZ_x * (sx) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 133 | 134 | sy_p = Ry(+1,0,0) - alpha * Vy(+1,0,0); 135 | Ey_p = bloch_phaseX_y * bloch_phaseYZ_y * (sy_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 136 | 137 | sz_p = Rz(+1,0,0) - alpha * Vz(+1,0,0); 138 | Ez_p = bloch_phaseX_z * bloch_phaseYZ_z * (sz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 139 | } 140 | 141 | __syncthreads(); 142 | 143 | // Calculate H-fields and store in shared_memory. 144 | {% if mu_equals_1 == True %} 145 | // Hx. 146 | if ((_ty != _tyy) && (_tz != _tzz)) { 147 | Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 148 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 149 | } 150 | 151 | // Hy. 152 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 153 | Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) - 154 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 155 | } 156 | 157 | // Hz. 158 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 159 | Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 160 | sx1(_X) * (Ey_0[0] - Ey_p)); 161 | } 162 | {% else %} 163 | // Hx. 164 | if ((_ty != _tyy) && (_tz != _tzz)) { 165 | Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 166 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 167 | } 168 | 169 | // Hy. 170 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 171 | Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) - 172 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 173 | } 174 | 175 | // Hz. 176 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 177 | Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 178 | sx1(_X) * (Ey_0[0] - Ey_p)); 179 | } 180 | {% endif %} 181 | __syncthreads(); 182 | 183 | // Write out the results. 
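The write-out that follows accumulates the two scalars of the BiCGSTAB omega step: with t = A s, omega = <t, s> / <t, t> is the least-squares minimizer of ||s - omega * t||. The reduction the kernel performs cell by cell, in NumPy:

    import numpy as np

    def omega_step(t, s):
        num = np.vdot(t, s)         # sum of conj(t_i) * s_i, as in omega_num
        denom = np.vdot(t, t).real  # sum of |t_i|^2, as in omega_denom
        return num / denom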
184 | if (_in_global && _in_local) { 185 | {% if full_operator %} 186 | Sx(0,0,0) = sx; 187 | Sy(0,0,0) = sy; 188 | Sz(0,0,0) = sz; 189 | 190 | tx = ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) * 191 | (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn]) 192 | - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn]) 193 | - ex(0,0,0) * Ex_0[0])); 194 | ty = ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) * 195 | (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn]) 196 | - sx0(_X) * (Hz_0[0] - Hz_n) 197 | - ey(0,0,0) * Ey_0[0])); 198 | tz = ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) * 199 | (sx0(_X) * (Hy_0[0] - Hy_n) 200 | - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn]) 201 | - ez(0,0,0) * Ez_0[0])); 202 | 203 | Tx(0,0,0) = tx; 204 | Ty(0,0,0) = ty; 205 | Tz(0,0,0) = tz; 206 | 207 | omega_num += conj(tx) * sx + conj(ty) * sy + conj(tz) * sz; 208 | omega_denom += (real(tx) * real(tx)) + (imag(tx) * imag(tx)) + 209 | (real(ty) * real(ty)) + (imag(ty) * imag(ty)) + 210 | (real(tz) * real(tz)) + (imag(tz) * imag(tz)); 211 | //omega_num += tx * sx + ty * sy + tz * sz; 212 | //omega_denom += tx * tx + ty * ty + tz * tz; 213 | 214 | {% else %} 215 | Vx(0,0,0) = Hx_0[0]; 216 | Vy(0,0,0) = Hy_0[0]; 217 | Vz(0,0,0) = Hz_0[0]; 218 | 219 | {% endif %} 220 | } 221 | __syncthreads(); 222 | } 223 | -------------------------------------------------------------------------------- /maxwell-solver/gce/grid.py: -------------------------------------------------------------------------------- 1 | """ Defines the Grid class for GCE. """ 2 | 3 | from pycuda import gpuarray as ga 4 | from pycuda import driver as drv 5 | from gce.space import get_space_info 6 | from gce.data import Data 7 | import numpy as np 8 | from mpi4py.MPI import COMM_WORLD as comm 9 | import threading 10 | 11 | 12 | 13 | class Grid(Data): 14 | """ Grid class for GCE. 15 | 16 | Grids store modifiable information on a 3D rectangular grid. 17 | 18 | Grids may be split up along the x-axis for parallel processing. 19 | If a particular Grid requires adjacent values in the x-direction, 20 | then the needed adjacent cells can be synchronized through use of the 21 | x_overlap option and the synchronize(), synchronize_start(), and 22 | synchronize_isdone() functions. 23 | 24 | Derives from the Data class. 25 | 26 | New methods: 27 | __init__ -- Loads a (possibly empty) array onto the GPU. 28 | synchronize -- Used to synchronize Grids with non-zero x_overlap (blocking). 29 | synchronize_start -- Initiate non-blocking synchronization. 30 | synchronize_isdone -- Advance and complete non-blocking synchronization. 31 | 32 | New variables: 33 | none 34 | 35 | """ 36 | 37 | 38 | def __init__(self, array_or_dtype, x_overlap=0): 39 | """ Create a spatial grid on the GPU(s). 40 | 41 | Input variables 42 | array_or_dtype -- can either be a numpy array of the same shape as 43 | the global space, or a numpy dtype. If a valid array is passed, 44 | it will be loaded on to the GPU. If a dtype is passed, then 45 | an array of zeros, of that dtype will be loaded onto the GPU. 46 | 47 | Optional variables 48 | x_overlap -- the number of adjacent cells in either the negative or 49 | positive x-direction that need to simultaneously be accessed along 50 | with the current cell. Must be a non-negative integer. Default 51 | value is 0. 52 | 53 | """ 54 | 55 | shape = get_space_info()['shape'] # Get the shape of the space. 56 | xr = get_space_info()['x_range'] # Get the local x_range. 57 | all_x_ranges = get_space_info()['all_x_ranges'] # Get the local x_range. 
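The constructor code just below splits the global array into per-rank x-slabs, scatters them, and pads each slab with `x_overlap` ghost slabs on both ends. A standalone sketch of that decomposition (the two-rank split and overlap of 1 are example values):

    import numpy as np

    shape = (8, 4, 4)
    all_x_ranges = [(0, 4), (4, 8)]       # example split over two ranks
    x_overlap = 1

    global_array = np.arange(np.prod(shape)).reshape(shape)
    slabs = [global_array[r0:r1] for r0, r1 in all_x_ranges]

    def pad(s):
        ghost = np.zeros((x_overlap,) + s.shape[1:], s.dtype)
        return np.concatenate((ghost, s, ghost))

    print([pad(s).shape for s in slabs])  # [(6, 4, 4), (6, 4, 4)]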
58 | local_shape = (xr[1]-xr[0], shape[1], shape[2]) 59 | 60 | self._set_gce_type('grid') # Set the gce type to grid. 61 | 62 | # Make sure overlap option is valid. 63 | if type(x_overlap) is not int: 64 | raise TypeError('x_overlap must be an integer.') 65 | elif x_overlap < 0: 66 | raise TypeError('x_overlap must be a non-negative integer.') 67 | 68 | if comm.rank == 0: 69 | # Process the array_or_dtype input variable. 70 | if type(array_or_dtype) is np.ndarray: # Input is an array. 71 | array = array_or_dtype 72 | 73 | # Make sure the array is of the correct shape. 74 | if array.shape != shape: 75 | raise TypeError('Shape of array does not match shape of space.') 76 | 77 | # Make sure the array is of a valid datatype. 78 | self._get_dtype(array.dtype.type) 79 | 80 | 81 | elif type(array_or_dtype) is type: # Input is a datatype. 82 | self._get_dtype(array_or_dtype) # Validate the dtype. 83 | array = np.zeros(shape, dtype=self.dtype) # Make a zeros array. 84 | 85 | else: # Invalid input. 86 | raise TypeError('Input variable must be a numpy array or dtype.') 87 | 88 | # Prepare array to be scattered. 89 | array = [array[r[0]:r[1],:,:] for r in all_x_ranges] 90 | 91 | else: 92 | array = None 93 | 94 | array = comm.scatter(array) 95 | self._get_dtype(array.dtype.type) 96 | 97 | # # Narrow down the array to local x_range. 98 | # array = array[xr[0]:xr[1],:,:] 99 | 100 | # Add padding to array, if needed. 101 | self._xlap = x_overlap 102 | if self._xlap != 0: 103 | padding = np.empty((self._xlap,) + shape[1:3], dtype=array.dtype) 104 | array = np.concatenate((padding, array, padding), axis=0) 105 | 106 | self.to_gpu(array) # Load onto device. 107 | 108 | # Determine information needed for synchronization. 109 | if self._xlap != 0: 110 | # Calculates the pointer to the x offset in a grid. 111 | ptr_dx = lambda x_pos: self.data.ptr + self.data.dtype.itemsize * \ 112 | x_pos * shape[1] * shape[2] 113 | 114 | # Pointers to different sections of the grid that are relevant 115 | # for synchronization. 116 | self._sync_ptrs = { 'forw_src': ptr_dx(xr[1]-xr[0]), \ 117 | 'back_dest': ptr_dx(0), \ 118 | 'back_src': ptr_dx(self._xlap), \ 119 | 'forw_dest': ptr_dx(xr[1]-xr[0] + self._xlap)} 120 | 121 | # Buffers used during synchronization. 122 | self._sync_buffers = [drv.pagelocked_empty( \ 123 | (self._xlap, shape[1], shape[2]), \ 124 | self.dtype) for k in range(4)] 125 | 126 | # Streams used during synchronization. 127 | self._sync_streams = [drv.Stream() for k in range(4)] 128 | 129 | # Used to identify neighboring MPI nodes with whom to synchronize. 130 | self._sync_adj = get_space_info()['mpi_adj'] 131 | 132 | # Offset in bytes to the true start of the grid. 133 | # This is used to "hide" overlap areas from the kernel. 134 | self._xlap_offset = self.data.dtype.itemsize * \ 135 | self._xlap * shape[1] * shape[2] 136 | 137 | self.synchronize() # Synchronize the grid. 138 | comm.Barrier() # Wait for all grids to synchronize before proceeding. 139 | 140 | def get(self): 141 | """ Redefined so that we don't get overlap data. """ 142 | # Get our section of the grid (excluding overlap). 143 | if self._xlap == 0: 144 | data = self.data.get() 145 | else: 146 | data = self.data.get()[self._xlap:-self._xlap,:,:] 147 | 148 | # return np.concatenate(comm.allgather(data), axis=0) # Super-simple. 149 | 150 | result = comm.gather(data) # Gather all pieces to root. 151 | if comm.Get_rank() == 0: 152 | # Root node glues everything together.
153 | return np.concatenate(result, axis=0) 154 | else: 155 | return None 156 | 157 | def _get_raw(self): 158 | """ Output even the overlap data. Just for debugging/testing. """ 159 | return self.data.get() 160 | 161 | def synchronize(self): 162 | """ Blocking synchronization. """ 163 | 164 | if self._xlap == 0: 165 | raise TypeError('No need to synchronize Grid with no overlaps.') 166 | 167 | self.synchronize_start() 168 | while not self.synchronize_isdone(): 169 | pass 170 | 171 | def synchronize_start(self): 172 | """ Start the synchronization process. """ 173 | 174 | # Use shorter, easier names for class variables. 175 | bufs = self._sync_buffers 176 | ptrs = self._sync_ptrs 177 | streams = self._sync_streams 178 | adj = self._sync_adj 179 | 180 | # Start the transfer operations needed. 181 | self._sync_tags = [mpi_tag() for k in range(2)] # MPI message tags. 182 | 183 | # Forward send. 184 | drv.memcpy_dtoh_async(bufs[0], ptrs['forw_src'], stream=streams[0]) 185 | 186 | # Backward send. 187 | drv.memcpy_dtoh_async(bufs[1], ptrs['back_src'], stream=streams[1]) 188 | 189 | # Forward receive. 190 | self._sync_req_forw = comm.Irecv(bufs[2], source=adj['back'], \ 191 | tag=self._sync_tags[0]) 192 | 193 | # Backward receive. 194 | self._sync_req_back = comm.Irecv(bufs[3], source=adj['forw'], \ 195 | tag=self._sync_tags[1]) 196 | 197 | # Signalling variables needed to complete transfers. 198 | self._sync_part2_start = [False, False, False, False] 199 | 200 | 201 | def synchronize_isdone(self): 202 | """ Complete synchronization process. """ 203 | 204 | # Use shorter, easier names for class variables. 205 | bufs = self._sync_buffers 206 | ptrs = self._sync_ptrs 207 | streams = self._sync_streams 208 | adj = self._sync_adj 209 | part2_start = self._sync_part2_start 210 | is_done = [False, False, False, False] 211 | 212 | # Forward send. 213 | if streams[0].is_done(): # Device-to-host copy completed. 214 | if not part2_start[0]: # Initialize MPI send. 215 | comm.Isend(bufs[0], dest=adj['forw'], tag=self._sync_tags[0]) 216 | part2_start[0] = True 217 | is_done[0] = True 218 | else: # No more work to do. 219 | is_done[0] = True 220 | 221 | # Backward send. 222 | if streams[1].is_done(): # Device-to-host copy completed. 223 | if not part2_start[1]: # Initialize MPI send. 224 | comm.Isend(bufs[1], dest=adj['back'], tag=self._sync_tags[1]) 225 | part2_start[1] = True 226 | is_done[1] = True 227 | else: # No more work to do. 228 | is_done[1] = True 229 | 230 | # Forward receive. 231 | if self._sync_req_forw.Test(): # MPI receive completed. 232 | if not part2_start[2]: # Initialize host-to-device copy. 233 | drv.memcpy_htod_async(ptrs['back_dest'], bufs[2], \ 234 | stream=streams[2]) # Host-to-device. 235 | part2_start[2] = True 236 | elif streams[2].is_done(): # Host-to-device copy completed. 237 | is_done[2] = True 238 | 239 | # Backward receive. 240 | if self._sync_req_back.Test(): # MPI receive completed. 241 | if not part2_start[3]: # Initialize host-to-device copy. 242 | drv.memcpy_htod_async(ptrs['forw_dest'], bufs[3], \ 243 | stream=streams[3]) # Host-to-device. 244 | part2_start[3] = True 245 | elif streams[3].is_done(): # Host-to-device copy completed. 246 | is_done[3] = True 247 | # print '~', is_done[0:4], 248 | # Return true only when all four transfers are complete. 249 | return all(is_done) 250 | 251 | 252 | __MPI_TAG_NUM = 0 # Global variable used to generate unique mpi tags. 253 | 254 | def mpi_tag(): 255 | """ Get a new, unique mpi tag number.
""" 256 | global __MPI_TAG_NUM # Get the global variable. 257 | tag = __MPI_TAG_NUM # The variable to return. 258 | __MPI_TAG_NUM += 1 259 | return tag 260 | -------------------------------------------------------------------------------- /maxwell-solver/kernels/fdfd_matrix_multiplication_pec_pmc.cu: -------------------------------------------------------------------------------- 1 | // Mark the threads that need to load from global memory. 2 | const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \ 3 | ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \ 4 | ((_Z >= -1) && (_Z <= {{ dims[2] }}))); 5 | 6 | // Set relevant field pointers to create wrap-around periodic grid. 7 | {{ type }} bloch_phaseYZ_x = 1.0; 8 | {{ type }} bloch_phaseYZ_y = 1.0; 9 | {{ type }} bloch_phaseYZ_z = 1.0; 10 | 11 | {{ type }} pc_yz_factor[3] = { 1.0, 1.0, 1.0 }; 12 | int pc_iy_Ex = 0; 13 | int pc_iz_Ex = 0; 14 | int pc_iy_Ey = 0; 15 | int pc_iz_Ey = 0; 16 | int pc_iy_Ez = 0; 17 | int pc_iz_Ez = 0; 18 | 19 | if (_Y == -1) { 20 | _Y = {{ dims[1]-1 }}; 21 | bloch_phaseYZ_x *= conj(bloch_y(0)); 22 | bloch_phaseYZ_y *= conj(bloch_y(1)); 23 | bloch_phaseYZ_z *= conj(bloch_y(2)); 24 | if ( pemc(2) == 1 ) { 25 | //PEC (anti-symmetric) 26 | _Y = 0; 27 | pc_yz_factor[0] = -1.0; 28 | pc_yz_factor[2] = -1.0; 29 | pc_iy_Ex = 1; 30 | pc_iy_Ey = 0; 31 | pc_iy_Ez = 1; 32 | } else if ( pemc(2) == 2 ) { 33 | //PMC (symmetric) 34 | _Y = 0; 35 | pc_yz_factor[1] = -1.0; 36 | pc_iy_Ex = 1; 37 | pc_iy_Ey = 0; 38 | pc_iy_Ez = 1; 39 | } 40 | } 41 | if (_Y == {{ dims[1]-1 }}) { 42 | if ( pemc(3) == 1 ) { 43 | pc_iy_Ey = -1; 44 | } 45 | if ( pemc(3) == 2 ) { 46 | pc_iy_Ey = -1; 47 | pc_yz_factor[1] = -1.0; 48 | } 49 | } 50 | if (_Y == {{ dims[1] }}) { 51 | _Y = 0; 52 | bloch_phaseYZ_x *= bloch_y(0); 53 | bloch_phaseYZ_y *= bloch_y(1); 54 | bloch_phaseYZ_z *= bloch_y(2); 55 | if ( pemc(3) == 1 ) { 56 | //PEC 57 | _Y = {{ dims[1]-1 }}; 58 | pc_yz_factor[0] = -1.0; 59 | pc_yz_factor[2] = -1.0; 60 | pc_iy_Ex = -1; 61 | pc_iy_Ey = 0; 62 | pc_iy_Ez = -1; 63 | } 64 | if ( pemc(3) == 2 ) { 65 | //PMC 66 | _Y = {{ dims[1]-1 }}; 67 | pc_yz_factor[1] = -1.0; 68 | pc_iy_Ex = -1; 69 | pc_iy_Ey = 0; 70 | pc_iy_Ez = -1; 71 | } 72 | } 73 | if (_Z == -1) { 74 | _Z = {{ dims[2]-1 }}; 75 | bloch_phaseYZ_x *= conj(bloch_z(0)); 76 | bloch_phaseYZ_y *= conj(bloch_z(1)); 77 | bloch_phaseYZ_z *= conj(bloch_z(2)); 78 | if ( pemc(4) == 1 ) { 79 | _Z = 0; 80 | pc_yz_factor[0] = -1.0; 81 | pc_yz_factor[1] = -1.0; 82 | pc_iz_Ex = 1; 83 | pc_iz_Ey = 1; 84 | pc_iz_Ez = 0; 85 | } else if ( pemc(4) == 2 ) { 86 | _Z = 0; 87 | pc_yz_factor[2] = -1.0; 88 | pc_iz_Ex = 1; 89 | pc_iz_Ey = 1; 90 | pc_iz_Ez = 0; 91 | } 92 | } 93 | if (_Z == {{ dims[2]-1 }}) { 94 | 95 | if ( pemc(5) == 1 ) { 96 | pc_iz_Ez = -1; 97 | } 98 | if ( pemc(5) == 2 ) { 99 | pc_iz_Ez = -1; 100 | pc_yz_factor[2] = -1.0; 101 | } 102 | } 103 | if (_Z == {{ dims[2] }}) { 104 | _Z = 0; 105 | bloch_phaseYZ_x *= bloch_z(0); 106 | bloch_phaseYZ_y *= bloch_z(1); 107 | bloch_phaseYZ_z *= bloch_z(2); 108 | if ( pemc(5) == 1 ) { 109 | _Z = {{ dims[2]-1 }}; 110 | pc_yz_factor[0] = -1.0; 111 | pc_yz_factor[1] = -1.0; 112 | pc_iz_Ex = -1; 113 | pc_iz_Ey = -1; 114 | pc_iz_Ez = 0; 115 | } else if ( pemc(5) == 2 ) { 116 | _Z = {{ dims[2]-1 }}; 117 | pc_yz_factor[2] = 0.0; 118 | pc_iz_Ex = -1; 119 | pc_iz_Ey = -1; 120 | pc_iz_Ez = 0; 121 | } 122 | } 123 | 124 | // Some definitions for shared memory. 125 | // Used to get unpadded thread indices. 
126 | #define s_ty (_ty + 1) 127 | #define s_tz (_tz + 1) 128 | #define s_tyy (_tyy + 2) 129 | #define s_tzz (_tzz + 2) 130 | 131 | // Helper definitions. 132 | #define s_next_field (s_tyy * s_tzz) 133 | #define s_to_local (s_ty * s_tzz + (s_tz)) 134 | #define s_zp +1 135 | #define s_zn -1 136 | #define s_yp +s_tzz 137 | #define s_yn -s_tzz 138 | 139 | {{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 140 | {{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 141 | {{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 142 | {{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 143 | {{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 144 | {{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 145 | 146 | // Local memory. 147 | {{ type }} Ey_p, Ez_p, Hy_n, Hz_n; 148 | {{ type }} bx, by, bz; 149 | {{ type }} px, py, pz, py_p, pz_p; 150 | 151 | int xn, xp; 152 | {{ type }} bloch_phaseX_x = 1; 153 | {{ type }} bloch_phaseX_y = 1; 154 | {{ type }} bloch_phaseX_z = 1; 155 | {{ type }} pc_x_factor[3] = { 1.0, 1.0, 1.0 }; 156 | int pc_ix_Ex = -1; 157 | int pc_ix_Ey = -1; 158 | int pc_ix_Ez = -1; 159 | if (_X == 0) { 160 | bloch_phaseX_x = conj(bloch_x(0)); 161 | bloch_phaseX_y = conj(bloch_x(1)); 162 | bloch_phaseX_z = conj(bloch_x(2)); 163 | if ( pemc(0) == 1 ) { 164 | pc_x_factor[1] = -1.0; 165 | pc_x_factor[2] = -1.0; 166 | pc_ix_Ex = 0; 167 | pc_ix_Ey = 1; 168 | pc_ix_Ez = 1; 169 | xn = 0; 170 | } else if ( pemc(0) == 2 ) { 171 | pc_x_factor[0] = -1.0; 172 | pc_ix_Ex = 0; 173 | pc_ix_Ey = 1; 174 | pc_ix_Ez = 1; 175 | xn = 0; 176 | } else { 177 | pc_ix_Ex = -1; 178 | pc_ix_Ey = -1; 179 | pc_ix_Ez = -1; 180 | xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction. 181 | } 182 | } else { 183 | xn = -1;} 184 | 185 | // Load E-fields into shared memory. 186 | if (adj_dims) { 187 | // Load in p = x 188 | Ex_0[0] = bloch_phaseX_x * bloch_phaseYZ_x * pc_x_factor[0] * pc_yz_factor[0] * 189 | (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z)) * 190 | Xx(pc_ix_Ex, pc_iy_Ex, pc_iz_Ex); 191 | Ey_0[0] = bloch_phaseX_y * bloch_phaseYZ_y * pc_x_factor[1] * pc_yz_factor[1] * 192 | (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z)) * 193 | Xy(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey); 194 | Ez_0[0] = bloch_phaseX_z * bloch_phaseYZ_z * pc_x_factor[2] * pc_yz_factor[2] * 195 | (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z)) * 196 | Xz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez); 197 | 198 | py_p = Xy(0, pc_iy_Ey, pc_iz_Ey); 199 | Ey_p = bloch_phaseYZ_y * pc_yz_factor[1] * (py_p) * 200 | (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 201 | 202 | pz_p = Xz(0, pc_iy_Ez, pc_iz_Ez); 203 | Ez_p = bloch_phaseYZ_z * pc_yz_factor[2] * (pz_p) * 204 | (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 205 | } 206 | __syncthreads(); 207 | 208 | // Calculate H-fields and store in shared_memory. 209 | // Hy. 210 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 211 | Hy_0[0] = my(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey) * 212 | (sx1(_X+xn) * (Ez_0[0] - Ez_p) - sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 213 | } 214 | 215 | // Hz. 
216 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 217 | Hz_0[0] = mz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez) * 218 | (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - sx1(_X+xn) * (Ey_0[0] - Ey_p)); 219 | } 220 | __syncthreads(); 221 | 222 | // reset the pemc factors and ix's 223 | pc_x_factor[0] = 1.0; 224 | pc_x_factor[1] = 1.0; 225 | pc_x_factor[2] = 1.0; 226 | pc_ix_Ex = 0; 227 | pc_ix_Ey = 1; 228 | pc_ix_Ez = 1; 229 | // start loop in x direction 230 | for (; _X < _x_end ; _X += _txx) { 231 | // We've moved ahead in X, so transfer appropriate field values. 232 | Ey_0[0] = Ey_p; 233 | Ez_0[0] = Ez_p; 234 | Hy_n = Hy_0[0]; 235 | Hz_n = Hz_0[0]; 236 | 237 | py = py_p; 238 | pz = pz_p; 239 | 240 | // Load E-fields into shared memory. 241 | if (_X == {{ dims[0]-1 }}){ 242 | if ( pemc(1) == 1 ) { 243 | // PEC 244 | pc_x_factor[1] = -1.0; 245 | pc_x_factor[2] = -1.0; 246 | pc_ix_Ex = -1; 247 | pc_ix_Ey = -1; 248 | pc_ix_Ez = -1; 249 | xp = 0; 250 | } else if ( pemc(1) == 2 ) { 251 | // PMC 252 | pc_x_factor[0] = -1.0; 253 | pc_ix_Ex = -1; 254 | pc_ix_Ey = -1; 255 | pc_ix_Ez = -1; 256 | xp = 0; 257 | } else { 258 | // bloch 259 | bloch_phaseX_x = bloch_x(0); 260 | bloch_phaseX_y = bloch_x(1); 261 | bloch_phaseX_z = bloch_x(2); 262 | xp = {{ -(dims[0]-1) }}; // Wrap-around step in the negative direction. 263 | } 264 | } else { 265 | xp = +1; 266 | bloch_phaseX_x = 1; 267 | bloch_phaseX_y = 1; 268 | bloch_phaseX_z = 1; 269 | } 270 | 271 | if (adj_dims) { 272 | px = Xx(pc_ix_Ex,pc_iy_Ex,pc_iz_Ex); 273 | Ex_0[0] = bloch_phaseYZ_x * pc_x_factor[0] * pc_yz_factor[0] * 274 | (px) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 275 | 276 | py_p = Xy(pc_ix_Ey,pc_iy_Ey,pc_iz_Ey); 277 | Ey_p = bloch_phaseX_y * bloch_phaseYZ_y * pc_x_factor[1] * pc_yz_factor[1] * 278 | (py_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 279 | 280 | pz_p = Xz(pc_ix_Ez,pc_iy_Ez,pc_iz_Ez); 281 | Ez_p = bloch_phaseX_z * bloch_phaseYZ_z * pc_x_factor[2] * pc_yz_factor[2] * 282 | (pz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 283 | } 284 | 285 | __syncthreads(); 286 | 287 | // Calculate H-fields and store in shared_memory. 288 | {% if mu_equals_1 == True %} 289 | // Hx. 290 | if ((_ty != _tyy) && (_tz != _tzz)) { 291 | Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 292 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 293 | } 294 | 295 | // Hy. 296 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 297 | Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) - 298 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 299 | } 300 | 301 | // Hz. 302 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 303 | Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 304 | sx1(_X) * (Ey_0[0] - Ey_p)); 305 | } 306 | {% else %} 307 | // Hx. 308 | if ((_ty != _tyy) && (_tz != _tzz)) { 309 | Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 310 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 311 | } 312 | 313 | // Hy. 314 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 315 | Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) - 316 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 317 | } 318 | 319 | // Hz. 320 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 321 | Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 322 | sx1(_X) * (Ey_0[0] - Ey_p)); 323 | } 324 | {% endif %} 325 | __syncthreads(); 326 | 327 | // Write out the results. 
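In the full-operator write-out that follows, the `sqrt_s*` weights are applied once when the fields are loaded and once, inverted, when the result is stored. Read as linear algebra this is a diagonal similarity transform, the usual trick for keeping the discretized FDFD system complex-symmetric. Schematically (a sketch; `apply_A` stands in for the raw curl-curl-minus-epsilon operator):

    import numpy as np

    def apply_symmetrized(apply_A, w, x):
        # Load: weight by sqrt(s); apply the raw operator; store: un-weight.
        # Equivalent to W^-1 A W with W = diag(w).
        return apply_A(w * x) / w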
328 | if (_in_global && _in_local) { 329 | {% if full_operator %} 330 | bx = ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) * 331 | (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn]) 332 | - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn]) 333 | - ex(0,0,0) * Ex_0[0])); 334 | by = ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) * 335 | (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn]) 336 | - sx0(_X) * (Hz_0[0] - Hz_n) 337 | - ey(0,0,0) * Ey_0[0])); 338 | bz = ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) * 339 | (sx0(_X) * (Hy_0[0] - Hy_n) 340 | - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn]) 341 | - ez(0,0,0) * Ez_0[0])); 342 | 343 | Bx(0,0,0) = bx; 344 | By(0,0,0) = by; 345 | Bz(0,0,0) = bz; 346 | 347 | {% else %} 348 | Bx(0,0,0) = Hx_0[0]; 349 | By(0,0,0) = Hy_0[0]; 350 | Bz(0,0,0) = Hz_0[0]; 351 | 352 | {% endif %} 353 | } 354 | __syncthreads(); 355 | } 356 | -------------------------------------------------------------------------------- /maxwell-solver/kernels/fdfd_residual_pec_pmc.cu: -------------------------------------------------------------------------------- 1 | // Mark the threads that need to load from global memory. 2 | const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \ 3 | ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \ 4 | ((_Z >= -1) && (_Z <= {{ dims[2] }}))); 5 | 6 | // Set relevant field pointers to create wrap-around periodic grid. 7 | {{ type }} bloch_phaseYZ_x = 1.0; 8 | {{ type }} bloch_phaseYZ_y = 1.0; 9 | {{ type }} bloch_phaseYZ_z = 1.0; 10 | 11 | {{ type }} pc_yz_factor[3] = { 1.0, 1.0, 1.0 }; 12 | int pc_iy_Ex = 0; 13 | int pc_iz_Ex = 0; 14 | int pc_iy_Ey = 0; 15 | int pc_iz_Ey = 0; 16 | int pc_iy_Ez = 0; 17 | int pc_iz_Ez = 0; 18 | 19 | if (_Y == -1) { 20 | _Y = {{ dims[1]-1 }}; 21 | bloch_phaseYZ_x *= conj(bloch_y(0)); 22 | bloch_phaseYZ_y *= conj(bloch_y(1)); 23 | bloch_phaseYZ_z *= conj(bloch_y(2)); 24 | if ( pemc(2) == 1 ) { 25 | //PEC (anti-symmetric) 26 | _Y = 0; 27 | pc_yz_factor[0] = -1.0; 28 | pc_yz_factor[2] = -1.0; 29 | pc_iy_Ex = 1; 30 | pc_iy_Ey = 0; 31 | pc_iy_Ez = 1; 32 | } else if ( pemc(2) == 2 ) { 33 | //PMC (symmetric) 34 | _Y = 0; 35 | pc_yz_factor[1] = -1.0; 36 | pc_iy_Ex = 1; 37 | pc_iy_Ey = 0; 38 | pc_iy_Ez = 1; 39 | } 40 | } 41 | if (_Y == {{ dims[1]-1 }}) { 42 | if ( pemc(3) == 1 ) { 43 | pc_iy_Ey = -1; 44 | } 45 | if ( pemc(3) == 2 ) { 46 | pc_iy_Ey = -1; 47 | pc_yz_factor[1] = -1.0; 48 | } 49 | } 50 | if (_Y == {{ dims[1] }}) { 51 | _Y = 0; 52 | bloch_phaseYZ_x *= bloch_y(0); 53 | bloch_phaseYZ_y *= bloch_y(1); 54 | bloch_phaseYZ_z *= bloch_y(2); 55 | if ( pemc(3) == 1 ) { 56 | //PEC 57 | _Y = {{ dims[1]-1 }}; 58 | pc_yz_factor[0] = -1.0; 59 | pc_yz_factor[2] = -1.0; 60 | pc_iy_Ex = -1; 61 | pc_iy_Ey = 0; 62 | pc_iy_Ez = -1; 63 | } 64 | if ( pemc(3) == 2 ) { 65 | //PMC 66 | _Y = {{ dims[1]-1 }}; 67 | pc_yz_factor[1] = -1.0; //this value does not matter 68 | pc_iy_Ex = -1; 69 | pc_iy_Ey = 0; 70 | pc_iy_Ez = -1; 71 | } 72 | } 73 | if (_Z == -1) { 74 | _Z = {{ dims[2]-1 }}; 75 | bloch_phaseYZ_x *= conj(bloch_z(0)); 76 | bloch_phaseYZ_y *= conj(bloch_z(1)); 77 | bloch_phaseYZ_z *= conj(bloch_z(2)); 78 | if ( pemc(4) == 1 ) { 79 | //PEC (anti-symmetric) 80 | _Z = 0; 81 | pc_yz_factor[0] = -1.0; 82 | pc_yz_factor[1] = -1.0; 83 | pc_iz_Ex = 1; 84 | pc_iz_Ey = 1; 85 | pc_iz_Ez = 0; 86 | } else if ( pemc(4) == 2 ) { 87 | //PMC (symmetric) 88 | _Z = 0; 89 | pc_yz_factor[2] = -1.0; 90 | pc_iz_Ex = 1; 91 | pc_iz_Ey = 1; 92 | pc_iz_Ez = 0; 93 | } 94 | } 95 | if (_Z == {{ dims[2]-1 }}) { 96 | 97 | if ( pemc(5) == 1 ) { 98 | pc_iz_Ez = -1; 99 | } 100 | if ( pemc(5) == 2 ) { 
101 | pc_iz_Ez = -1; 102 | pc_yz_factor[2] = -1.0; 103 | } 104 | } 105 | if (_Z == {{ dims[2] }}) { 106 | _Z = 0; 107 | bloch_phaseYZ_x *= bloch_z(0); 108 | bloch_phaseYZ_y *= bloch_z(1); 109 | bloch_phaseYZ_z *= bloch_z(2); 110 | if ( pemc(5) == 1 ) { 111 | //PEC 112 | _Z = {{ dims[2]-1 }}; 113 | pc_yz_factor[0] = -1.0; 114 | pc_yz_factor[1] = -1.0; 115 | pc_iz_Ex = -1; 116 | pc_iz_Ey = -1; 117 | pc_iz_Ez = 0; 118 | } else if ( pemc(5) == 2 ) { 119 | //PMC 120 | _Z = {{ dims[2]-1 }}; 121 | pc_yz_factor[2] = 0.0; //this value does not matter 122 | pc_iz_Ex = -1; 123 | pc_iz_Ey = -1; 124 | pc_iz_Ez = 0; 125 | } 126 | } 127 | 128 | // Some definitions for shared memory. 129 | // Used to get unpadded thread indices. 130 | #define s_ty (_ty + 1) 131 | #define s_tz (_tz + 1) 132 | #define s_tyy (_tyy + 2) 133 | #define s_tzz (_tzz + 2) 134 | 135 | // Helper definitions. 136 | #define s_next_field (s_tyy * s_tzz) 137 | #define s_to_local (s_ty * s_tzz + (s_tz)) 138 | #define s_zp +1 139 | #define s_zn -1 140 | #define s_yp +s_tzz 141 | #define s_yn -s_tzz 142 | 143 | {{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 144 | {{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 145 | {{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 146 | {{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 147 | {{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 148 | {{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 149 | 150 | // Local memory. 151 | {{ type }} Ey_p, Ez_p, Hy_n, Hz_n; 152 | {{ type }} rx, ry, rz; 153 | {{ type }} px, py, pz, py_p, pz_p; 154 | 155 | int xn, xp; 156 | {{ type }} bloch_phaseX_x = 1; 157 | {{ type }} bloch_phaseX_y = 1; 158 | {{ type }} bloch_phaseX_z = 1; 159 | {{ type }} pc_x_factor[3] = { 1.0, 1.0, 1.0 }; 160 | int pc_ix_Ex = -1; 161 | int pc_ix_Ey = -1; 162 | int pc_ix_Ez = -1; 163 | if (_X == 0) { 164 | bloch_phaseX_x = conj(bloch_x(0)); 165 | bloch_phaseX_y = conj(bloch_x(1)); 166 | bloch_phaseX_z = conj(bloch_x(2)); 167 | if ( pemc(0) == 1 ) { 168 | pc_x_factor[1] = -1.0; 169 | pc_x_factor[2] = -1.0; 170 | pc_ix_Ex = 0; 171 | pc_ix_Ey = 1; 172 | pc_ix_Ez = 1; 173 | xn = 0; 174 | } else if ( pemc(0) == 2 ) { 175 | pc_x_factor[0] = -1.0; 176 | pc_ix_Ex = 0; 177 | pc_ix_Ey = 1; 178 | pc_ix_Ez = 1; 179 | xn = 0; 180 | } else { 181 | pc_ix_Ex = -1; 182 | pc_ix_Ey = -1; 183 | pc_ix_Ez = -1; 184 | xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction. 185 | } 186 | } else { 187 | xn = -1;} 188 | 189 | // Load E-fields into shared memory. 
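The loads that follow feed the residual this kernel ultimately writes, r = b - A x, evaluated with the same symmetrized operator; a host-side sketch of the quantity and of the relative norm a solver would typically monitor:

    import numpy as np

    def residual(apply_A, b, x):
        r = b - apply_A(x)
        return r, np.linalg.norm(r) / np.linalg.norm(b)   # relative residual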
190 | if (adj_dims) { 191 | // Load in p = x 192 | Ex_0[0] = bloch_phaseX_x * bloch_phaseYZ_x * pc_x_factor[0] * pc_yz_factor[0] * 193 | (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z)) * 194 | Xx(pc_ix_Ex, pc_iy_Ex, pc_iz_Ex); 195 | Ey_0[0] = bloch_phaseX_y * bloch_phaseYZ_y * pc_x_factor[1] * pc_yz_factor[1] * 196 | (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z)) * 197 | Xy(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey); 198 | Ez_0[0] = bloch_phaseX_z * bloch_phaseYZ_z * pc_x_factor[2] * pc_yz_factor[2] * 199 | (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z)) * 200 | Xz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez); 201 | 202 | py_p = Xy(0, pc_iy_Ey, pc_iz_Ey); 203 | Ey_p = bloch_phaseYZ_y * pc_yz_factor[1] * (py_p) * 204 | (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 205 | 206 | pz_p = Xz(0, pc_iy_Ez, pc_iz_Ez); 207 | Ez_p = bloch_phaseYZ_z * pc_yz_factor[2] * (pz_p) * 208 | (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 209 | } 210 | __syncthreads(); 211 | 212 | // Calculate H-fields and store in shared_memory. 213 | // Hy. 214 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 215 | Hy_0[0] = my(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey) * 216 | (sx1(_X+xn) * (Ez_0[0] - Ez_p) - sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 217 | } 218 | 219 | // Hz. 220 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 221 | Hz_0[0] = mz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez) * 222 | (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - sx1(_X+xn) * (Ey_0[0] - Ey_p)); 223 | } 224 | __syncthreads(); 225 | 226 | // reset the pemc factors and ix's 227 | pc_x_factor[0] = 1.0; 228 | pc_x_factor[1] = 1.0; 229 | pc_x_factor[2] = 1.0; 230 | pc_ix_Ex = 0; 231 | pc_ix_Ey = 1; 232 | pc_ix_Ez = 1; 233 | // start loop in x direction 234 | for (; _X < _x_end ; _X += _txx) { 235 | // We've moved ahead in X, so transfer appropriate field values. 236 | Ey_0[0] = Ey_p; 237 | Ez_0[0] = Ez_p; 238 | Hy_n = Hy_0[0]; 239 | Hz_n = Hz_0[0]; 240 | 241 | py = py_p; 242 | pz = pz_p; 243 | 244 | // Load E-fields into shared memory. 245 | if (_X == {{ dims[0]-1 }}){ 246 | if ( pemc(1) == 1 ) { 247 | // PEC 248 | pc_x_factor[1] = -1.0; 249 | pc_x_factor[2] = -1.0; 250 | pc_ix_Ex = -1; 251 | pc_ix_Ey = -1; 252 | pc_ix_Ez = -1; 253 | xp = 0; 254 | } else if ( pemc(1) == 2 ) { 255 | // PMC 256 | pc_x_factor[0] = -1.0; 257 | pc_ix_Ex = -1; 258 | pc_ix_Ey = -1; 259 | pc_ix_Ez = -1; 260 | xp = 0; 261 | } else { 262 | // bloch 263 | bloch_phaseX_x = bloch_x(0); 264 | bloch_phaseX_y = bloch_x(1); 265 | bloch_phaseX_z = bloch_x(2); 266 | xp = {{ -(dims[0]-1) }}; // Wrap-around step in the negative direction. 267 | } 268 | } else { 269 | xp = +1; 270 | bloch_phaseX_x = 1; 271 | bloch_phaseX_y = 1; 272 | bloch_phaseX_z = 1; 273 | } 274 | 275 | if (adj_dims) { 276 | px = Xx(pc_ix_Ex,pc_iy_Ex,pc_iz_Ex); 277 | Ex_0[0] = bloch_phaseYZ_x * pc_x_factor[0] * pc_yz_factor[0] * 278 | (px) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 279 | 280 | py_p = Xy(pc_ix_Ey,pc_iy_Ey,pc_iz_Ey); 281 | Ey_p = bloch_phaseX_y * bloch_phaseYZ_y * pc_x_factor[1] * pc_yz_factor[1] * 282 | (py_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 283 | 284 | pz_p = Xz(pc_ix_Ez,pc_iy_Ez,pc_iz_Ez); 285 | Ez_p = bloch_phaseX_z * bloch_phaseYZ_z * pc_x_factor[2] * pc_yz_factor[2] * 286 | (pz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 287 | } 288 | 289 | __syncthreads(); 290 | 291 | // Calculate H-fields and store in shared_memory. 292 | {% if mu_equals_1 == True %} 293 | // Hx. 
294 | if ((_ty != _tyy) && (_tz != _tzz)) { 295 | Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 296 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 297 | } 298 | 299 | // Hy. 300 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 301 | Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) - 302 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 303 | } 304 | 305 | // Hz. 306 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 307 | Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 308 | sx1(_X) * (Ey_0[0] - Ey_p)); 309 | } 310 | {% else %} 311 | // Hx. 312 | if ((_ty != _tyy) && (_tz != _tzz)) { 313 | Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 314 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 315 | } 316 | 317 | // Hy. 318 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 319 | Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) - 320 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 321 | } 322 | 323 | // Hz. 324 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 325 | Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 326 | sx1(_X) * (Ey_0[0] - Ey_p)); 327 | } 328 | {% endif %} 329 | __syncthreads(); 330 | 331 | // Write out the results. 332 | if (_in_global && _in_local) { 333 | {% if full_operator %} 334 | rx = Bx(0,0,0) - ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) * 335 | (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn]) 336 | - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn]) 337 | - ex(0,0,0) * Ex_0[0])); 338 | ry = By(0,0,0) - ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) * 339 | (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn]) 340 | - sx0(_X) * (Hz_0[0] - Hz_n) 341 | - ey(0,0,0) * Ey_0[0])); 342 | rz = Bz(0,0,0) - ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) * 343 | (sx0(_X) * (Hy_0[0] - Hy_n) 344 | - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn]) 345 | - ez(0,0,0) * Ez_0[0])); 346 | 347 | Rx(0,0,0) = rx; 348 | Ry(0,0,0) = ry; 349 | Rz(0,0,0) = rz; 350 | 351 | {% else %} 352 | Rx(0,0,0) = Hx_0[0]; 353 | Ry(0,0,0) = Hy_0[0]; 354 | Rz(0,0,0) = Hz_0[0]; 355 | 356 | {% endif %} 357 | } 358 | __syncthreads(); 359 | } 360 | -------------------------------------------------------------------------------- /maxwell-solver/kernels/omega_bloch_pmc_pec.cu: -------------------------------------------------------------------------------- 1 | // Mark the threads that need to load from global memory. 2 | const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \ 3 | ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \ 4 | ((_Z >= -1) && (_Z <= {{ dims[2] }}))); 5 | 6 | // Set relevant field pointers to create wrap-around periodic grid. 
7 | {{ type }} bloch_phaseYZ_x = 1.0; 8 | {{ type }} bloch_phaseYZ_y = 1.0; 9 | {{ type }} bloch_phaseYZ_z = 1.0; 10 | 11 | {{ type }} pc_yz_factor[3] = { 1.0, 1.0, 1.0 }; 12 | int pc_iy_Ex = 0; 13 | int pc_iz_Ex = 0; 14 | int pc_iy_Ey = 0; 15 | int pc_iz_Ey = 0; 16 | int pc_iy_Ez = 0; 17 | int pc_iz_Ez = 0; 18 | 19 | if (_Y == -1) { 20 | _Y = {{ dims[1]-1 }}; 21 | bloch_phaseYZ_x *= bloch_y(0); 22 | bloch_phaseYZ_y *= bloch_y(1); 23 | bloch_phaseYZ_z *= bloch_y(2); 24 | if ( pemc(2) == 1 ) { 25 | //PEC (anti-symmetric) 26 | _Y = 0; 27 | pc_yz_factor[0] = -1.0; 28 | pc_yz_factor[2] = -1.0; 29 | pc_iy_Ex = 1; 30 | pc_iy_Ey = 0; 31 | pc_iy_Ez = 1; 32 | } else if ( pemc(2) == 2 ) { 33 | //PMC (symmetric) 34 | _Y = 0; 35 | pc_yz_factor[1] = -1.0; 36 | pc_iy_Ex = 1; 37 | pc_iy_Ey = 0; 38 | pc_iy_Ez = 1; 39 | } 40 | } 41 | if (_Y == {{ dims[1]-1 }}) { 42 | if ( pemc(3) == 1 ) { 43 | pc_iy_Ey = -1; 44 | } 45 | if ( pemc(3) == 2 ) { 46 | pc_iy_Ey = -1; 47 | pc_yz_factor[1] = -1.0; 48 | } 49 | } 50 | if (_Y == {{ dims[1] }}) { 51 | _Y = 0; 52 | bloch_phaseYZ_x *= conj(bloch_y(0)); 53 | bloch_phaseYZ_y *= conj(bloch_y(1)); 54 | bloch_phaseYZ_z *= conj(bloch_y(2)); 55 | if ( pemc(3) == 1 ) { 56 | //PEC 57 | _Y = {{ dims[1]-1 }}; 58 | pc_yz_factor[0] = -1.0; 59 | pc_yz_factor[2] = -1.0; 60 | pc_iy_Ex = -1; 61 | pc_iy_Ey = 0; 62 | pc_iy_Ez = -1; 63 | } 64 | if ( pemc(3) == 2 ) { 65 | //PMC 66 | _Y = {{ dims[1]-1 }}; 67 | pc_yz_factor[1] = -1.0; 68 | pc_iy_Ex = -1; 69 | pc_iy_Ey = 0; 70 | pc_iy_Ez = -1; 71 | } 72 | } 73 | if (_Z == -1) { 74 | _Z = {{ dims[2]-1 }}; 75 | bloch_phaseYZ_x *= bloch_z(0); 76 | bloch_phaseYZ_y *= bloch_z(1); 77 | bloch_phaseYZ_z *= bloch_z(2); 78 | if ( pemc(4) == 1 ) { 79 | //PEC (anti-symmetric) 80 | _Z = 0; 81 | pc_yz_factor[0] = -1.0; 82 | pc_yz_factor[1] = -1.0; 83 | pc_iz_Ex = 1; 84 | pc_iz_Ey = 1; 85 | pc_iz_Ez = 0; 86 | } else if ( pemc(4) == 2 ) { 87 | //PMC (symmetric) 88 | _Z = 0; 89 | pc_yz_factor[2] = -1.0; 90 | pc_iz_Ex = 1; 91 | pc_iz_Ey = 1; 92 | pc_iz_Ez = 0; 93 | } 94 | } 95 | if (_Z == {{ dims[2]-1 }}) { 96 | if ( pemc(5) == 1 ) { 97 | pc_iz_Ez = -1; 98 | } 99 | if ( pemc(5) == 2 ) { 100 | pc_iz_Ez = -1; 101 | pc_yz_factor[2] = -1.0; 102 | } 103 | } 104 | if (_Z == {{ dims[2] }}) { 105 | _Z = 0; 106 | bloch_phaseYZ_x *= conj(bloch_z(0)); 107 | bloch_phaseYZ_y *= conj(bloch_z(1)); 108 | bloch_phaseYZ_z *= conj(bloch_z(2)); 109 | if ( pemc(5) == 1 ) { 110 | //PEC 111 | _Z = {{ dims[2]-1 }}; 112 | pc_yz_factor[0] = -1.0; 113 | pc_yz_factor[1] = -1.0; 114 | pc_iz_Ex = -1; 115 | pc_iz_Ey = -1; 116 | pc_iz_Ez = 0; 117 | } else if ( pemc(5) == 2 ) { 118 | //PMC 119 | _Z = {{ dims[2]-1 }}; 120 | pc_yz_factor[2] = 0.0; //this value does not matter 121 | pc_iz_Ex = -1; 122 | pc_iz_Ey = -1; 123 | pc_iz_Ez = 0; 124 | } 125 | } 126 | 127 | // Some definitions for shared memory. 128 | // Used to get unpadded thread indices. 129 | #define s_ty (_ty + 1) 130 | #define s_tz (_tz + 1) 131 | #define s_tyy (_tyy + 2) 132 | #define s_tzz (_tzz + 2) 133 | 134 | // Helper definitions. 
135 | #define s_next_field (s_tyy * s_tzz) 136 | #define s_to_local (s_ty * s_tzz + (s_tz)) 137 | #define s_zp +1 138 | #define s_zn -1 139 | #define s_yp +s_tzz 140 | #define s_yn -s_tzz 141 | 142 | {{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 143 | {{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 144 | {{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 145 | {{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 146 | {{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 147 | {{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 148 | 149 | // Local memory. 150 | {{ type }} Ey_p, Ez_p, Hy_n, Hz_n; 151 | {{ type }} tx, ty, tz; 152 | {{ type }} sx, sy, sz, sy_p, sz_p; 153 | 154 | int xn, xp; 155 | {{ type }} bloch_phaseX_x = 1; 156 | {{ type }} bloch_phaseX_y = 1; 157 | {{ type }} bloch_phaseX_z = 1; 158 | {{ type }} pc_x_factor[3] = { 1.0, 1.0, 1.0 }; 159 | int pc_ix_Ex = -1; 160 | int pc_ix_Ey = -1; 161 | int pc_ix_Ez = -1; 162 | if (_X == 0) { 163 | bloch_phaseX_x = bloch_x(0); 164 | bloch_phaseX_y = bloch_x(1); 165 | bloch_phaseX_z = bloch_x(2); 166 | if ( pemc(0) == 1 ) { 167 | pc_x_factor[1] = -1.0; 168 | pc_x_factor[2] = -1.0; 169 | pc_ix_Ex = 0; 170 | pc_ix_Ey = 1; 171 | pc_ix_Ez = 1; 172 | xn = 0; 173 | } else if ( pemc(0) == 2 ) { 174 | pc_x_factor[0] = -1.0; 175 | pc_ix_Ex = 0; 176 | pc_ix_Ey = 1; 177 | pc_ix_Ez = 1; 178 | xn = 0; 179 | } else { 180 | pc_ix_Ex = -1; 181 | pc_ix_Ey = -1; 182 | pc_ix_Ez = -1; 183 | xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction. 184 | } 185 | } else { 186 | xn = -1;} 187 | 188 | // Load E-fields into shared memory. 189 | if (adj_dims) { 190 | // Load in s = r - alpha * v. 191 | Ex_0[0] = bloch_phaseX_x * bloch_phaseYZ_x * pc_x_factor[0] * pc_yz_factor[0] * 192 | (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z)) * 193 | ( Rx(pc_ix_Ex, pc_iy_Ex, pc_iz_Ex) 194 | - alpha * Vx(pc_ix_Ex, pc_iy_Ex, pc_iz_Ex)); 195 | Ey_0[0] = bloch_phaseX_y * bloch_phaseYZ_y * pc_x_factor[1] * pc_yz_factor[1] * 196 | (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z)) * 197 | ( Ry(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey) 198 | - alpha * Vy(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey)); 199 | Ez_0[0] = bloch_phaseX_z * bloch_phaseYZ_z * pc_x_factor[2] * pc_yz_factor[2] * 200 | (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z)) * 201 | ( Rz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez) 202 | - alpha * Vz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez)); 203 | 204 | 205 | // Ey_p = Ry(0,0,0) - alpha * Vy(0,0,0); 206 | sy_p = Ry(0, pc_iy_Ey, pc_iz_Ey) 207 | - alpha * Vy(0, pc_iy_Ey, pc_iz_Ey); 208 | Ey_p = bloch_phaseYZ_y * pc_yz_factor[1] * (sy_p) * 209 | (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 210 | 211 | // Ez_p = Rz(0,0,0) - alpha * Vz(0,0,0); 212 | sz_p = Rz(0, pc_iy_Ez, pc_iz_Ez) 213 | - alpha * Vz(0, pc_iy_Ez, pc_iz_Ez); 214 | Ez_p = bloch_phaseYZ_z * pc_yz_factor[2] * (sz_p) * 215 | (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 216 | } 217 | __syncthreads(); 218 | 219 | // Calculate H-fields and store in shared_memory. 220 | // Hy. 221 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 222 | Hy_0[0] = my(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey) * 223 | (sx1(_X+xn) * (Ez_0[0] - Ez_p) - sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 224 | } 225 | 226 | // Hz. 
227 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 228 | Hz_0[0] = mz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez) * 229 | (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - sx1(_X+xn) * (Ey_0[0] - Ey_p)); 230 | } 231 | __syncthreads(); 232 | 233 | // reset the pemc factors and ix's 234 | pc_x_factor[0] = 1.0; 235 | pc_x_factor[1] = 1.0; 236 | pc_x_factor[2] = 1.0; 237 | pc_ix_Ex = 0; 238 | pc_ix_Ey = 1; 239 | pc_ix_Ez = 1; 240 | // start loop in x direction 241 | for (; _X < _x_end ; _X += _txx) { 242 | // We've moved ahead in X, so transfer appropriate field values. 243 | Ey_0[0] = Ey_p; 244 | Ez_0[0] = Ez_p; 245 | Hy_n = Hy_0[0]; 246 | Hz_n = Hz_0[0]; 247 | 248 | sy = sy_p; 249 | sz = sz_p; 250 | 251 | // Load E-fields into shared memory. 252 | if (_X == {{ dims[0]-1 }}){ 253 | if ( pemc(1) == 1 ) { 254 | // PEC 255 | pc_x_factor[1] = -1.0; 256 | pc_x_factor[2] = -1.0; 257 | pc_ix_Ex = -1; 258 | pc_ix_Ey = -1; 259 | pc_ix_Ez = -1; 260 | xp = 0; 261 | } else if ( pemc(1) == 2 ) { 262 | // PMC 263 | pc_x_factor[0] = -1.0; 264 | pc_ix_Ex = -1; 265 | pc_ix_Ey = -1; 266 | pc_ix_Ez = -1; 267 | xp = 0; 268 | } else { 269 | // bloch 270 | bloch_phaseX_x = conj(bloch_x(0)); 271 | bloch_phaseX_y = conj(bloch_x(1)); 272 | bloch_phaseX_z = conj(bloch_x(2)); 273 | xp = {{ -(dims[0]-1) }}; // Wrap-around step in the negative direction. 274 | } 275 | } else { 276 | xp = +1; 277 | bloch_phaseX_x = 1; 278 | bloch_phaseX_y = 1; 279 | bloch_phaseX_z = 1; 280 | } 281 | 282 | if (adj_dims) { 283 | sx = Rx(pc_ix_Ex,pc_iy_Ex,pc_iz_Ex) - alpha * Vx(pc_ix_Ex,pc_iy_Ex,pc_iz_Ex); 284 | Ex_0[0] = bloch_phaseYZ_x * pc_x_factor[0] * pc_yz_factor[0] * 285 | (sx) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 286 | 287 | sy_p = Ry(pc_ix_Ey,pc_iy_Ey,pc_iz_Ey) - alpha * Vy(pc_ix_Ey,pc_iy_Ey,pc_iz_Ey); 288 | Ey_p = bloch_phaseX_y * bloch_phaseYZ_y * pc_x_factor[1] * pc_yz_factor[1] * 289 | (sy_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 290 | 291 | sz_p = Rz(pc_ix_Ez,pc_iy_Ez,pc_iz_Ez) - alpha * Vz(pc_ix_Ez,pc_iy_Ez,pc_iz_Ez); 292 | Ez_p = bloch_phaseX_z * bloch_phaseYZ_z * pc_x_factor[2] * pc_yz_factor[2] * 293 | (sz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 294 | } 295 | 296 | __syncthreads(); 297 | 298 | // Calculate H-fields and store in shared_memory. 299 | {% if mu_equals_1 == True %} 300 | // Hx. 301 | if ((_ty != _tyy) && (_tz != _tzz)) { 302 | Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 303 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 304 | } 305 | 306 | // Hy. 307 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 308 | Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) - 309 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 310 | } 311 | 312 | // Hz. 313 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 314 | Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 315 | sx1(_X) * (Ey_0[0] - Ey_p)); 316 | } 317 | {% else %} 318 | // Hx. 319 | if ((_ty != _tyy) && (_tz != _tzz)) { 320 | Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 321 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 322 | } 323 | 324 | // Hy. 325 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 326 | Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) - 327 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 328 | } 329 | 330 | // Hz. 331 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 332 | Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 333 | sx1(_X) * (Ey_0[0] - Ey_p)); 334 | } 335 | {% endif %} 336 | __syncthreads(); 337 | 338 | // Write out the results. 
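// (The omega_num and omega_denom sums below accumulate the local parts of
//  t'*s and t'*t; after the global reduction, the host forms
//  omega = (t'*s) / (t'*t), the least-squares weight of the biCGSTAB
//  stabilization step that minimizes ||s - omega*t||.)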
339 | if (_in_global && _in_local) { 340 | {% if full_operator %} 341 | Sx(0,0,0) = sx; 342 | Sy(0,0,0) = sy; 343 | Sz(0,0,0) = sz; 344 | 345 | tx = ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) * 346 | (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn]) 347 | - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn]) 348 | - ex(0,0,0) * Ex_0[0])); 349 | ty = ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) * 350 | (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn]) 351 | - sx0(_X) * (Hz_0[0] - Hz_n) 352 | - ey(0,0,0) * Ey_0[0])); 353 | tz = ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) * 354 | (sx0(_X) * (Hy_0[0] - Hy_n) 355 | - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn]) 356 | - ez(0,0,0) * Ez_0[0])); 357 | 358 | Tx(0,0,0) = tx; 359 | Ty(0,0,0) = ty; 360 | Tz(0,0,0) = tz; 361 | 362 | omega_num += conj(tx) * sx + conj(ty) * sy + conj(tz) * sz; 363 | omega_denom += (real(tx) * real(tx)) + (imag(tx) * imag(tx)) + 364 | (real(ty) * real(ty)) + (imag(ty) * imag(ty)) + 365 | (real(tz) * real(tz)) + (imag(tz) * imag(tz)); 366 | //omega_num += tx * sx + ty * sy + tz * sz; 367 | //omega_denom += tx * tx + ty * ty + tz * tz; 368 | 369 | {% else %} 370 | Vx(0,0,0) = Hx_0[0]; 371 | Vy(0,0,0) = Hy_0[0]; 372 | Vz(0,0,0) = Hz_0[0]; 373 | 374 | {% endif %} 375 | } 376 | __syncthreads(); 377 | } 378 | -------------------------------------------------------------------------------- /maxwell-solver/kernels/alpha_bloch_pmc_pec.cu: -------------------------------------------------------------------------------- 1 | // Mark the threads that need to load from global memory. 2 | const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \ 3 | ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \ 4 | ((_Z >= -1) && (_Z <= {{ dims[2] }}))); 5 | 6 | // Set relevant field pointers to create wrap-around periodic grid. 7 | {{ type }} bloch_phaseYZ_x = 1.0; 8 | {{ type }} bloch_phaseYZ_y = 1.0; 9 | {{ type }} bloch_phaseYZ_z = 1.0; 10 | 11 | {{ type }} pc_yz_factor[3] = { 1.0, 1.0, 1.0 }; 12 | int pc_iy_Ex = 0; 13 | int pc_iz_Ex = 0; 14 | int pc_iy_Ey = 0; 15 | int pc_iz_Ey = 0; 16 | int pc_iy_Ez = 0; 17 | int pc_iz_Ez = 0; 18 | 19 | if (_Y == -1) { 20 | _Y = {{ dims[1]-1 }}; 21 | bloch_phaseYZ_x *= bloch_y(0); 22 | bloch_phaseYZ_y *= bloch_y(1); 23 | bloch_phaseYZ_z *= bloch_y(2); 24 | if ( pemc(2) == 1 ) { 25 | //PEC (anti-symmetric) 26 | _Y = 0; 27 | pc_yz_factor[0] = -1.0; 28 | pc_yz_factor[2] = -1.0; 29 | pc_iy_Ex = 1; 30 | pc_iy_Ey = 0; 31 | pc_iy_Ez = 1; 32 | } else if ( pemc(2) == 2 ) { 33 | //PMC (symmetric) 34 | _Y = 0; 35 | pc_yz_factor[1] = -1.0; 36 | pc_iy_Ex = 1; 37 | pc_iy_Ey = 0; 38 | pc_iy_Ez = 1; 39 | } 40 | } 41 | if (_Y == {{ dims[1]-1 }}) { 42 | if ( pemc(3) == 1 ) { 43 | pc_iy_Ey = -1; 44 | } 45 | if ( pemc(3) == 2 ) { 46 | pc_iy_Ey = -1; 47 | pc_yz_factor[1] = -1.0; 48 | } 49 | } 50 | if (_Y == {{ dims[1] }}) { 51 | _Y = 0; 52 | bloch_phaseYZ_x *= conj(bloch_y(0)); 53 | bloch_phaseYZ_y *= conj(bloch_y(1)); 54 | bloch_phaseYZ_z *= conj(bloch_y(2)); 55 | if ( pemc(3) == 1 ) { 56 | //PEC 57 | _Y = {{ dims[1]-1 }}; 58 | pc_yz_factor[0] = -1.0; 59 | pc_yz_factor[2] = -1.0; 60 | pc_iy_Ex = -1; 61 | pc_iy_Ey = 0; 62 | pc_iy_Ez = -1; 63 | } 64 | if ( pemc(3) == 2 ) { 65 | //PMC 66 | _Y = {{ dims[1]-1 }}; 67 | pc_yz_factor[1] = -1.0; 68 | pc_iy_Ex = -1; 69 | pc_iy_Ey = 0; 70 | pc_iy_Ez = -1; 71 | } 72 | } 73 | if (_Z == -1) { 74 | _Z = {{ dims[2]-1 }}; 75 | bloch_phaseYZ_x *= bloch_z(0); 76 | bloch_phaseYZ_y *= bloch_z(1); 77 | bloch_phaseYZ_z *= bloch_z(2); 78 | if ( pemc(4) == 1 ) { 79 | //PEC (anti-symmetric) 80 | _Z = 0; 81 | pc_yz_factor[0] = -1.0; 82 | 
pc_yz_factor[1] = -1.0; 83 | pc_iz_Ex = 1; 84 | pc_iz_Ey = 1; 85 | pc_iz_Ez = 0; 86 | } else if ( pemc(4) == 2 ) { 87 | //PMC (symmetric) 88 | _Z = 0; 89 | pc_yz_factor[2] = -1.0; 90 | pc_iz_Ex = 1; 91 | pc_iz_Ey = 1; 92 | pc_iz_Ez = 0; 93 | } 94 | } 95 | if (_Z == {{ dims[2]-1 }}) { 96 | if ( pemc(5) == 1 ) { 97 | pc_iz_Ez = -1; 98 | } 99 | if ( pemc(5) == 2 ) { 100 | pc_iz_Ez = -1; 101 | pc_yz_factor[2] = -1.0; 102 | } 103 | } 104 | if (_Z == {{ dims[2] }}) { 105 | _Z = 0; 106 | bloch_phaseYZ_x *= conj(bloch_z(0)); 107 | bloch_phaseYZ_y *= conj(bloch_z(1)); 108 | bloch_phaseYZ_z *= conj(bloch_z(2)); 109 | if ( pemc(5) == 1 ) { 110 | //PEC 111 | _Z = {{ dims[2]-1 }}; 112 | pc_yz_factor[0] = -1.0; 113 | pc_yz_factor[1] = -1.0; 114 | pc_iz_Ex = -1; 115 | pc_iz_Ey = -1; 116 | pc_iz_Ez = 0; 117 | } else if ( pemc(5) == 2 ) { 118 | //PMC 119 | _Z = {{ dims[2]-1 }}; 120 | pc_yz_factor[2] = 0.0; //this value does not matter 121 | pc_iz_Ex = -1; 122 | pc_iz_Ey = -1; 123 | pc_iz_Ez = 0; 124 | } 125 | } 126 | 127 | // Some definitions for shared memory. 128 | // Used to get unpadded thread indices. 129 | #define s_ty (_ty + 1) 130 | #define s_tz (_tz + 1) 131 | #define s_tyy (_tyy + 2) 132 | #define s_tzz (_tzz + 2) 133 | 134 | // Helper definitions. 135 | #define s_next_field (s_tyy * s_tzz) 136 | #define s_to_local (s_ty * s_tzz + (s_tz)) 137 | #define s_zp +1 138 | #define s_zn -1 139 | #define s_yp +s_tzz 140 | #define s_yn -s_tzz 141 | 142 | {{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 143 | {{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 144 | {{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 145 | {{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 146 | {{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 147 | {{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 148 | 149 | // Local memory. 150 | {{ type }} Ey_p, Ez_p, Hy_n, Hz_n; 151 | {{ type }} vx, vy, vz; 152 | {{ type }} px, py, pz, py_p, pz_p; 153 | 154 | int xn, xp; 155 | {{ type }} bloch_phaseX_x = 1; 156 | {{ type }} bloch_phaseX_y = 1; 157 | {{ type }} bloch_phaseX_z = 1; 158 | {{ type }} pc_x_factor[3] = { 1.0, 1.0, 1.0 }; 159 | int pc_ix_Ex = -1; 160 | int pc_ix_Ey = -1; 161 | int pc_ix_Ez = -1; 162 | if (_X == 0) { 163 | bloch_phaseX_x = bloch_x(0); 164 | bloch_phaseX_y = bloch_x(1); 165 | bloch_phaseX_z = bloch_x(2); 166 | if ( pemc(0) == 1 ) { 167 | pc_x_factor[1] = -1.0; 168 | pc_x_factor[2] = -1.0; 169 | pc_ix_Ex = 0; 170 | pc_ix_Ey = 1; 171 | pc_ix_Ez = 1; 172 | xn = 0; 173 | } else if ( pemc(0) == 2 ) { 174 | pc_x_factor[0] = -1.0; 175 | pc_ix_Ex = 0; 176 | pc_ix_Ey = 1; 177 | pc_ix_Ez = 1; 178 | xn = 0; 179 | } else { 180 | pc_ix_Ex = -1; 181 | pc_ix_Ey = -1; 182 | pc_ix_Ez = -1; 183 | xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction. 184 | } 185 | } else { 186 | xn = -1;} 187 | 188 | // Load E-fields into shared memory. 189 | if (adj_dims) { 190 | // Load in p = r + beta * p. 
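// (More precisely, the quantity loaded below is p_new = r + beta * (p - omega * v),
//  the biCGSTAB search-direction update, fused here with the sqrt_s* diagonal
//  scaling and the Bloch/PEC/PMC boundary factors.)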
191 |         Ex_0[0] = bloch_phaseX_x * bloch_phaseYZ_x * pc_x_factor[0] * pc_yz_factor[0] *
192 |                   (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z)) *
193 |                   ( Rx(pc_ix_Ex, pc_iy_Ex, pc_iz_Ex)
194 |                     + beta * ( Px(pc_ix_Ex, pc_iy_Ex, pc_iz_Ex)
195 |                                - omega * Vx(pc_ix_Ex, pc_iy_Ex, pc_iz_Ex)));
196 |         Ey_0[0] = bloch_phaseX_y * bloch_phaseYZ_y * pc_x_factor[1] * pc_yz_factor[1] *
197 |                   (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z)) *
198 |                   ( Ry(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey)
199 |                     + beta * ( Py(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey)
200 |                                - omega * Vy(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey)));
201 |         Ez_0[0] = bloch_phaseX_z * bloch_phaseYZ_z * pc_x_factor[2] * pc_yz_factor[2] *
202 |                   (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z)) *
203 |                   ( Rz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez)
204 |                     + beta * ( Pz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez)
205 |                                - omega * Vz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez)));
206 | 
207 |         // Ey_p = Ry(0,0,0) + beta * (Py(0,0,0) - omega * Vy(0,0,0));
208 |         py_p = Ry(0, pc_iy_Ey, pc_iz_Ey)
209 |                + beta * (Py(0, pc_iy_Ey, pc_iz_Ey)
210 |                          - omega * Vy(0, pc_iy_Ey, pc_iz_Ey));
211 |         Ey_p = bloch_phaseYZ_y * pc_yz_factor[1] * (py_p) *
212 |                (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z));
213 | 
214 |         // Ez_p = Rz(0,0,0) + beta * (Pz(0,0,0) - omega * Vz(0,0,0));
215 |         pz_p = Rz(0, pc_iy_Ez, pc_iz_Ez)
216 |                + beta * (Pz(0, pc_iy_Ez, pc_iz_Ez)
217 |                          - omega * Vz(0, pc_iy_Ez, pc_iz_Ez));
218 |         Ez_p = bloch_phaseYZ_z * pc_yz_factor[2] * (pz_p) *
219 |                (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z));
220 |     }
221 |     __syncthreads();
222 | 
223 |     // Calculate H-fields and store in shared_memory.
224 |     // Hy.
225 |     if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
226 |         Hy_0[0] = my(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey) *
227 |                   (sx1(_X+xn) * (Ez_0[0] - Ez_p) - sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
228 |     }
229 | 
230 |     // Hz.
231 |     if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
232 |         Hz_0[0] = mz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez) *
233 |                   (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - sx1(_X+xn) * (Ey_0[0] - Ey_p));
234 |     }
235 |     __syncthreads();
236 | 
237 |     // Reset the PEMC factors and the ix indices.
238 |     pc_x_factor[0] = 1.0;
239 |     pc_x_factor[1] = 1.0;
240 |     pc_x_factor[2] = 1.0;
241 |     pc_ix_Ex = 0;
242 |     pc_ix_Ey = 1;
243 |     pc_ix_Ez = 1;
244 |     // Start the loop in the x direction.
245 |     for (; _X < _x_end ; _X += _txx) {
246 |         // We've moved ahead in X, so transfer appropriate field values.
247 |         Ey_0[0] = Ey_p;
248 |         Ez_0[0] = Ez_p;
249 |         Hy_n = Hy_0[0];
250 |         Hz_n = Hz_0[0];
251 | 
252 |         py = py_p;
253 |         pz = pz_p;
254 | 
255 |         // Load E-fields into shared memory.
256 |         if (_X == {{ dims[0]-1 }}){
257 |             if ( pemc(1) == 1 ) {
258 |                 // PEC
259 |                 pc_x_factor[1] = -1.0;
260 |                 pc_x_factor[2] = -1.0;
261 |                 pc_ix_Ex = -1;
262 |                 pc_ix_Ey = -1;
263 |                 pc_ix_Ez = -1;
264 |                 xp = 0;
265 |             } else if ( pemc(1) == 2 ) {
266 |                 // PMC
267 |                 pc_x_factor[0] = -1.0;
268 |                 pc_ix_Ex = -1;
269 |                 pc_ix_Ey = -1;
270 |                 pc_ix_Ez = -1;
271 |                 xp = 0;
272 |             } else {
273 |                 // bloch
274 |                 bloch_phaseX_x = conj(bloch_x(0));
275 |                 bloch_phaseX_y = conj(bloch_x(1));
276 |                 bloch_phaseX_z = conj(bloch_x(2));
277 |                 xp = {{ -(dims[0]-1) }}; // Wrap-around step in the negative direction.
278 |             }
279 |         } else {
280 |             xp = +1;
281 |             bloch_phaseX_x = 1;
282 |             bloch_phaseX_y = 1;
283 |             bloch_phaseX_z = 1;
284 |         }
285 | 
286 |         if (adj_dims) {
287 |             px = Rx(pc_ix_Ex,pc_iy_Ex,pc_iz_Ex) + beta * (
288 |                  Px(pc_ix_Ex,pc_iy_Ex,pc_iz_Ex) - omega * Vx(pc_ix_Ex,pc_iy_Ex,pc_iz_Ex));
289 |             Ex_0[0] = bloch_phaseYZ_x * pc_x_factor[0] * pc_yz_factor[0] *
290 |                       (px) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z));
291 | 
292 |             py_p = Ry(pc_ix_Ey,pc_iy_Ey,pc_iz_Ey) + beta * (
293 |                    Py(pc_ix_Ey,pc_iy_Ey,pc_iz_Ey) - omega * Vy(pc_ix_Ey,pc_iy_Ey,pc_iz_Ey));
294 |             Ey_p = bloch_phaseX_y * bloch_phaseYZ_y * pc_x_factor[1] * pc_yz_factor[1] *
295 |                    (py_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z));
296 | 
297 |             pz_p = Rz(pc_ix_Ez,pc_iy_Ez,pc_iz_Ez) + beta * (
298 |                    Pz(pc_ix_Ez,pc_iy_Ez,pc_iz_Ez) - omega * Vz(pc_ix_Ez,pc_iy_Ez,pc_iz_Ez));
299 |             Ez_p = bloch_phaseX_z * bloch_phaseYZ_z * pc_x_factor[2] * pc_yz_factor[2] *
300 |                    (pz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z));
301 |         }
302 | 
303 |         __syncthreads();
304 | 
305 |         // Calculate H-fields and store in shared_memory.
306 |         {% if mu_equals_1 == True %}
307 |         // Hx.
308 |         if ((_ty != _tyy) && (_tz != _tzz)) {
309 |             Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) -
310 |                        sy1(_Y) * (Ez_0[0] - Ez_0[s_yp]));
311 |         }
312 | 
313 |         // Hy.
314 |         if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
315 |             Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) -
316 |                        sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
317 |         }
318 | 
319 |         // Hz.
320 |         if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
321 |             Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) -
322 |                        sx1(_X) * (Ey_0[0] - Ey_p));
323 |         }
324 |         {% else %}
325 |         // Hx.
326 |         if ((_ty != _tyy) && (_tz != _tzz)) {
327 |             Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) -
328 |                                    sy1(_Y) * (Ez_0[0] - Ez_0[s_yp]));
329 |         }
330 | 
331 |         // Hy.
332 |         if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
333 |             Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) -
334 |                                    sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
335 |         }
336 | 
337 |         // Hz.
338 |         if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
339 |             Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) -
340 |                                    sx1(_X) * (Ey_0[0] - Ey_p));
341 |         }
342 |         {% endif %}
343 |         __syncthreads();
344 | 
345 |         // Write out the results.
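        // (The alpha_denom sum below accumulates the local part of
        //  r_hatH' * v; R_hatH appears to hold the already-conjugated shadow
        //  residual, so a plain product suffices. After the global reduction,
        //  the host forms alpha = rho / (r_hatH' * v) as in biCGSTAB.)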
346 | if (_in_global && _in_local) { 347 | {% if full_operator %} 348 | P1x(0,0,0) = px; 349 | P1y(0,0,0) = py; 350 | P1z(0,0,0) = pz; 351 | 352 | vx = ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) * 353 | (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn]) 354 | - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn]) 355 | - ex(0,0,0) * Ex_0[0])); 356 | vy = ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) * 357 | (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn]) 358 | - sx0(_X) * (Hz_0[0] - Hz_n) 359 | - ey(0,0,0) * Ey_0[0])); 360 | vz = ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) * 361 | (sx0(_X) * (Hy_0[0] - Hy_n) 362 | - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn]) 363 | - ez(0,0,0) * Ez_0[0])); 364 | 365 | V1x(0,0,0) = vx; 366 | V1y(0,0,0) = vy; 367 | V1z(0,0,0) = vz; 368 | 369 | alpha_denom += (R_hatHx(0,0,0) * vx) + (R_hatHy(0,0,0) * vy) + (R_hatHz(0,0,0) * vz); 370 | 371 | {% else %} 372 | V1x(0,0,0) = Hx_0[0]; 373 | V1y(0,0,0) = Hy_0[0]; 374 | V1z(0,0,0) = Hz_0[0]; 375 | 376 | {% endif %} 377 | } 378 | __syncthreads(); 379 | } 380 | -------------------------------------------------------------------------------- /maxwell-solver/fdfd.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import maxwell_ops_lumped 4 | from solvers import bicg 5 | from gce.grid import Grid 6 | from mpi4py.MPI import COMM_WORLD as comm 7 | import time, sys, tempfile, os 8 | 9 | from pycuda import driver 10 | 11 | 12 | def simulate(name, check_success_only=False): 13 | """ Read simulation from input file, simulate, and write out results. """ 14 | print_comm0('starting simulate') 15 | 16 | # Reset the environment variables pointing to the temporary directory. 17 | tempfile.tempdir = '/tmp' 18 | 19 | # Create the reporter function. 20 | write_status = lambda msg: open(name + '.status', 'a').write(msg) 21 | if comm.Get_rank() == 0: 22 | # write_status('EXEC initializing\n') 23 | def rep(err, info: str = None): 24 | if info is None: 25 | write_status('%e\n' % np.abs(err)) 26 | else: 27 | write_status('%s %e\n' % (info, np.abs(err))) 28 | else: # No reporting needed for non-root nodes. 29 | 30 | def rep(err, info: str = None): 31 | pass 32 | 33 | # Get input parameters. 34 | params = get_parameters(name) 35 | solver = params['solver'] 36 | if solver == 'CG': 37 | # Define operations needed for the CG operation.(bicg is CG) 38 | b, x, ops, post_cond, gpu_func = bicg.ops(params) 39 | # Solve! 40 | start_time = time.time() 41 | x, err, success, iters = bicg.solve_symm_lumped(b, x=x, \ 42 | max_iters=params['max_iters'], \ 43 | reporter=rep, \ 44 | err_thresh=params['err_thresh'], \ 45 | gpu_func=gpu_func, \ 46 | **ops) 47 | stop_time = time.time() 48 | 49 | elif solver == 'biCGSTAB': 50 | # Define operations needed for the biCGSTAB operation. 51 | b, x, r_hatH, ops, post_cond, gpu_func = bicg.ops_biCGSTAB(params) 52 | 53 | # Solve! 54 | start_time = time.time() 55 | x, err, success, iters = bicg.solve_asymm_biCGSTAB( b, r_hatH, x=x, \ 56 | max_iters=params['max_iters'], \ 57 | reporter=rep, \ 58 | err_thresh=params['err_thresh'], \ 59 | gpu_func=gpu_func,\ 60 | **ops) 61 | stop_time = time.time() 62 | 63 | elif solver == 'lgmres': 64 | from solvers import lgmres 65 | # Define operations needed for the lumped bicg operation. 66 | b, x, lgmres_functions, post_cond, gpu_func = lgmres.ops_lgmres(params) 67 | 68 | options = { 69 | 'maxiters': params['max_iters'], 70 | 'inner_m': 15, 71 | 'outer_k': 2, 72 | 'tol': params['err_thresh'] 73 | } 74 | # Solve! 
75 |         start_time = time.time()
76 |         x, err, success, iters = lgmres.solve_asymm_lgmres( b, x=x, \
77 |                                     reporter=rep, \
78 |                                     lgmres_func=lgmres_functions,\
79 |                                     options=options, \
80 |                                     gpu_func=gpu_func)
81 |         stop_time = time.time()
82 | 
83 |     elif solver == 'Jacobi-Davidson':
84 |         from solvers import lgmres, JacDav
85 |         # Check if x is all zeros and, if so, run an initial lgmres solve.
86 |         if not np.any(params['x']):
87 |             b, x, lgmres_functions, post_cond, gpu_func = lgmres.ops_lgmres(
88 |                 params)
89 |             options = {
90 |                 'maxiters': 300,  #params['max_iters'],
91 |                 'inner_m': 15,
92 |                 'outer_k': 2,
93 |                 'tol': 10 * params['err_thresh']
94 |             }
95 |             print_comm0('zero E0 - initial simulation needed')
96 |             x_start, err, success, iters = lgmres.solve_asymm_lgmres( b, x=x, \
97 |                                                 reporter=rep, \
98 |                                                 lgmres_func=lgmres_functions,\
99 |                                                 options=options, \
100 |                                                 gpu_func=gpu_func)
101 |             #b, x0, r_hatH, ops, post_cond, gpu_func = maxwell_ops_lumped.ops_lgmres(params)
102 |             #x_start, err, success, iters = bicg.solve_asymm_biCGSTAB( b, r_hatH, x=x0, \
103 |             #                           max_iters=params['max_iters'], \
104 |             #                           reporter=rep, \
105 |             #                           err_thresh=10*params['err_thresh'], \
106 |             #                           gpu_func=gpu_func,\
107 |             #                           **ops)
108 |             params['x'] = [
109 |                 E.get() for E in x_start
110 |             ]  # .get() pulls the data from the GPU and gathers it to the root.
111 |             if comm.Get_rank() == 0:
112 |                 params['x'] = post_cond(params['x'])  # Apply postconditioner.
113 |             #shp = params['x'][0].shape
114 |             #params['x']=[np.random.rand(shp[0], shp[1], shp[2]) for i in range(3)]
115 |             del x_start
116 |         else:
117 |             print_comm0('non-zero E0 - no initial simulation needed')
118 | 
119 |         # Undo the precomputation that was done on the permittivity (j and m are kept),
120 |         # i.e. undo eps = omega**2 * eps.
121 |         if comm.Get_rank() == 0:
122 |             for k in range(3):
123 |                 params['e'][k] = (params['omega']**(-2) * params['e'][k])
124 | 
125 |         # Define operations needed for the JacDav operation.
126 |         print_comm0('preparing solver')
127 |         t0, gpu_post_cond_eps_norm, post_cond, JacDav_func, gpu_func = \
128 |             JacDav.ops_JacDav(params)
129 | 
130 |         # Set the solver options.
131 |         options_JacDav = {
132 |             'maxiters': 100,
133 |             'n_eig': params['n_eig'],
134 |             'target': params['omega']**2,
135 |             'm_max': 40,
136 |             'm_min': 2,
137 |             'tol': params['err_thresh']
138 |         }
139 |         options_lgmres = {'maxiters': 25, 'inner_m': 15, 'outer_k': 3}
140 | 
141 |         # Solve!
142 |         start_time = time.time()
143 |         print_comm0('start solver')  # t cannot be 0
144 | 
145 |         q, Q, success, err, iters = \
146 |             JacDav.solve_eig_JacDav( t0 = t0, \
147 |                                      reporter = rep, \
148 |                                      JacDav_func = JacDav_func, \
149 |                                      options_lgmres = options_lgmres, \
150 |                                      options_JacDav = options_JacDav,
151 |                                      gpu_func = gpu_func)
152 |         stop_time = time.time()
153 |         print_comm0('time: ' + str(stop_time - start_time))
154 | 
155 |         # Remove the eps_norm.
156 |         for Qi in Q:
157 |             gpu_post_cond_eps_norm(Qi)
158 | 
159 |     if check_success_only:  # Don't write output, just see if we got a success.
160 |         return success
161 | 
162 |     # Gather results onto root's host memory.
163 | if solver == 'Jacobi-Davidson': 164 | Q_result = [[E.get() for E in x] for x in Q] 165 | result = { 'Q': Q_result, \ 166 | 'q': q, 167 | 'err': err, \ 168 | 'success': success, \ 169 | 'iters': iters, \ 170 | 'time': (stop_time-start_time)} 171 | else: 172 | result = { 'E': [E.get() for E in x], \ 173 | 'err': err, \ 174 | 'success': success, \ 175 | 'iters': iters, \ 176 | 'time': (stop_time-start_time)} 177 | print_comm0(result['time']) 178 | 179 | # Write results to output file. 180 | if comm.Get_rank() == 0: 181 | if solver == 'Jacobi-Davidson': 182 | for i in range(len(result['Q'])): 183 | result['Q'][i] = post_cond( 184 | result['Q'][i]) # Apply postconditioner 185 | else: 186 | result['E'] = post_cond(result['E']) # Apply postconditioner. 187 | write_results(name, result) 188 | 189 | return success 190 | 191 | 192 | def get_parameters(name): 193 | """ Reads the simulation parameters from the input hdf5 file. """ 194 | 195 | if comm.rank == 0: 196 | f = h5py.File(name + '.grid', 'r') 197 | files_to_delete = [name + '.grid'] 198 | 199 | omega = np.complex128(f['omega_r'][0] + 1j * f['omega_i'][0]) 200 | shape = tuple([int(s) for s in f['shape'][:]]) 201 | n_eig = int(f['n_eig'][0]) 202 | 203 | # bloch boundary conditions 204 | bloch_phase = f['bloch_phase'][...] 205 | 206 | # PEC or PMC boundary conditions 207 | pemc = f['pemc'][...].astype('int32') 208 | 209 | # get solver 210 | EM_solvers = ['CG', 'biCGSTAB', 'lgmres', 'Jacobi-Davidson'] 211 | solver = EM_solvers[f['solver'][...]] 212 | 213 | # Function used to read in a 1D complex vector fields. 214 | get_1D_fields = lambda a: [(f[a+'_'+u+'r'][:] + 1j * f[a+'_'+u+'i'][:]).\ 215 | astype(np.complex128) for u in 'xyz'] 216 | 217 | # Read in s and t vectors. 218 | s = get_1D_fields('sp') 219 | t = get_1D_fields('sd') 220 | 221 | # Read in max_iters and err_thresh. 222 | max_iters = int(f['max_iters'][0]) 223 | err_thresh = float(f['err_thresh'][0]) 224 | 225 | # Function used to read in 3D complex vector fields. 226 | def get_3D_fields(a): 227 | field = [] 228 | # Check if field data all in one HDF5 file. 229 | if (a + '_xr') in f: 230 | for k in range(3): 231 | key = a + '_' + 'xyz' [k] 232 | field.append( 233 | (f[key + 'r'][:] + 1j * f[key + 'i'][:]).astype( 234 | np.complex128)) 235 | return field 236 | 237 | for k in range(3): 238 | key = name + '.' + a + '_' + 'xyz' [k] 239 | field.append((h5py.File(key + 'r')['data'][:] + \ 240 | 1j * h5py.File(key + 'i')['data'][:]).astype(np.complex128)) 241 | files_to_delete.append(key + 'r') 242 | files_to_delete.append(key + 'i') 243 | return field 244 | 245 | e = get_3D_fields('e') # Permittivity (eps). 246 | j = get_3D_fields('J') # Current source. 247 | m = get_3D_fields('m') # Permeability (mu). 248 | x = get_3D_fields('A') # Initial fields (E0). 249 | 250 | f.close() # Close file. 251 | 252 | # Delete input files. 253 | for filename in files_to_delete: 254 | os.remove(filename) 255 | 256 | # Do some simple pre-computation. 
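        # (Assuming the usual FDFD convention, this scaling casts the problem
        #  as the E-field wave equation
        #      (curl mu^-1 curl - omega^2 eps) E = -i omega J,
        #  i.e. mu is inverted, eps is multiplied by omega^2, and the
        #  -i*omega source factor is folded into J.)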
257 | for k in range(3): 258 | m[k] = m[k]**-1 259 | e[k] = omega**2 * e[k] 260 | j[k] = -1j * omega * j[k] 261 | 262 | params = {'omega': omega, 'shape': shape, 'n_eig': n_eig,\ 263 | 'max_iters': max_iters, 'err_thresh': err_thresh, \ 264 | 's': s, 't': t, 'bloch_phase': bloch_phase, \ 265 | 'pemc': pemc, 'solver': solver} 266 | else: 267 | params = None 268 | 269 | params = comm.bcast(params) 270 | 271 | if comm.rank == 0: 272 | params['e'] = e 273 | params['m'] = m 274 | params['j'] = j 275 | params['x'] = x 276 | else: 277 | for field_name in 'emjx': 278 | params[field_name] = [None] * 3 279 | 280 | return params 281 | 282 | 283 | def write_results(name, result): 284 | """ Write out the results to an hdf5 file. """ 285 | 286 | my_write = lambda fieldname, data: h5py.File(name + '.' + fieldname, 'w').\ 287 | create_dataset('data', data=data) 288 | 289 | if 'q' in list(result.keys()): 290 | my_write('iter_info', np.array([result['iters']]).astype(np.float32)) 291 | my_write('time_info', np.array([result['time']]).astype(np.float32)) 292 | my_write('qr', np.real(np.array([result['q']])).astype(np.float32)) 293 | my_write('qi', np.imag(np.array([result['q']])).astype(np.float32)) 294 | 295 | # Write out the datasets. 296 | for i in range(len(result['q'])): 297 | for k in range(3): 298 | my_write('Q' + str(i) + '_' + 'xyz'[k] + 'r', \ 299 | np.real(result['Q'][i][k]).astype(np.float32)) 300 | my_write('Q' + str(i)+ '_' + 'xyz'[k] + 'i', \ 301 | np.imag(result['Q'][i][k]).astype(np.float32)) 302 | my_write = lambda fieldname, data: h5py.File(name + '.' + fieldname, 'w').\ 303 | create_dataset('data', data=data) 304 | else: 305 | my_write('iter_info', np.array([result['iters']]).astype(np.float32)) 306 | my_write('time_info', np.array([result['time']]).astype(np.float32)) 307 | 308 | # Write out the datasets. 309 | for k in range(3): 310 | my_write('E_' + 'xyz'[k] + 'r', \ 311 | np.real(result['E'][k]).astype(np.float32)) 312 | my_write('E_' + 'xyz'[k] + 'i', \ 313 | np.imag(result['E'][k]).astype(np.float32)) 314 | 315 | 316 | def print_comm0(txt: str): 317 | if comm.Get_rank() == 0: 318 | print(txt) 319 | 320 | 321 | if __name__ == '__main__': # Allows calls from command line. 322 | if comm.rank == 0: 323 | print('start in main') 324 | simulate(sys.argv[1]) # Specify name of the job. 325 | -------------------------------------------------------------------------------- /maxwell-solver/gce/kernel.py: -------------------------------------------------------------------------------- 1 | """ Defines the Kernel class for GCE. """ 2 | from pycuda import compiler 3 | from pycuda import driver as drv 4 | from jinja2 import Environment, PackageLoader 5 | from gce.space import get_space_info 6 | from gce.out import batch_reduce 7 | import numpy as np 8 | from mpi4py.MPI import COMM_WORLD as comm 9 | 10 | # Load the jinja environment when the module is loaded. 11 | _template_file = 'kernel.cu' 12 | _jinja_env = Environment(loader=PackageLoader(__name__, '.')) 13 | 14 | 15 | class Kernel: 16 | """ Create an executable kernel for GCE. 17 | 18 | A Kernel executable allows for the modification of Grid objects and the 19 | computation of Outs. Kernels accept Grid, Const, Out, and certain numpy 20 | scalar objects as their input. 21 | 22 | Kernels work by traversing the 3D space in the x-direction and executing 23 | user-specified cuda code at every grid point. For more information on the 24 | conventions and available tools for defining Kernels, please see the 25 | KERNEL_DOC file. 
26 | 
27 |     Additionally, Kernels will self-optimize runtime parameters. Such
28 |     parameters currently include only the block size.
29 | 
30 |     Methods:
31 |     __init__ -- Define the executable kernel.
32 |     __call__ -- Execute the kernel.
33 | 
34 |     Example usage:
35 |         fun = Kernel(code, ('x', 'grid', np.float32), ('y', 'grid', np.float32))
36 |         fun(x, y)
37 |     """
38 | 
39 |     def __init__(self, code, *vars, **kwargs):
40 |         """ Prepare a cuda function that will execute on the GCE space.
41 | 
42 |         Input variables:
43 |         code -- The looped cuda code to be executed.
44 |         vars -- (name, gce_type, numpy_type) of the input arguments.
45 | 
46 |         Keyword variables:
47 |         pre_loop -- Cuda code that is executed before the loop code.
48 |         shape_filter -- Can be either 'all', 'skinny', or 'square'.
49 |         padding -- (yn, yp, zn, zp), describes the number of "extra" threads
50 |             to be run on the border of each thread block.
51 |         smem_per_thread -- Number of bytes of shared memory needed by a thread.
52 |         """
53 | 
54 |         # Make sure there are no extraneous keyword arguments.
55 |         if any([key not in \
56 |                 ('pre_loop', 'shape_filter', 'padding', 'smem_per_thread')
57 |                 for key in kwargs.keys()]):
58 |             raise TypeError('Invalid key used.')
59 | 
60 |         # Process keyword arguments.
61 |         pre_code = kwargs.get('pre_loop', '')
62 |         shape_filter = kwargs.get('shape_filter', 'skinny')
63 |         padding = kwargs.get('padding', (0, 0, 0, 0))
64 |         smem_per_thread = kwargs.get('smem_per_thread', 0)
65 | 
66 |         # Dictionary for conversion from numpy to cuda types.
67 |         cuda_types = {np.float32: 'float', np.float64: 'double', \
68 |                       np.int32: 'int', \
69 |                       np.complex64: 'pycuda::complex<float>', \
70 |                       np.complex128: 'pycuda::complex<double>'}
71 |         # Dictionary for conversion from numpy to alternate type for Consts.
72 |         alt_types = {np.float32: 'float', np.float64: 'double', \
73 |                      np.complex64: 'float2', np.complex128: 'double2'}
74 | 
75 |         # Process vars.
76 |         params = [{'name': v[0], \
77 |                    'gce_type': v[1], \
78 |                    'dtype': v[2], \
79 |                    'cuda_type': cuda_types[v[2]]} for v in vars]
80 | 
81 |         # Get the template and render it using jinja2.
82 |         shape = get_space_info()['shape'] # Shape of the space.
83 |         template = _jinja_env.get_template(_template_file)
84 |         cuda_source = template.render( params=params, \
85 |                                        padding=padding, \
86 |                                        dims=get_space_info()['shape'], \
87 |                                        x_range=get_space_info()['x_range'], \
88 |                                        preloop_code=pre_code, \
89 |                                        loop_code=code, \
90 |                                        flat_tag='_f')
91 | 
92 |         # Compile the code into a callable cuda function.
93 |         mod = compiler.SourceModule(cuda_source)
94 |         # mod = compiler.SourceModule(cuda_source, options=['-Xptxas', '-dlcm=cg']) # Global skips L1 cache.
95 |         self.fun = mod.get_function('_gce_kernel')
96 | 
97 |         # Prefer 48KB of L1 cache when possible.
98 |         self.fun.set_cache_config(drv.func_cache.PREFER_L1)
99 | 
100 |         # Get address of global variable in module.
101 |         # Note: contains a work-around for problems with complex types.
102 |         my_get_global = lambda name: mod.get_global('_' + name + '_temp')
103 | 
104 |         # Useful information about the kernel.
105 |         self._kernel_info = {'max_threads': self.fun.max_threads_per_block, \
106 |                              'const_bytes': self.fun.const_size_bytes, \
107 |                              'local_bytes': self.fun.local_size_bytes, \
108 |                              'num_regs': self.fun.num_regs}
109 | 
110 |         # Get some valid execution configurations.
111 |         self.exec_configs = self._get_exec_configs( \
112 |                                 self.fun.max_threads_per_block, \
113 |                                 padding, smem_per_thread, shape_filter)
114 | 
115 |         # Prepare the function by telling pycuda the types of the inputs.
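        # (The two leading int32 arguments in the prepared signature are the
        #  x_start and x_end of the loop range; execute_range() below supplies
        #  them on every launch.)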
116 |         arg_types = []
117 |         for p in params:
118 |             if p['gce_type'] == 'number':
119 |                 arg_types.append(p['dtype'])
120 |             # elif p['gce_type'] == 'const':
121 |             #     arg_types.append(p['dtype'])
122 |             #     # pass # Consts don't actually get passed in.
123 |             else:
124 |                 arg_types.append(np.intp)
125 |         self.fun.prepare([np.int32, np.int32] + arg_types)
126 | 
127 |         # Define the function which we will use to execute the kernel.
128 |         # TODO: Make a shortcut version with lower overhead.
129 |         # Used for asynchronous execution and timing.
130 |         stream = drv.Stream()
131 |         start, start2, pad_done, sync_done, comp_done, all_done = \
132 |             [drv.Event() for k in range(6)]
133 | 
134 |         # Kernel execution over a range of x-values.
135 |         def execute_range(x_start, x_end, gpu_params, cfg, stream):
136 |             """ Defines asynchronous kernel execution for a range of x. """
137 |             self.fun.prepared_async_call( \
138 |                 cfg['grid_shape'][::-1], \
139 |                 cfg['block_shape'][::-1] + (1,), \
140 |                 stream, \
141 |                 *([np.int32(x_start), np.int32(x_end)] + gpu_params), \
142 |                 shared_size=cfg['smem_size'])
143 | 
144 |         x_start, x_end = get_space_info()['x_range'] # This node's range.
145 | 
146 |         def execute(cfg, *args, **kwargs):
147 | 
148 |             # Parse keyword arguments.
149 |             post_sync_grids = kwargs.get('post_sync', None)
150 | 
151 |             # Parse the inputs.
152 |             gpu_params = []
153 |             for k in range(len(params)):
154 |                 if params[k]['gce_type'] == 'number':
155 |                     gpu_params.append(params[k]['dtype'](args[k]))
156 |                 elif params[k]['gce_type'] == 'const': # Load Const.
157 |                     gpu_params.append(args[k].data.ptr)
158 |                     # Const no longer actually "const" in cuda code.
159 | 
160 | 
161 |                     # d_ptr, size_in_bytes = my_get_global(params[k]['name'])
162 |                     # drv.memcpy_dtod(d_ptr, args[k].data.gpudata, size_in_bytes)
163 |                 elif params[k]['gce_type'] == 'grid':
164 |                     if args[k]._xlap == 0:
165 |                         gpu_params.append(args[k].data.ptr)
166 |                     else:
167 |                         gpu_params.append(args[k].data.ptr + \
168 |                                           args[k]._xlap_offset)
169 |                 elif params[k]['gce_type'] == 'out':
170 |                     args[k].data.fill(args[k].dtype(0)) # Initialize the Out.
171 |                     gpu_params.append(args[k].data.ptr)
172 |                 else:
173 |                     raise TypeError('Invalid input type.')
174 | 
175 |             # See if we need to synchronize grids after kernel execution.
176 |             if post_sync_grids is None:
177 |                 sync_pad = 0
178 |             else:
179 |                 sync_pad = max([g._xlap for g in post_sync_grids])
180 | 
181 |             start2.record(stream)
182 |             comm.Barrier()
183 |             start.record(stream)
184 | 
185 |             # Execute kernel in padded regions first.
186 |             execute_range(x_start, x_start + sync_pad, gpu_params, cfg, stream)
187 |             execute_range(x_end - sync_pad, x_end, gpu_params, cfg, stream)
188 |             pad_done.record(stream) # Just for timing purposes.
189 |             stream.synchronize() # Wait for execution to finish.
190 | 
191 |             # Begin kernel execution in remaining "core" region.
192 |             execute_range(x_start + sync_pad, x_end - sync_pad, gpu_params,
193 |                           cfg, stream)
194 |             comp_done.record(stream) # Timing only.
195 | 
196 |             # While core kernel is executing, perform synchronization.
197 |             if post_sync_grids is not None: # Synchronization needed.
198 |                 for grid in post_sync_grids:
199 |                     grid.synchronize_start() # Start synchronization.
200 | 
201 |                 # Keep on checking until everything is done.
202 |                 while not (all([grid.synchronize_isdone() \
203 |                                 for grid in post_sync_grids]) and \
204 |                            stream.is_done()):
205 |                     pass
206 | 
207 |             else: # Nothing to synchronize.
208 |                 stream.synchronize() # Just wait for execution to finish.
209 | 
210 |             sync_done.record() # Timing.
211 | 
212 |             # Obtain the result for all Outs.
213 |             batch_reduce(*[args[k] for k in range(len(params)) \
214 |                            if params[k]['gce_type'] == 'out'])
215 |             all_done.record() # Timing.
216 |             all_done.synchronize()
217 | 
218 |             return comp_done.time_since(
219 |                 start) # Return time needed to execute the function.
220 | 
221 |         self.execute = execute # Save execution function in Kernel instance.
222 |         self.min_exec_time = float('inf') # Stores the fastest execution time.
223 | 
224 |     def __call__(self, *args, **kwargs):
225 |         """ Execute the kernel.
226 | 
227 |         Each valid execution configuration will be tried once, and then the
228 |         fastest configuration will be used for all remaining calls.
229 |         """
230 |         if self.exec_configs: # As long as list is not empty, choose from list.
231 |             cfg = self.exec_configs.pop() # Choose execution configuration.
232 | 
233 |             # Execute.
234 |             exec_time = self.execute(cfg, *args, **kwargs)
235 | 
236 |             # Check if this was the fastest execution to-date.
237 |             if exec_time < self.min_exec_time: # Found a new fastest config.
238 |                 self.min_exec_time = exec_time
239 |                 self.fastest_cfg = cfg
240 | 
241 |         else: # If config list empty, go with the fastest configuration found.
242 |             cfg = self.fastest_cfg
243 |             exec_time = self.execute(cfg, *args, **kwargs)
244 | 
245 |         # Return results.
246 |         return exec_time, cfg
247 | 
248 |     def _get_exec_configs(self, threads_max, padding, smem_per_thread, \
249 |                           shape_filter):
250 |         """ Find all valid execution configurations. """
251 | 
252 |         # Padding of the kernel.
253 |         y_pad = sum(padding[0:2])
254 |         z_pad = sum(padding[2:4])
255 | 
256 |         # Shared memory requirements.
257 |         smem_size = lambda b_shape: smem_per_thread * \
258 |                                     (b_shape[0] * b_shape[1])
259 | 
260 |         # The kind of shapes that we are interested in.
261 |         if shape_filter == 'skinny': # Only z-dominant shapes.
262 |             my_filter = lambda b_shape: (b_shape[0] < b_shape[1]) and \
263 |                         (b_shape[1] > 8) and ((b_shape[1] % 16) == 0)
264 |         elif shape_filter == 'square': # Only square-ish shapes.
265 |             my_filter = lambda b_shape: (b_shape[0] < 2 * b_shape[1]) and \
266 |                                         (b_shape[1] < 2 * b_shape[0]) and \
267 |                                         (b_shape[0] > 8) and \
268 |                                         (b_shape[1] > 8)
269 |         elif shape_filter == 'all': # All shapes okay.
270 |             my_filter = lambda b_shape: b_shape[1] > 1 # Must be greater than 1.
271 |         else:
272 |             raise TypeError('Unrecognized shape filter.')
273 | 
274 |         # Function defining valid block shapes.
275 |         smem_max = get_space_info()['max_shared_mem']
276 |         is_valid_shape = lambda b_shape: (smem_size(b_shape) < smem_max) and \
277 |                                          my_filter(b_shape) and \
278 |                                          (b_shape[0] * b_shape[1]) <= \
279 |                                          threads_max
280 | 
281 |         # Create a list of all valid block shapes.
282 |         valid_block_shapes = []
283 |         z_max = get_space_info()['max_block_z']
284 |         y_max = get_space_info()['max_block_y']
285 |         for j in range(y_pad + 1, y_max + 1):
286 |             for k in range(z_pad + 1, z_max + 1):
287 |                 if is_valid_shape((j, k)):
288 |                     valid_block_shapes.append((j,
289 |                                                k)) # Block shape is (yy,zz).
290 | 
291 |         # A hack for profiling
292 |         # valid_block_shapes = ((31,16),)
293 |         # valid_block_shapes = ((17,22),)
294 | 
295 |         if not valid_block_shapes: # Make sure the list is not empty.
296 |             raise TypeError('No valid shapes found.')
297 | 
298 |         # Create a list of all possible execution configurations.
299 |         # Note that the convention for both block_shape and grid_shape is
300 |         # (yy,zz). Among other things, this leads to the (slightly)
301 |         # tricky computation of grid_shape.
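        # (Worked example: with sp_shape[1] = 100, block shape vbs = (16, 32),
        #  and y_pad = 2, each block covers 16 - 2 = 14 interior rows, so
        #  grid_shape[0] = (100 - 1) // 14 + 1 = 8 blocks tile the y-axis.)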
302 |         sp_shape = get_space_info()['shape'] # Shape of the space.
303 |         return [{ 'block_shape': vbs, \
304 |                   'grid_shape': (int((sp_shape[1]-1)/(vbs[0]-y_pad)) + 1, \
305 |                                  int((sp_shape[2]-1)/(vbs[1]-z_pad)) + 1), \
306 |                   'smem_size': smem_size(vbs)}
307 |                 for vbs in valid_block_shapes]
--------------------------------------------------------------------------------
/maxwell-solver/maxwell_ops_lumped.py:
--------------------------------------------------------------------------------
1 | """ Implements the operations needed to solve Maxwell's equations in 3D. """
2 | 
3 | import numpy as np
4 | import copy
5 | from jinja2 import Environment, PackageLoader, Template
6 | from gce.space import initialize_space, get_space_info
7 | from gce.grid import Grid
8 | from gce.const import Const
9 | from gce.out import Out
10 | from gce.kernel import Kernel
11 | from typing import List
12 | from mpi4py.MPI import COMM_WORLD as comm
13 | 
14 | # Execute when module is loaded.
15 | # Load the jinja environment.
16 | jinja_env = Environment(loader=PackageLoader(__name__, 'kernels'))
17 | 
18 | 
19 | def conditioners(params, dtype):
20 |     """ Form the functions for both the preconditioner and postconditioner. """
21 | 
22 |     #
23 |     # # Code for the post step function.
24 |     # code = """
25 |     # if (_in_global) {
26 |     #     Ex(0,0,0) *= tx1(_X) * ty0(_Y) * tz0(_Z);
27 |     #     Ey(0,0,0) *= tx0(_X) * ty1(_Y) * tz0(_Z);
28 |     #     Ez(0,0,0) *= tx0(_X) * ty0(_Y) * tz1(_Z);
29 |     # } """
30 |     def reshaper(f):
31 |         for k in range(3):
32 |             new_shape = [1, 1, 1]
33 |             new_shape[k] = f[k].size
34 |             f[k] = f[k].reshape(new_shape)
35 |         return f
36 | 
37 |     # Consts that are used.
38 |     sqrt_sc_pml_0 = reshaper([dtype(np.sqrt(s)**1) for s in params['s']])
39 |     sqrt_sc_pml_1 = reshaper([dtype(np.sqrt(t)**1) for t in params['t']])
40 |     inv_sqrt_sc_pml_0 = reshaper([dtype(np.sqrt(s)**-1) for s in params['s']])
41 |     inv_sqrt_sc_pml_1 = reshaper([dtype(np.sqrt(t)**-1) for t in params['t']])
42 | 
43 |     # Define the actual functions.
44 | 
45 |     def apply_cond(x, t0, t1):
46 |         x[0] *= t1[0] * t0[1] * t0[2]
47 |         x[1] *= t0[0] * t1[1] * t0[2]
48 |         x[2] *= t0[0] * t0[1] * t1[2]
49 |         return x
50 | 
51 |     def pre_step(x):
52 |         return apply_cond(x, sqrt_sc_pml_0, sqrt_sc_pml_1)
53 | 
54 |     def post_step(x):
55 |         return apply_cond(x, inv_sqrt_sc_pml_0, inv_sqrt_sc_pml_1)
56 | 
57 |     return pre_step, post_step
58 | 
59 | 
60 | def _get_cuda_type(dtype):
61 |     """ Convert numpy type into cuda type. """
62 |     if dtype is np.complex64:
63 |         return 'pycuda::complex<float>'
64 |     elif dtype is np.complex128:
65 |         return 'pycuda::complex<double>'
66 |     else:
67 |         raise TypeError('Invalid dtype.')
68 | 
69 | 
70 | # GPU operations
71 | #---------
72 | def make_gpu_copy(dtype):
73 |     """ Returns a function that does B=A """
74 |     # Kernel code for the copy operation.
75 |     code = Template("""
76 |         if (_in_global) {
77 |             Bx(0,0,0) = Ax(0,0,0);
78 |             By(0,0,0) = Ay(0,0,0);
79 |             Bz(0,0,0) = Az(0,0,0);
80 |         } """).render(type=_get_cuda_type(dtype))
81 | 
82 |     # Compile the code using gce.Kernel
83 |     grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
84 |     copy_fun = Kernel(code, \
85 |                       *[(name, 'grid', dtype) for name in grid_names], \
86 |                       shape_filter='skinny')
87 | 
88 |     # Define the actual function.
89 |     def gpu_copy(A, B):
90 |         copy_fun( \
91 |             *( A + B), \
92 |             post_sync=B) # B must be post-synced before later kernels read it.
93 | 
94 |     return gpu_copy
95 | 
96 | 
97 | def make_gpu_norm(dtype):
98 |     """ Returns a function c=vec_norm(A) that does c=sqrt(A'A) """
99 |     # GPU Code in gce.kernel.
100 |     code = Template("""
101 |         if (_in_global) {
102 |             norm_a += conj(Ax(0,0,0))*Ax(0,0,0);
103 |             norm_a += conj(Ay(0,0,0))*Ay(0,0,0);
104 |             norm_a += conj(Az(0,0,0))*Az(0,0,0);
105 |         } """).render(type=_get_cuda_type(dtype))
106 | 
107 |     # Compile the code using gce.Kernel
108 |     grid_names = [A + i for A in ['A'] for i in ['x', 'y', 'z']]
109 |     prod_fun = Kernel(code, \
110 |                       ('norm_a', 'out', dtype), \
111 |                       *[(name, 'grid', dtype) for name in grid_names], \
112 |                       shape_filter='skinny')
113 |     norm_a = Out(dtype)
114 | 
115 |     # Define the actual function.
116 |     def gpu_norm(A):
117 |         prod_fun( norm_a,\
118 |                   *( A )) # No post_sync needed: A is only read.
119 |         return np.sqrt(norm_a.get())
120 | 
121 |     return gpu_norm
122 | 
123 | 
124 | def make_gpu_scale(dtype):
125 |     """ Returns a function scale(A, a) that does A=aA """
126 |     # Kernel code for the scaling operation.
127 |     code = Template("""
128 |         if (_in_global) {
129 |             Ax(0,0,0) = a*Ax(0,0,0);
130 |             Ay(0,0,0) = a*Ay(0,0,0);
131 |             Az(0,0,0) = a*Az(0,0,0);
132 |         } """).render(type=_get_cuda_type(dtype))
133 | 
134 |     # Compile the code using gce.Kernel
135 |     grid_names = [A + i for A in ['A'] for i in ['x', 'y', 'z']]
136 |     Sum_fun = Kernel(code, \
137 |                      ('a', 'number', dtype), \
138 |                      *[(name, 'grid', dtype) for name in grid_names], \
139 |                      shape_filter='skinny')
140 | 
141 |     # Define the actual function.
142 |     def gpu_scale(A, a):
143 |         Sum_fun(dtype(a), \
144 |                 *( A ), \
145 |                 post_sync=A) # A must be post-synced before later kernels read it.
146 | 
147 |     return gpu_scale
148 | 
149 | 
150 | def make_gpu_conj(dtype):
151 |     """ Returns a function that does B=conj(A) """
152 |     # Kernel code for the conjugation operation.
153 |     code = Template("""
154 |         if (_in_global) {
155 |             Bx(0,0,0) = conj(Ax(0,0,0));
156 |             By(0,0,0) = conj(Ay(0,0,0));
157 |             Bz(0,0,0) = conj(Az(0,0,0));
158 |         } """).render(type=_get_cuda_type(dtype))
159 | 
160 |     # Compile the code using gce.Kernel
161 |     grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
162 |     conj_fun = Kernel(code, \
163 |                       *[(name, 'grid', dtype) for name in grid_names], \
164 |                       shape_filter='skinny')
165 | 
166 |     # Define the actual function.
167 |     def gpu_conj(A, B):
168 |         conj_fun( \
169 |             *( A + B), \
170 |             post_sync=B) # B must be post-synced before later kernels read it.
171 | 
172 |     return gpu_conj
173 | 
174 | 
175 | def make_gpu_dot(dtype):
176 |     """ Returns a function c=vec_dot(A, B) that does c=A'B """
177 |     # GPU Code in gce.kernel.
178 |     code = Template("""
179 |         if (_in_global) {
180 |             dot_ab += conj(Ax(0,0,0))*Bx(0,0,0);
181 |             dot_ab += conj(Ay(0,0,0))*By(0,0,0);
182 |             dot_ab += conj(Az(0,0,0))*Bz(0,0,0);
183 |         } """).render(type=_get_cuda_type(dtype))
184 | 
185 |     # Compile the code using gce.Kernel
186 |     grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
187 |     prod_fun = Kernel(code, \
188 |                       ('dot_ab', 'out', dtype), \
189 |                       *[(name, 'grid', dtype) for name in grid_names], \
190 |                       shape_filter='skinny')
191 | 
192 |     dot_ab = Out(dtype)
193 | 
194 |     # Define the actual function.
195 |     def gpu_dot(A, B):
196 |         prod_fun( dot_ab,\
197 |                   *( A + B))
198 |         return dot_ab.get()
199 | 
200 |     return gpu_dot
201 | 
202 | 
203 | def make_gpu_addvec(dtype):
204 |     """ Returns a function vec_addvec(A, b, B) that does A=A+bB """
205 |     # GPU Code in gce.Kernel
206 |     code = Template("""
207 |         if (_in_global) {
208 |             Ax(0,0,0) = Ax(0,0,0) + b*Bx(0,0,0);
209 |             Ay(0,0,0) = Ay(0,0,0) + b*By(0,0,0);
210 |             Az(0,0,0) = Az(0,0,0) + b*Bz(0,0,0);
211 |         } """).render(type=_get_cuda_type(dtype))
212 | 
213 |     # Compile the code using gce.Kernel
214 |     grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
215 |     Sum_fun = Kernel(code, \
216 |                      ('b', 'number', dtype), \
217 |                      *[(name, 'grid', dtype) for name in grid_names], \
218 |                      shape_filter='skinny')
219 | 
220 |     # Define the actual function.
221 |     def gpu_addvec(A, b, B):
222 |         Sum_fun( dtype(b), \
223 |                  *( A + B ), \
224 |                  post_sync=A) # A must be post-synced before later kernels read it.
225 | 
226 |     return gpu_addvec
227 | 
228 | 
229 | def make_gpu_scaled_copy(dtype):
230 |     """ Returns a function vec_scaled_copy(A, a, B) that does B=aA """
231 |     # GPU code for the Kernel
232 |     code = Template("""
233 |         if (_in_global) {
234 |             Bx(0,0,0) = a*Ax(0,0,0);
235 |             By(0,0,0) = a*Ay(0,0,0);
236 |             Bz(0,0,0) = a*Az(0,0,0);
237 |         } """).render(type=_get_cuda_type(dtype))
238 | 
239 |     # Compile the code using gce.Kernel
240 |     grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
241 |     Sum_fun = Kernel( code, \
242 |                       ('a', 'number', dtype), \
243 |                       *[(name, 'grid', dtype) for name in grid_names], \
244 |                       shape_filter='skinny')
245 | 
246 |     # Define the actual function.
247 |     def gpu_scaled_copy(A, a, B):
248 |         Sum_fun(dtype(a), \
249 |                 *( A + B ), \
250 |                 post_sync=B)
251 | 
252 |     return gpu_scaled_copy
253 | 
254 | 
255 | def make_gpu_sum(dtype):
256 |     """ Returns a function that does aA+bB=C """
257 |     # Kernel code for the weighted-sum operation.
258 |     code = Template("""
259 |         if (_in_global) {
260 |             Cx(0,0,0) = a*Ax(0,0,0) + b*Bx(0,0,0);
261 |             Cy(0,0,0) = a*Ay(0,0,0) + b*By(0,0,0);
262 |             Cz(0,0,0) = a*Az(0,0,0) + b*Bz(0,0,0);
263 |         } """).render(type=_get_cuda_type(dtype))
264 | 
265 |     # Compile the code using gce.Kernel
266 |     grid_names = [A + i for A in ['A', 'B', 'C'] for i in ['x', 'y', 'z']]
267 |     Sum_fun = Kernel(code, \
268 |                      ('a', 'number', dtype), \
269 |                      ('b', 'number', dtype), \
270 |                      *[(name, 'grid', dtype) for name in grid_names], \
271 |                      shape_filter='skinny')
272 | 
273 |     # Define the actual function.
274 |     def gpu_sum(a, b, A, B, C):
275 |         Sum_fun(dtype(a), dtype(b), \
276 |                 *( A + B + C), \
277 |                 post_sync=C) # C must be post-synced before later kernels read it.
278 | 
279 |     return gpu_sum
280 | 
281 | 
282 | def make_gpu_weighted_sum(dtype):
283 |     """ Return weighted sum function """
284 |     # Returns function vec_weighted_sum(V,y,U) that will do:
285 |     #     U = y1*V1 + y2*V2 + ... + yn*Vn
286 |     # Note: U must not be one of the vectors in V!
287 |     gpu_scaled_copy = make_gpu_scaled_copy(dtype)
288 |     gpu_addvec = make_gpu_addvec(dtype)
289 | 
290 |     def gpu_weighted_sum(L: List[Grid], y: np.ndarray, A):
291 |         gpu_scaled_copy(L[0], y[0], A)
292 |         for i in range(1, len(y)):
293 |             gpu_addvec(A, y[i], L[i])
294 | 
295 |     return gpu_weighted_sum
296 | 
297 | 
298 | def make_gpu_fdfd_residual(params, dtype):
299 |     """ Return function get_residual(X, B, R) that will do R = B - AX """
300 | 
301 |     ### WARNING: this code has not yet been adapted to the changes needed
302 |     ### to make biCGSTAB work.
303 | 
304 |     num_shared_banks = 6 # TODO Dries: does this need to be increased?
305 | 
306 |     # Render the pre-loop and in-loop code.
307 |     cuda_type = _get_cuda_type(dtype)
308 |     code_allpre = jinja_env.get_template('fdfd_residual_pec_pmc.cu').\
309 |                     render(dims=params['shape'], \
310 |                            type=cuda_type, \
311 |                            mu_equals_1=False, \
312 |                            full_operator=True)
313 | 
314 |     # Grid input parameters.
315 |     grid_params = [(A + i, 'grid', dtype) for A in ['X', 'B', 'R', 'e', 'm'] \
316 |                    for i in ['x', 'y', 'z']]
317 | 
318 |     # Const input parameters.
319 |     const_names = ('sx0', 'sy0', 'sz0', 'sx1', 'sy1', 'sz1') + \
320 |                   ('sqrt_sx0', 'sqrt_sy0', 'sqrt_sz0', \
321 |                    'sqrt_sx1', 'sqrt_sy1', 'sqrt_sz1') + \
322 |                   ('bloch_x', 'bloch_y', 'bloch_z')
323 |     const_sizes = params['shape'] * 4 + tuple([3]) * 3
324 |     const_params = [(const_names[k], 'const', dtype, const_sizes[k]) \
325 |                     for k in range(len(const_sizes))]
326 |     const_params.append(('pemc', 'const', params['pemc'].dtype.type, 6))
327 | 
328 |     # Compile. (note shape_filter = 'square')
329 |     residual_fun = Kernel('', \
330 |                           *(grid_params + const_params), \
331 |                           pre_loop=code_allpre, \
332 |                           padding=(1,1,1,1), \
333 |                           smem_per_thread=num_shared_banks*16, \
334 |                           shape_filter='square')
335 | 
336 |     # Temporary variables.
337 | 
338 |     # Grid variables.
339 |     # !!!!! eps is scattered over the GPUs when e is initialised here
340 |     e = [Grid(dtype(f), x_overlap=1) for f in params['e']]
341 |     m = [Grid(dtype(f), x_overlap=1) for f in params['m']] # Optional.
342 | 
343 |     # Constant variables.
344 |     sc_pml_0 = [Const(dtype(s**-1)) for s in params['s']]
345 |     sc_pml_1 = [Const(dtype(t**-1)) for t in params['t']]
346 |     sqrt_sc_pml_0 = [Const(dtype(np.sqrt(s**-1))) for s in params['s']]
347 |     sqrt_sc_pml_1 = [Const(dtype(np.sqrt(t**-1))) for t in params['t']]
348 |     bloch_x = [Const(dtype(params['bloch_phase'][0]))]
349 |     bloch_y = [Const(dtype(params['bloch_phase'][1]))]
350 |     bloch_z = [Const(dtype(params['bloch_phase'][2]))]
351 |     pemc = [Const(params['pemc'])]
352 | 
353 |     # Define the function
354 |     def gpu_fdfd_residual(X, B, R):
355 |         # Execute cuda code.
356 |         residual_fun( \
357 |             *(X + B + R + e + m + \
358 |               sc_pml_0 + sc_pml_1 + sqrt_sc_pml_0 + sqrt_sc_pml_1 + \
359 |               bloch_x + bloch_y + bloch_z + pemc), \
360 |             post_sync = R)
361 | 
362 |     return gpu_fdfd_residual
363 | 
364 | 
365 | def make_gpu_fdfd_matrix_multiplication(params, dtype):
366 |     """ Return function vec_matrix_multiplication(X, B) that will do AX=B """
367 | 
368 |     num_shared_banks = 6
369 | 
370 |     # Render the pre-loop and in-loop code.
371 |     cuda_type = _get_cuda_type(dtype)
372 |     code_allpre = jinja_env.get_template('fdfd_matrix_multiplication_pec_pmc.cu').\
373 |                     render(dims=params['shape'], \
374 |                            type=cuda_type, \
375 |                            mu_equals_1=False, \
376 |                            full_operator=True)
377 | 
378 |     # Grid input parameters.
379 |     grid_params = [(A + i, 'grid', dtype) for A in ['X', 'B', 'e', 'm'] \
380 |                    for i in ['x', 'y', 'z']]
381 | 
382 |     # Const input parameters.
383 |     const_names = ('sx0', 'sy0', 'sz0', 'sx1', 'sy1', 'sz1') + \
384 |                   ('sqrt_sx0', 'sqrt_sy0', 'sqrt_sz0', \
385 |                    'sqrt_sx1', 'sqrt_sy1', 'sqrt_sz1') + \
386 |                   ('bloch_x', 'bloch_y', 'bloch_z')
387 |     const_sizes = params['shape'] * 4 + tuple([3]) * 3
388 |     const_params = [(const_names[k], 'const', dtype, const_sizes[k]) \
389 |                     for k in range(len(const_sizes))]
390 |     const_params.append(('pemc', 'const', params['pemc'].dtype.type, 6))
391 | 
392 |     # Compile. (note shape_filter = 'square')
393 |     A_multiplication_fun = Kernel('', \
394 |                                   *(grid_params + const_params), \
395 |                                   pre_loop=code_allpre, \
396 |                                   padding=(1,1,1,1), \
397 |                                   smem_per_thread=num_shared_banks*16, \
398 |                                   shape_filter='square')
399 | 
400 |     # Temporary variables.
401 | 
402 |     # Grid variables.
403 |     # !!!!! eps is scattered over the GPUs when e is initialised here
404 |     e = [Grid(dtype(f), x_overlap=1) for f in params['e']]
405 |     m = [Grid(dtype(f), x_overlap=1) for f in params['m']] # Optional.
406 | 
407 |     # Constant variables.
408 |     sc_pml_0 = [Const(dtype(s**-1)) for s in params['s']]
409 |     sc_pml_1 = [Const(dtype(t**-1)) for t in params['t']]
410 |     sqrt_sc_pml_0 = [Const(dtype(np.sqrt(s**-1))) for s in params['s']]
411 |     sqrt_sc_pml_1 = [Const(dtype(np.sqrt(t**-1))) for t in params['t']]
412 |     bloch_x = [Const(dtype(params['bloch_phase'][0]))]
413 |     bloch_y = [Const(dtype(params['bloch_phase'][1]))]
414 |     bloch_z = [Const(dtype(params['bloch_phase'][2]))]
415 |     pemc = [Const(params['pemc'])]
416 | 
417 |     # Define the function
418 |     def gpu_fdfd_matrix_multiplication(X, B):
419 |         # Execute cuda code.
420 |         A_multiplication_fun( \
421 |             *(X + B + e + m + \
422 |               sc_pml_0 + sc_pml_1 + sqrt_sc_pml_0 + sqrt_sc_pml_1 + \
423 |               bloch_x + bloch_y + bloch_z + pemc), \
424 |             post_sync = B)
425 | 
426 |     return gpu_fdfd_matrix_multiplication
427 | 
428 | 
429 | def make_gpu_cond(dtype, cond):
430 |     """ Returns a function gpu_cond(A) that does A=A*C """
431 |     # GPU Code in gce.Kernel
432 |     code = Template("""
433 |         if (_in_global) {
434 |             Ax(0,0,0) = Ax(0,0,0)*Cx(0,0,0);
435 |             Ay(0,0,0) = Ay(0,0,0)*Cy(0,0,0);
436 |             Az(0,0,0) = Az(0,0,0)*Cz(0,0,0);
437 |         } """).render(type=_get_cuda_type(dtype))
438 | 
439 |     # Compile the code using gce.Kernel
440 |     grid_names = [A + i for A in ['A', 'C'] for i in ['x', 'y', 'z']]
441 |     Sum_fun = Kernel(code, \
442 |                      *[(name, 'grid', dtype) for name in grid_names], \
443 |                      shape_filter='skinny')
444 | 
445 |     C = cond
446 | 
447 |     # Define the actual function.
448 |     def gpu_cond(A):
449 |         Sum_fun(*( A + C ), \
450 |                 post_sync=A) # A must be post-synced before later kernels read it.
451 | 
452 |     return gpu_cond
453 | 
454 | 
455 | def make_DB_get_vec(dtype):
456 |     """ Returns a function that copies a Grid vector to host memory. """
457 |     # Temporary grid used to stage the copy.
458 |     temp = [Grid(dtype, x_overlap=1) for k in range(3)]
459 |     gpu_scaled_copy = make_gpu_scaled_copy(dtype)
460 | 
461 |     # Define the actual function.
462 |     def DB_get_vec(A):
463 |         gpu_scaled_copy(A, 1, temp)
464 |         out = [E.get() for E in temp]
465 |         return out
466 | 
467 |     return DB_get_vec
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 | 
4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 | 
8 | Preamble
9 | 
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 | 
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users.
We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". 
"Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. 
For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 
204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 
268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. 
But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 
387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. 
You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. 
"Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 
564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 
628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail. 651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/philosophy/why-not-lgpl.html>. 675 | --------------------------------------------------------------------------------
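
A brief illustration of the conditioning step defined in make_gpu_cond above: the CUDA template multiplies each field component by the matching conditioner component at every grid cell (A <- A*C). The same arithmetic can be sketched on the host with plain numpy; the shapes and values below are hypothetical, and the real kernel operates on gce.Grid arrays distributed across the GPUs, so this is an explanatory sketch rather than the actual implementation.

import numpy as np

# Hypothetical stand-ins for the (Ax, Ay, Az) field grids and the
# (Cx, Cy, Cz) conditioner grids; the real code uses gce.Grid objects.
shape = (4, 4, 4)
A = [np.full(shape, 1 + 1j, dtype=np.complex64) for _ in range(3)]
C = [np.full(shape, 2 + 0j, dtype=np.complex64) for _ in range(3)]

# Host equivalent of gpu_cond(A): component-wise A <- A*C, matching the
# per-cell statements Ax(0,0,0) = Ax(0,0,0)*Cx(0,0,0), etc.
for a, c in zip(A, C):
    a *= c

assert A[0][0, 0, 0] == np.complex64(2 + 2j)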