├── maxwell-solver
│   ├── __init__.py
│   ├── gce
│   │   ├── __init__.py
│   │   ├── const.py
│   │   ├── data.py
│   │   ├── out.py
│   │   ├── README.md
│   │   ├── kernel.cu
│   │   ├── space.py
│   │   ├── grid.py
│   │   └── kernel.py
│   ├── solvers
│   │   ├── __init__.py
│   │   └── test_bicg.py
│   ├── kernels
│   │   ├── alpha_allpre.cu
│   │   ├── fdfd_matrix_multiplication.cu
│   │   ├── fdfd_residual.cu
│   │   ├── alpha_biCGSTAB.cu
│   │   ├── omega_bloch_allpre.cu
│   │   ├── fdfd_matrix_multiplication_pec_pmc.cu
│   │   ├── fdfd_residual_pec_pmc.cu
│   │   ├── omega_bloch_pmc_pec.cu
│   │   └── alpha_bloch_pmc_pec.cu
│   ├── fdfd.py
│   └── maxwell_ops_lumped.py
├── .gitignore
├── maxwell-server
│   ├── maxwell-solver
│   ├── unbuffered.py
│   ├── maxwell_config.py
│   ├── webserver.py
│   └── simserver.py
├── start_maxwell_docker
├── maxwellfdfd.service
├── run_docker
├── start_maxwell
├── Dockerfile
├── maxwell_sweeper.py
├── README
└── LICENSE

/maxwell-solver/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
**/__pycache__/**
--------------------------------------------------------------------------------
/maxwell-solver/gce/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maxwell-solver/solvers/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maxwell-server/maxwell-solver:
--------------------------------------------------------------------------------
../maxwell-solver/
--------------------------------------------------------------------------------
/start_maxwell_docker:
--------------------------------------------------------------------------------
#!/bin/bash -e
source /pyenv/bin/activate
python3 maxwell-server/simserver.py $NGPUS &> simserver.log &
python3 maxwell-server/webserver.py $PORT &> webserver.log
--------------------------------------------------------------------------------
/maxwell-server/unbuffered.py:
--------------------------------------------------------------------------------
# Class for flushing a stream after every write.

class Unbuffered(object):
    def __init__(self, stream):
        self.stream = stream
    def write(self, data):
        self.stream.write(data)
        self.stream.flush()
    def __getattr__(self, attr):
        return getattr(self.stream, attr)

--------------------------------------------------------------------------------
/maxwellfdfd.service:
--------------------------------------------------------------------------------
[Unit]
Description=MaxwellFDFD webserver and simserver
After=network-online.target nss-lookup.target
Wants=network-online.target nss-lookup.target

[Service]
WorkingDirectory=/home/maxwell
Type=forking
ExecStart=/usr/bin/sudo -u maxwell /home/maxwell/start_maxwell

[Install]
WantedBy=multi-user.target
--------------------------------------------------------------------------------
/maxwell-server/maxwell_config.py:
--------------------------------------------------------------------------------
""" Configuration file for Maxwell.

Holds constants and such...
4 | """ 5 | 6 | import os 7 | 8 | path = os.environ["MAXWELL_SERVER_FILES"] 9 | 10 | if not os.path.exists(path): 11 | os.makedirs(path) 12 | 13 | def list_requests(): 14 | return [f for f in os.listdir(path) \ 15 | if f[-len('.request'):] == '.request'] 16 | -------------------------------------------------------------------------------- /run_docker: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Builds and runs maxwell in Docker. 3 | PORT=9041 4 | MAXWELL_DOCKER_VOL=maxwell-vol 5 | MAXWELL_SERVER_FILES=/mnt/maxwell-server-files 6 | 7 | docker build -t maxwell . 8 | docker run --runtime=nvidia \ 9 | --mount "source=$MAXWELL_DOCKER_VAL,target=$MAXWELL_SERVER_FILES" \ 10 | -d \ 11 | -p $PORT:$PORT \ 12 | -e PORT=$PORT \ 13 | -e MAXWELL_SERVER_FILES="$MAXWELL_SERVER_FILES" \ 14 | -e NGPUS=1 \ 15 | maxwell #tail -f /dev/null 16 | -------------------------------------------------------------------------------- /start_maxwell: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Simple script for starting up maxwell 4 | 5 | # Location to store temporary files. 6 | MAXWELL_SERVER_FILES=/tmp/maxwell-server-files 7 | # Port to use for webserver. 8 | PORT=9041 9 | # Number of GPUS per solve. 10 | NGPUS=1 11 | 12 | # Main directory for Maxwell source code. 13 | BASEDIR=. 14 | # Location of Python virtualenv containing Maxwell dependencies. 15 | PYENV=maxwell-solver-env 16 | SERVER_DIR=maxwell-server 17 | PYTHON=python3 18 | 19 | cd $BASEDIR 20 | 21 | source $PYENV/bin/activate 22 | $PYTHON $SERVER_DIR/webserver.py $PORT &> $BASEDIR/webserver.log & 23 | $PYTHON $SERVER_DIR/simserver.py $NGPUS &> $BASEDIR/simserver.log 24 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-devel 2 | 3 | # Do as much installation as possible to make use of caching as installing 4 | # is very slow. 5 | # A few comments: 6 | # 1) openmpi seems to give trouble so use mpich2. 7 | # 2) Use a virtualenv to avoid outdated system packages (i.e. six). 8 | RUN apt-get update && \ 9 | apt-get install -y python3-pip \ 10 | python3-setuptools \ 11 | libhdf5-serial-dev \ 12 | mpich 13 | 14 | RUN pip3 install virtualenv 15 | RUN virtualenv -p python3 pyenv 16 | 17 | RUN /pyenv/bin/pip3 install numpy 18 | RUN /pyenv/bin/pip3 install pycuda jinja2 h5py mpi4py 19 | RUN /pyenv/bin/pip3 install scipy 20 | 21 | WORKDIR /app 22 | COPY . /app 23 | 24 | EXPOSE 9041 25 | 26 | CMD ["./start_maxwell_docker"] 27 | -------------------------------------------------------------------------------- /maxwell-solver/gce/const.py: -------------------------------------------------------------------------------- 1 | """ Defines the Const class for GCE. """ 2 | 3 | from pycuda import gpuarray as ga 4 | from gce.data import Data 5 | import numpy as np 6 | 7 | class Const(Data): 8 | """ Const class for GCE. 9 | 10 | Used to store globally accessible, but unchangeable data. 11 | 12 | Derives from the Data class. 13 | 14 | New methods: 15 | __init__ -- Store an array as a Const on the GPU. 16 | """ 17 | 18 | def __init__(self, array): 19 | """ Create a Const. 20 | 21 | Input variables 22 | array -- a numpy array of valid dtype. 23 | """ 24 | 25 | self._set_gce_type('const') 26 | if type(array) is not np.ndarray: # Make sure we actually got an array. 
            raise TypeError('Array must be a numpy ndarray.')

        self._get_dtype(array.dtype.type) # Validate the array's dtype.
        self.to_gpu(array) # Load onto device.

--------------------------------------------------------------------------------
/maxwell-solver/solvers/test_bicg.py:
--------------------------------------------------------------------------------
import numpy as np
import unittest
import bicg

n = 10
A0 = np.random.randn(n, n)
b0 = np.random.randn(n)

class TestBicg(unittest.TestCase):
    def test_asymm(self):
        A = A0
        b = b0

        def multA(x, y):
            y[:] = np.dot(A, x)

        def multAT(x, y):
            y[:] = np.dot(A.T, x)

        ops = {'multA': multA, 'multAT': multAT}

        x, err, success = bicg.solve_asymm(b, **ops)
        self.assertTrue(success)

    def test_symm(self):
        A = np.dot(A0.T, A0) # Make A symmetric.
        b = b0

        def multA(x, y):
            y[:] = np.dot(A, x)

        ops = {'multA': multA}

        x, err, success = bicg.solve_symm(b, **ops)
        self.assertTrue(success)

    def test_zlumped(self):
        A = np.dot(A0.T, A0) # Make A symmetric.
        b = b0

        def alpha_step(rho_k, rho_k_1, p, r, v):
            p[:] = r + (rho_k / rho_k_1) * p
            v[:] = np.dot(A, p)
            return rho_k / np.dot(p, v) # Return alpha.

        def rho_step(alpha, p, r, v, x):
            x[:] = x + alpha * p
            r[:] = r - alpha * v

            # Return rho and err.
            return np.dot(r, r), np.sqrt(np.dot(np.conj(r), r))

        def zeros():
            return np.zeros_like(b)
        ops = {'rho_step': rho_step, 'alpha_step': alpha_step, 'zeros': zeros}

        x, err, success = bicg.solve_symm_lumped(b, **ops)
        self.assertTrue(success)

if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/maxwell-solver/gce/data.py:
--------------------------------------------------------------------------------
""" Defines the Data class for GCE. """

from gce.space import get_space_info
import numpy as np
from pycuda import gpuarray as ga

class Data:
    """ Generic data class for GCE.

    The Grid, Const, and Out classes are derived from this class.

    Only supports datatypes: np.int32, np.float32, np.float64, np.complex64,
    and np.complex128.

    Functions:
    to_gpu -- Load a numpy array on to the GPU.
    get -- Transfer data back to host memory.

    Variables:
    data -- GPUArray instance.
    dtype -- Numpy datatype of the data.
    cuda_type -- Corresponding cuda type of the data.
    """

    def _get_dtype(self, dtype):
        """ Certify that the dtype is valid, and find the cuda datatype. """
        if dtype not in (np.int32, np.float32, np.float64, np.complex64, np.complex128):
            raise TypeError('Array is of an unsupported dtype.')

        self.dtype = dtype # The numpy datatype.

        cuda_dict = {np.float32: 'float', np.float64: 'double', \
                     np.int32: 'int', \
                     np.complex64: 'pycuda::complex<float>', \
                     np.complex128: 'pycuda::complex<double>'}

        self.cuda_type = cuda_dict[self.dtype] # Corresponding cuda datatype.

    def _set_gce_type(self, type):
        """ Set whether we have a Grid, Const, or Out. """
        if type in ('grid', 'const', 'out'):
            self.gce_type = type
        else:
            raise TypeError('Invalid gce type.')

    def to_gpu(self, array):
        """ Load data to the gpu. """
""" 48 | self.data = ga.to_gpu(array) 49 | 50 | def get(self): 51 | """ Get data from the gpu. """ 52 | return self.data.get() 53 | -------------------------------------------------------------------------------- /maxwell-solver/gce/out.py: -------------------------------------------------------------------------------- 1 | """ Defines the Out class for GCE. """ 2 | 3 | from pycuda import gpuarray as ga 4 | from pycuda.reduction import ReductionKernel 5 | from gce.space import get_space_info 6 | from gce.data import Data 7 | import numpy as np 8 | from mpi4py.MPI import COMM_WORLD as comm 9 | 10 | 11 | class Out(Data): 12 | """ Out class for GCE. 13 | 14 | Outs store reduction operations. Outs allow for reduction operations in 15 | the GCE framework by storing intermediary (y,z) values during a kernel 16 | operation, which are then reduced into a single value. See the Kernel 17 | class for additional information. 18 | 19 | Currently only the "sum" operation is supported. 20 | 21 | Derives from the Data class. 22 | 23 | New methods: 24 | __init__ -- Create an Out of a particular dtype and operation. 25 | get -- Redefined to retrieve the result of the reduction. 26 | 27 | """ 28 | 29 | def __init__(self, dtype, op='sum'): 30 | """ Create an Out. 31 | 32 | Input variables 33 | dtype -- numpy dtype. 34 | 35 | Keyword variables 36 | op -- type of reduction operation to perform. Default='sum'. 37 | At this time, only the "sum" operation is supported. 38 | """ 39 | 40 | self._set_gce_type('out') 41 | self._get_dtype(dtype) # Validate dtype. 42 | 43 | if op not in ('sum','prod'): # Validate op. 44 | raise TypeError('Invalid op.') 45 | self.op = op 46 | 47 | # Obtain the neutral value and store it in the result variable. 48 | neutral_val = {'sum': 0, 'prod': 1} 49 | 50 | # Create the intermediary values. 51 | shape = get_space_info()['shape'] 52 | self.to_gpu((neutral_val[op] * \ 53 | np.ones((1, shape[1], shape[2]))).astype(self.dtype)) 54 | 55 | def reduce(self): 56 | """ Compute the result. """ 57 | self.result = comm.allreduce(ga.sum(self.data).get()) 58 | 59 | def get(self): 60 | """ Redefine get() to return the result of the operation. """ 61 | return self.result 62 | 63 | 64 | def batch_reduce(*outs): 65 | """ Optimal (compared to self.reduce) when communication cost is latency bound. """ 66 | results = comm.allreduce(np.array([ga.sum(out.data).get() for out in outs])) 67 | for k in range(len(outs)): 68 | outs[k].result = results[k] 69 | 70 | -------------------------------------------------------------------------------- /maxwell_sweeper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """ Removes the temporary files for Maxwell automatically. 3 | 4 | The temporary files in Maxwell tend to build up over time and these files 5 | eventually take up enough space to warrant removing them. Rather than manually 6 | performing this task, this script automatically removes that are older than 7 | a certain date (default: 7 days). This script is intended to be placed in 8 | cron folder (be sure to give the script executable permissions). 9 | """ 10 | import datetime 11 | import logging 12 | import os 13 | 14 | # Temporary maxwell server files directory to sweep. 15 | MAXWELL_SERVER_FILES_DIR = os.environ["MAXWELL_SERVER_FILES"] 16 | # Number of days to retain temporary files. 17 | DELETE_THRESHOLD_DAY = 7 18 | # Logging format to use. 
LOG_FORMAT = '[%(asctime)-15s][%(levelname)s][%(module)s][%(funcName)s] %(message)s'
# Place to store logs.
LOG_LOCATION = '/home/maxwell/maxwell-sweeper.log'

# Append to log file so that script can be run multiple times.
logging.basicConfig(filename=LOG_LOCATION, filemode='a',
                    format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

def main():
    logger.info('Beginning sweep...')
    # Keep track of the number of deleted files.
    deleted_files = 0
    # List all the files.
    for filename in os.listdir(MAXWELL_SERVER_FILES_DIR):
        fullpath = os.path.join(MAXWELL_SERVER_FILES_DIR, filename)
        try:
            # Get the last modified time and compare against current time.
            # Note that it is safer to retrieve the last modified timestamp
            # before obtaining the current timestamp to avoid the situation
            # where `last_modified > now`.
            last_modified = datetime.datetime.fromtimestamp(
                os.path.getmtime(fullpath))
            now = datetime.datetime.today()

            delta_time = now - last_modified
            if delta_time.days > DELETE_THRESHOLD_DAY:
                logger.debug('Removing {0} ({1} days old)...'.format(
                    fullpath, delta_time.days))
                os.remove(fullpath)
                deleted_files += 1
        except:
            logger.exception('Error handling {0}'.format(fullpath))
    logger.info('Sweep finished. Removed {0} files.'.format(deleted_files))

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/maxwell-solver/gce/README.md:
--------------------------------------------------------------------------------
TODO
====

* Clean up and update documentation.


What is GCE?
============
GCE stands for Grid Compute Engine, written by Jesse Lu in early 2012.


What does GCE do?
=================
GCE makes it easy to write fast 3D finite-difference applications
for CUDA.


How does GCE work?
==================
GCE provides a simple interface for manipulating gridded 3D data
on the GPU.
GCE is based on simple memory and execution objects
which hide non-essential features and details,
allowing applications to be defined in a simple, abstract way.


What is GCE built on?
=====================
GCE is heavily dependent on PyCUDA.


Interface overview
==================

For a simple example of GCE at work, see test_example.py.

Space
-----
The space forms the context for colocating grids and kernels.
For example, creating two grids on the same space tells GCE that
these two grids should be overlaid on top of each other.
In the same way, defining a kernel on the space defines which grid elements
will be updated.
As such, the space contains all the intra-process communication elements
needed to synchronize grids and to execute kernels in parallel.
Also, the space contains all the device information needed to run kernels
on the GPU devices.

Currently, the creation of only one global space is supported.

Grid
----
Grids represent three-dimensional fields.
To efficiently operate on Grids, every element in a Grid has limited visibility.
This means that when operating on a Grid (with a Kernel),
only certain adjacent neighboring elements may be accessed.
Specifically, only elements within a cube of length 2n+1
(where n, the stencil size, is specified by the user) may be accessed.

Const
-----
Consts are global constant arrays that can be accessed by any element of any grid.
However, the values of Const elements may not be reliably changed,
since such changes are not synchronized across devices.

Out
---
Outs are global scalars that are used to store the result of reduce operations
on a space.
Currently only sum operations are supported.

Kernel
------
Kernels perform operations on Grids and
accept Grids, Consts, and Outs as input parameters.
Kernels perform both update and sum functions.
Additionally, Kernels automatically provide self-optimization features
such as block_size optimization and loop-unrolling.

Writing code for Kernels
------------------------
GCE provides a number of simple conventions to make writing kernel code
simpler:

* Relative addressing of Grid elements.
  Computations that must be performed at every point on a Grid can be
  described using relative indexing.
  For example, copying from one Grid to another can be performed in the
  following way:
      x(0,0,0) = y(0,0,0);

Optimization tips
=================
* Remember to turn ECC off via 'nvidia-smi -e 0';
  this can later be checked using the test_pycuda_speed module.
--------------------------------------------------------------------------------
/maxwell-server/webserver.py:
--------------------------------------------------------------------------------
""" Web server for Maxwell.

Allows users to upload simulations and download results through HTTP.

Performs just three operations:
    1. Receive job as a file from client (POST).
    2. Return job status or simulation result to client (GET).
    3. Return queue status to client (HEAD).

Defaults to port 9041.
"""

import http.server
from io import StringIO
import cgi, shutil, tempfile, sys, os
from socketserver import ThreadingMixIn
import maxwell_config
from unbuffered import Unbuffered


class MaxwellHandler(http.server.BaseHTTPRequestHandler):
    """ Handler for the server. """

    def do_POST(self):
        """ Accepts files from client. """
        form = cgi.FieldStorage( \
            fp=self.rfile, \
            headers=self.headers, \
            environ={'REQUEST_METHOD':'POST', \
                     'CONTENT_TYPE':self.headers['Content-Type']})

        # "{ip}:" prefix added in front of the file name.
        try:
            filename = self.my_prefix() + form['key'].value
            f = open(filename, 'wb')
        except:
            self.send_error(400, "Upload failed.")
            return

        # Save file.
        shutil.copyfileobj(form['file'].file, f)
        f.close()

        # # Open permissions on file.
        # os.chmod(filename, 0777)

        self.send_response(200)
        self.send_header('Content-type', 'maxwell!')
        self.end_headers()

    def do_GET(self):
        """ Return file to client. """

        if self.path == '/': # No file specified, treat as HEAD request.
            self.do_HEAD()
            return

        fname = self.my_prefix() + self.path.lstrip('/')
        try:
            f = open(fname, 'rb')
        except:
            self.send_error(404, "File not found.")
            return

        self.send_response(200)
        self.send_header('Content-type', 'maxwell!')
        self.end_headers()
        shutil.copyfileobj(f, self.wfile)
        f.close()

        # If it ends with something like .E_xr then delete the file.
        ending = fname.split('.')[-1]
        if len(ending) == 4 and \
                ending[0] in 'EH' and \
                ending[1] == '_' and \
                ending[2] in 'xyz' and \
                ending[3] in 'ri':
            os.remove(fname)

        # print self.client_address

    def my_prefix(self):
        """ Produce the user-specific prefix for files. """
        return os.path.join(maxwell_config.path, self.client_address[0] + ':')

    def do_HEAD(self):
        """ Returns the number of jobs in queue. """
        self.send_response(200)
        self.send_header('Content-type', 'maxwell!')
        self.end_headers()

        num_requests = len(maxwell_config.list_requests())
        shutil.copyfileobj(StringIO("%d jobs pending (maxwell-server)" \
            % num_requests), self.wfile)


class ThreadingHTTPServer(ThreadingMixIn, http.server.HTTPServer):
    """ We use a multi-threaded version of HTTPServer. """
    pass


if __name__ == '__main__':
    sys.stdout = Unbuffered(sys.stdout)

    # Determine the port to use.
    if len(sys.argv) == 2:
        port = int(sys.argv[1])
    else:
        port = 9041

    server_address = ("", port)
    print("Serving at", server_address)

    httpd = ThreadingHTTPServer(server_address, MaxwellHandler)
    httpd.serve_forever()
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
Maxwell is a multi-GPU implementation of a finite-difference frequency-domain (FDFD) solver.
This code is intended to be used in conjunction with SPINS (github.com/stanfordnqp/spins-b).

Overview
========
Maxwell is implemented as a server to which SPINS can send simulations to be run.
This allows the actual simulation server (i.e. where the GPUs are) to be located separately
from where the rest of the optimization code is running, though it is recommended to keep
SPINS and Maxwell on the same machine if possible.

At its core, running Maxwell involves running two separate services:
1. A webserver `maxwell-server/webserver.py` that manages sending and receiving simulation data over HTTP.
2. A simserver `maxwell-server/simserver.py` that manages running the simulations.
Both services must be up and running for Maxwell to function properly.

Maxwell can be run in the following ways:
1) Use the Dockerfile provided. This is the preferred mechanism as it creates an isolated environment for Maxwell.
2) Manually launch the webserver and simserver. This allows for the most fine-grained control.


Docker
======
We have Dockerized the Maxwell solver to make solver maintenance easier.
In addition to the CUDA Toolkit, all other Maxwell dependencies are listed in the Dockerfile.

Installation
------------
1. Install Docker (http://docker.com).
2. Install the CUDA 10.0 Toolkit (https://developer.nvidia.com/cuda-10.0-download-archive).
3. Install NVIDIA-Docker (https://github.com/NVIDIA/nvidia-docker).
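
A quick way to verify a new installation is to talk to the webserver directly
over HTTP. The snippet below is an illustrative client sketch (not part of this
repository); it assumes the `requests` Python package and a server listening on
localhost port 9041. In normal operation SPINS performs these steps for you.

    import requests

    server = "http://localhost:9041"

    # Queue status: a GET on "/" is treated like a HEAD request and
    # returns an "N jobs pending (maxwell-server)" message.
    print(requests.get(server + "/").text)

    # Upload a job file (POST). The server saves it under
    # "<client-ip>:<key>" inside $MAXWELL_SERVER_FILES.
    with open("sim.request", "rb") as f:  # "sim.request" is a made-up name.
        requests.post(server, data={"key": "sim.request"}, files={"file": f})

    # Download a result file by name (GET).
    result = requests.get(server + "/sim.E_xr")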

Usage
-----
The Dockerfile is contained in the root Maxwell directory.
We have also provided a script to build and launch the Docker container:

    $ ./run_docker

To change the number of GPUs used per simulation, edit the `run_docker` script and set `NGPUS` to the desired value.

Docker Quick Reference
----------------------
To list all running containers,

    $ docker ps

To kill a container,

    $ docker kill [container-name-or-id]

To clean up all containers (dead containers still take up disk space),

    $ docker system prune --volumes

To examine the container state, launch an interactive bash session:

    $ docker exec -it [container-name-or-id] bash


Manual Installation
===================
Maxwell can also be installed manually. Follow the installation procedure
listed in the `Dockerfile`. See `./start_maxwell` for an example of how to
launch Maxwell manually.


Options
=======

1. Work directory: Maxwell requires a folder to store temporary data.
   This location is specified by the environment variable `MAXWELL_SERVER_FILES`,
   which must be set for Maxwell to run. Maxwell must have permissions to read
   and write to this directory.
2. Port number: The webserver by default runs on port 9041. This can be changed
   by changing the argument passed to `webserver.py`
   (e.g. `python webserver.py 9042`). If using Docker, change the `PORT` variable
   in `./run_docker`. Note that the port number change must be reflected in
   `spins` as well in order for this to work.
3. Number of GPUs per solve: By default, Maxwell will attempt to use 1 GPU
   per simulation. For larger simulations, it may be beneficial to use multiple
   GPUs per simulation. This can be done by changing the `NGPUS` value.


Troubleshooting
===============
1. Check the Maxwell server log files.

   Maxwell consists of two separate servers: `webserver.py` manages sending and receiving data
   and `simserver.py` manages running the actual simulations. Both servers save logs named `webserver.log` and
   `simserver.log` respectively.

2. Check the individual simulation log files.

   Every simulation maintains its own log file. This can be found under the `$MAXWELL_SERVER_FILES` directory.
   By default, Docker sets this location to be `/mnt/maxwell-server-files` in the container.


Acknowledgements
================
This code is primarily based off of https://github.com/JesseLu/maxwell-solver.
--------------------------------------------------------------------------------
/maxwell-solver/gce/kernel.cu:
--------------------------------------------------------------------------------
// These macros redefine the CUDA blocks and grids to be row-major,
// instead of column-major.
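// Note: threadIdx.x, the fastest-varying thread index, is mapped to _tz
// below, and the z-axis has unit stride in _MY_OFFSET -- so consecutive
// threads in a warp touch consecutive global-memory addresses, which
// keeps accesses coalesced.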

#define _tx threadIdx.z
#define _ty (signed int)(threadIdx.y - {{ padding[0] }})
#define _tz (signed int)(threadIdx.x - {{ padding[2] }})

#define _bx blockIdx.z
#define _by blockIdx.y
#define _bz blockIdx.x

#define _txx blockDim.z
#define _tyy (blockDim.y - {{ padding[0] + padding[1] }})
#define _tzz (blockDim.x - {{ padding[2] + padding[3] }})

#define _bxx gridDim.z
#define _byy gridDim.y
#define _bzz gridDim.x

// Use the complex-value definition and operators included with pycuda.
// This allows us to work with pycuda's GPUArray class.
#include <pycuda-complex.hpp>

// Defines row-major access to a 3D array.
// dx, dy, dz are shifts from the present location of the field.
// Note that there is an offset in the x-index.
#define _MY_OFFSET(dx,dy,dz) ((_X - {{ x_range[0] }} + dx) * {{ dims[1] }} * {{ dims[2] }} + \
                              (_Y + dy) * {{ dims[2] }} + (_Z + dz))

// Macros to access fields using the field(i,j,k) format,
// where dx, dy, and dz are RELATIVE offsets in the x, y, and z directions
// respectively.
{%- for p in params if p.gce_type == 'grid' %}
#define {{ p.name }}(dx,dy,dz) {{ p.name }}[_MY_OFFSET(dx,dy,dz)]
{%- endfor %}

// Constants. We have to have a crude work-around to avoid problems with
// trying to use pycuda::complex types in constant memory.
{# Commented out for now.
{%- for p in params if p.gce_type == 'const' %}
__constant__ {{ p.alt_type }} _{{ p.name }}_temp[{{ p.num_elems }}];
{%- endfor %}
{%- for p in params if p.gce_type == 'const' %}
{%- if p.cuda_type in ('pycuda::complex<float>', 'pycuda::complex<double>') %}
#define {{ p.name }}(i) {{ p.cuda_type }}(_{{ p.name }}_temp[i].x, _{{ p.name }}_temp[i].y)
{%- else %}
#define {{ p.name }}(i) _{{ p.name }}_temp[i]
{%- endif %}
{%- endfor %}
#}
{%- for p in params if p.gce_type == 'const' %}
#define {{ p.name }}(i) {{ p.name }}[i]
{%- endfor %}

// Dynamic allocation of shared memory.
extern __shared__ pycuda::complex<double> _gce_smem[];

__global__ void _gce_kernel(const int _x_start, const int _x_end,
{#- Add the fields as input parameters to the function. -#}
{#{%- for p in params if p.gce_type != 'const' -%}#}
{%- for p in params -%}
{% if p.gce_type == 'const' -%}
{{ p.cuda_type }}* {{ p.name }}
{% elif p.gce_type == 'number' -%}
{{ p.cuda_type }} {{ p.name }}
{% elif p.gce_type == 'out' -%}
{{ p.cuda_type }} *_{{ p.name }}_out
{% else -%}
{{ p.cuda_type }}* {{ p.name }}
{% endif -%}
{%- if not loop.last -%},
{%- else -%}) {% endif %} {% endfor %}
{
    // Global index variables which determine where you are in the space,
    // and subsequently which grid point you will access.
    int _X = _tx + _txx * _bx + _x_start;
    int _Y = _ty + _tyy * _by;
    int _Z = _tz + _tzz * _bz;

    // Threads that are responsible for a grid point.
    const bool _in_global = (((_Y >= 0) && (_Y < {{ dims[1] }})) && \
                             ((_Z >= 0) && (_Z < {{ dims[2] }})));

    // Threads that are not part of the thread block padding.
    const bool _in_local = (_ty >= 0) && (_ty < _tyy) && \
                           (_tz >= 0) && (_tz < _tzz);

    // Initialize the local variables for the Outs.
{%- for p in params if p.gce_type == 'out' %}
    // {{ p.cuda_type }} {{ p.name }} = {{ p.cuda_type }}(0);
    {{ p.cuda_type }} {{ p.name }} = _{{ p.name }}_out[_Y * {{ dims[2] }} + _Z];
{%- endfor %}

    // User-defined "pre-loop" code.
    {{ preloop_code }}

    for (; _X < _x_end ; _X += _txx) {
        // User-defined "loop" code.
        {{ loop_code }}
    }

    // Save outs of non-padding threads to global memory.
    if (_in_local && _in_global) {
{%- for p in params if p.gce_type == 'out' %}
        _{{ p.name }}_out[_Y * {{ dims[2] }} + _Z] = {{ p.name }};
{%- endfor %}
    }

    return;
}
--------------------------------------------------------------------------------
/maxwell-server/simserver.py:
--------------------------------------------------------------------------------
""" Simulation server for Maxwell.

Executes uploaded jobs.

Consists of an infinite loop which does the following:
    0. Check that GCE is not running.
    1. Find the oldest job.
    2. Run the solver on it.
    3. Repeat.

"""

import logging
import os
import os.path
import shlex
import subprocess
import sys
import time

import maxwell_config
import pycuda.driver
import unbuffered

LOG_FORMAT = '[%(asctime)-15s][%(levelname)s][%(module)s][%(funcName)s] %(message)s'
logging.basicConfig(format=LOG_FORMAT)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

def get_num_gpus():
    pycuda.driver.init()
    return pycuda.driver.Device.count()

def check_process_running(pname):
    # Check if a process with name 'pname' is running.
    p1 = subprocess.Popen(['ps', 'ax'],
                          stdout=subprocess.PIPE)
    p2 = subprocess.Popen(['grep', pname],
                          stdin=p1.stdout, stdout=subprocess.PIPE)
    p3 = subprocess.Popen(['grep', '-v', 'grep'],
                          stdin=p2.stdout, stdout=subprocess.PIPE)
    return p3.communicate()[0].find(b'\n') > -1

def find_oldest_job():
    req = maxwell_config.list_requests() # Get the requests.
    if not req:
        return None

    req_with_time = {}
    for r in req:
        req_with_time[r] = os.stat(os.path.join(maxwell_config.path, r)).st_ctime

    # Run the job with the earliest creation time.
    oldest_req = min(req_with_time, key=req_with_time.get)
    os.remove(os.path.join(maxwell_config.path, oldest_req))
    return oldest_req[:-len('.request')]

def main():
    sys.stdout = unbuffered.Unbuffered(sys.stdout)

    path_to_solver_dir = os.path.abspath(__file__).replace(
        __file__.split('/')[-1], 'maxwell-solver') + '/'
    logger.info('Solver directory set to {0}'.format(path_to_solver_dir))

    # Determine the number of GPUs on the system.
    num_gpus = get_num_gpus()
    logger.info('Number of GPUs detected on system: {0}'.format(num_gpus))

    # Determine the number of GPUs to use per solve (user input).
    if len(sys.argv) > 1:
        gpus_per_solve = int(sys.argv[1])
    else:
        gpus_per_solve = 2
    logger.info('Number of GPUs used per solve: {0}'.format(gpus_per_solve))

    if gpus_per_solve > num_gpus:
        raise ValueError('Number of GPUs is {0}, but number of GPUs requested '
                         'per solve is {1}, exceeding the number of available '
                         'GPUs.'.format(num_gpus, gpus_per_solve))

    # Generate GPU groupings.
    # Groupings take the form gpu1,gpu2,gpu3...
    # e.g. with 8 GPUs and 3 GPUs per solve we have:
    # solve_gpus = ['0,1,2','3,4,5']
    num_solves = num_gpus // gpus_per_solve
    solve_gpus = []
    for i in range(0, num_solves):
        start_num = i * gpus_per_solve
        solve_gpus.append(','.join(
            str(j) for j in range(start_num, start_num + gpus_per_solve)))

    # Managing loop.
    solve_obj = [None]*num_solves
    solve_paths = ['']*num_solves
    out_files = [None]*num_solves

    logger.info('Ready to accept simulations.')

    while True:
        time.sleep(1)

        # Check for solve completion.
        for i in range(len(solve_obj)):
            if solve_obj[i] and solve_obj[i].poll() is not None:
                logger.info('Simulation {0} ended with code {1}'.format(
                    i, solve_obj[i].returncode))

                # Close the output log file.
                out_files[i].close()

                # Used to let the user know that files can be downloaded.
                time.sleep(0.5)
                filepath = os.path.join(maxwell_config.path, solve_paths[i]
                                        + '.finished')
                f = open(filepath, 'w')
                logger.debug('Writing finished file at {0}'.format(filepath))
                f.write('{}'.format(solve_obj[i].returncode))
                f.close()

                # Delete the old job.
                solve_obj[i] = None
                out_files[i] = None
                solve_paths[i] = ''

        # Ensure that GCE is not running.
        if check_process_running('job_manager'):
            continue

        # Check for and start new solves.
        for i in range(len(solve_obj)):
            if solve_obj[i]:
                continue
            solve_paths[i] = find_oldest_job()
            if solve_paths[i]:
                logger.info('Solving {0} as simulation {1}'.format(
                    solve_paths[i], i))

                tmp_env = os.environ.copy()
                tmp_env['CUDA_VISIBLE_DEVICES'] = solve_gpus[i]
                #logger.debug('Environment provided: {0}'.format(tmp_env))

                out_file_log = os.path.join(maxwell_config.path,
                                            solve_paths[i] + '.log')
                out_files[i] = open(out_file_log, 'w')
                logger.debug('Outputting to log file: {0}'.format(out_file_log))

                command = ('mpirun -n ' + str(gpus_per_solve) + ' python ' +
                           path_to_solver_dir + 'fdfd.py ' +
                           os.path.join(maxwell_config.path, solve_paths[i]))
                logger.debug('Running command {0}'.format(command))

                solve_obj[i] = subprocess.Popen(shlex.split(command),
                                                stdout=out_files[i],
                                                stderr=subprocess.STDOUT,
                                                env=tmp_env)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/maxwell-solver/gce/space.py:
--------------------------------------------------------------------------------
""" Used to set up the global space for GCE. """

from mpi4py import MPI
from mpi4py.MPI import COMM_WORLD as comm
from pycuda import driver


def _init_gpu(comm):
    """ Chooses a gpu and creates a context on it. """
    # Find out how many GPUs are available to us on this node.
    driver.init()
    num_gpus = driver.Device.count()

    # Figure out the names of the other hosts.
    rank = comm.Get_rank() # Find out which process I am.
    name = MPI.Get_processor_name() # The name of my node.
    hosts = comm.allgather(name) # Get the names of all the other hosts.

    # Find out which GPU to take (by precedence).
    gpu_id = hosts[0:rank].count(name)
    if gpu_id >= num_gpus:
        raise TypeError('No GPU available.')


    # Create a context on the appropriate device.
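    # Worked example (illustrative, not from the source): if this rank
    # computed gpu_id == 2 on a node with num_gpus == 4, the loop below
    # tries devices 2, 3, 0, 1 in turn and keeps the first one for which
    # make_context() succeeds.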
    for k in range(num_gpus):
        try:
            device = driver.Device((gpu_id + k) % num_gpus)
            context = device.make_context()
        except:
            continue
        else:
            # print "On %s: process %d taking gpu %d of %d.\n" % \
            #     (name, rank, gpu_id+k, num_gpus)
            break

    return device, context # Return device and context.

# Global variable for the global space.
# The leading double underscore should prevent outside modules from accessing
# this variable.
__GLOBAL_SPACE = None

# Upon module initialization, claim a GPU and create a context on it.
__DEVICE, __CONTEXT = _init_gpu(comm)

import atexit
atexit.register(__CONTEXT.pop)

def initialize_space(shape):
    """ Form the space. """
    global __GLOBAL_SPACE, __DEVICE, __CONTEXT
    __GLOBAL_SPACE = __Space(shape, __DEVICE, __CONTEXT)

def get_space_info():
    """ Returns all the info needed about a space. """
    if __GLOBAL_SPACE is None: # Global space not yet initialized.
        raise TypeError('The global space is not initialized.')
    else:
        return __GLOBAL_SPACE.get_info()

def print_space_info():
    """ Prints out information about the space. """
    if __GLOBAL_SPACE is None: # Global space not yet initialized.
        raise TypeError('The global space is not initialized.')
    info = __GLOBAL_SPACE.get_info()
    for name, val in info.items():
        print(name, val)

# def destroy_space():
#     """ Set global space to none. """
#     global __GLOBAL_SPACE
#     __GLOBAL_SPACE.__del__()
#     __GLOBAL_SPACE = None

class __Space():
    """ Space forms the 3D context for Grid and Kernel objects.

    As of the current implementation, it is assumed that only one space
    will be created, and that all Const, Grid, and Kernel objects will
    operate on that space.

    """

    def __init__(self, shape, device, context):
        """ Constructor for the Space class.

        Input variables
        shape -- Three-element tuple of positive integers defining the size of
            the space in the x-, y-, and z-directions.

        """

        # Make sure shape has exactly three elements.
        if len(shape) != 3:
            raise TypeError('Shape must have exactly three elements.')

        # Make sure they are all integers.
        if any([type(s) is not int for s in shape]):
            raise TypeError('Shape must have only integer elements.')

        # Make sure all elements are positive.
        if any([s < 1 for s in shape]):
            raise TypeError('Shape must have only positive elements.')

        # # Make sure stencil is a single, non-negative integer.
        # if (type(stencil) is not int) or (stencil < 0):
        #     raise TypeError('Stencil must be a non-negative scalar integer.')
        #
        # Initialize the space.
        self.shape = shape

        # Get MPI information.
        rank = comm.Get_rank()
        size = comm.Get_size()

        # Nodes to pass forward and backward (along x) to.
        self.mpi_adj = {'forw': (rank+1)%size, 'back': (rank-1)%size}

        # Grid is too small to be partitioned.
        if (size > self.shape[0]):
            raise TypeError('Shape is too short along x to be partitioned.')

        # Create the context on the appropriate GPU.
        # self.device, self.context = self._init_gpu(comm)
        self.device = device
        self.context = context

        # Partition the space.
        # Each space is responsible for field[x_range[0]:x_range[1],:,:].
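        # Worked example (illustrative): shape[0] == 10 split across
        # size == 4 ranks gives x_range values (0, 2), (2, 5), (5, 7),
        # and (7, 10) for ranks 0-3 -- contiguous slabs that exactly
        # cover the x-axis.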
        get_x_range = lambda r: (int(self.shape[0] * (float(r) / size)), \
                                 int(self.shape[0] * (float(r+1) / size)))
        self.x_range = get_x_range(rank)

        self.all_x_ranges = [get_x_range(r) for r in range(size)]


    # def __del__(self):
    #     """ Pop the cuda context on cleanup. """
    #     # Make sure the space was actually initialized.
    #     if hasattr(self, 'context'):
    #         self.context.pop()

    def get_info(self):
        """ Return information about the space as a dict. """
        return {'shape': self.shape, \
                'x_range': self.x_range, \
                'all_x_ranges': self.all_x_ranges, \
                'mpi_adj': self.mpi_adj, \
                'max_shared_mem': self.device.max_shared_memory_per_block, \
                'max_block_z': self.device.max_block_dim_x, \
                'max_block_y': self.device.max_block_dim_y, \
                'max_threads': self.device.max_threads_per_block, \
                'mem_bandwidth': 1000 * self.device.memory_clock_rate/8 * \
                                 self.device.global_memory_bus_width * 2, \
                'max_registers': self.device.max_registers_per_block, \
                'async_engine_count': self.device.async_engine_count, \
                'ecc_enabled': self.device.ecc_enabled, \
               }
--------------------------------------------------------------------------------
/maxwell-solver/kernels/alpha_allpre.cu:
--------------------------------------------------------------------------------
// Mark the threads that need to load from global memory.
const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \
                       ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \
                       ((_Z >= -1) && (_Z <= {{ dims[2] }})));

// Set relevant field pointers to create wrap-around periodic grid.
if (_Y == -1) {
    _Y = {{ dims[1]-1 }};
}
if (_Y == {{ dims[1] }}) {
    _Y = 0;
}
if (_Z == -1) {
    _Z = {{ dims[2]-1 }};
}
if (_Z == {{ dims[2] }}) {
    _Z = 0;
}

// Some definitions for shared memory.
// Used to get unpadded thread indices.
#define s_ty (_ty + 1)
#define s_tz (_tz + 1)
#define s_tyy (_tyy + 2)
#define s_tzz (_tzz + 2)

// Helper definitions.
#define s_next_field (s_tyy * s_tzz)
#define s_to_local (s_ty * s_tzz + (s_tz))
#define s_zp +1
#define s_zn -1
#define s_yp +s_tzz
#define s_yn -s_tzz

{{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;

// Local memory.
{{ type }} Ey_p, Ez_p, Hy_n, Hz_n;
{{ type }} vx, vy, vz;
{{ type }} px, py, pz, py_p, pz_p;

int xn, xp;
if (_X == 0)
    xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction.
else
    xn = -1;


// Load E-fields into shared memory.
if (adj_dims) {
    // Load in p = r + beta * p.
    Ex_0[0] = (Rx(-1,0,0) + beta * Px(-1,0,0)) *
              (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z));
    Ey_0[0] = (Ry(-1,0,0) + beta * Py(-1,0,0)) *
              (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z));
    Ez_0[0] = (Rz(-1,0,0) + beta * Pz(-1,0,0)) *
              (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z));

    // Ey_p = Ry(0,0,0) + beta * Py(0,0,0);
    py_p = Ry(0,0,0) + beta * Py(0,0,0);
    Ey_p = (py_p) * (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z));

    // Ez_p = Rz(0,0,0) + beta * Pz(0,0,0);
    pz_p = Rz(0,0,0) + beta * Pz(0,0,0);
    Ez_p = (pz_p) * (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z));
}
__syncthreads();

// Calculate H-fields and store in shared memory.
// Hy.
if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
    Hy_0[0] = my(-1,0,0) * (sx1(_X+xn) * (Ez_0[0] - Ez_p) -
                            sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
}

// Hz.
if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
    Hz_0[0] = mz(-1,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) -
                            sx1(_X+xn) * (Ey_0[0] - Ey_p));
}
__syncthreads();

for (; _X < _x_end ; _X += _txx) {
    // We've moved ahead in X, so transfer appropriate field values.
    Ey_0[0] = Ey_p;
    Ez_0[0] = Ez_p;
    Hy_n = Hy_0[0];
    Hz_n = Hz_0[0];

    py = py_p;
    pz = pz_p;

    // Load E-fields into shared memory.
    if (_X == {{ dims[0]-1 }})
        xp = {{ -(dims[0]-1) }};
    else
        xp = +1;

    if (adj_dims) {
        px = Rx(0,0,0) + beta * Px(0,0,0);
        Ex_0[0] = (px) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z));

        py_p = Ry(+1,0,0) + beta * Py(+1,0,0);
        Ey_p = (py_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z));

        pz_p = Rz(+1,0,0) + beta * Pz(+1,0,0);
        Ez_p = (pz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z));
    }

    __syncthreads();

    // Calculate H-fields and store in shared memory.
    {% if mu_equals_1 == True %}
    // Hx.
    if ((_ty != _tyy) && (_tz != _tzz)) {
        Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) -
                   sy1(_Y) * (Ez_0[0] - Ez_0[s_yp]));
    }

    // Hy.
    if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
        Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) -
                   sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
    }

    // Hz.
    if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
        Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) -
                   sx1(_X) * (Ey_0[0] - Ey_p));
    }
    {% else %}
    // Hx.
    if ((_ty != _tyy) && (_tz != _tzz)) {
        Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) -
                               sy1(_Y) * (Ez_0[0] - Ez_0[s_yp]));
    }

    // Hy.
    if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
        Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) -
                               sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
    }

    // Hz.
    if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
        Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) -
                               sx1(_X) * (Ey_0[0] - Ey_p));
    }
    {% endif %}
    __syncthreads();

    // Write out the results.
    if (_in_global && _in_local) {
        {% if full_operator %}
        P1x(0,0,0) = px;
        P1y(0,0,0) = py;
        P1z(0,0,0) = pz;

        vx = ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) *
              (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn])
               - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn])
               - ex(0,0,0) * Ex_0[0]));
        vy = ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) *
              (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn])
               - sx0(_X) * (Hz_0[0] - Hz_n)
               - ey(0,0,0) * Ey_0[0]));
        vz = ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) *
              (sx0(_X) * (Hy_0[0] - Hy_n)
               - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn])
               - ez(0,0,0) * Ez_0[0]));

        Vx(0,0,0) = vx;
        Vy(0,0,0) = vy;
        Vz(0,0,0) = vz;

        alpha_denom += (px * vx) + (py * vy) + (pz * vz);

        {% else %}
        Vx(0,0,0) = Hx_0[0];
        Vy(0,0,0) = Hy_0[0];
        Vz(0,0,0) = Hz_0[0];

        {% endif %}
    }
    __syncthreads();
}
--------------------------------------------------------------------------------
/maxwell-solver/kernels/fdfd_matrix_multiplication.cu:
--------------------------------------------------------------------------------
// Mark the threads that need to load from global memory.
const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \
                       ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \
                       ((_Z >= -1) && (_Z <= {{ dims[2] }})));

// Bloch phase factors for the wrap-around periodic boundaries.
{{ type }} bloch_phaseYZ_x = 1.0;
{{ type }} bloch_phaseYZ_y = 1.0;
{{ type }} bloch_phaseYZ_z = 1.0;

// Set relevant field pointers to create wrap-around periodic grid.
if (_Y == -1) {
    _Y = {{ dims[1]-1 }};
    bloch_phaseYZ_x *= conj(bloch_y(0));
    bloch_phaseYZ_y *= conj(bloch_y(1));
    bloch_phaseYZ_z *= conj(bloch_y(2));
}
if (_Y == {{ dims[1] }}) {
    _Y = 0;
    bloch_phaseYZ_x *= bloch_y(0);
    bloch_phaseYZ_y *= bloch_y(1);
    bloch_phaseYZ_z *= bloch_y(2);
}
if (_Z == -1) {
    _Z = {{ dims[2]-1 }};
    bloch_phaseYZ_x *= conj(bloch_z(0));
    bloch_phaseYZ_y *= conj(bloch_z(1));
    bloch_phaseYZ_z *= conj(bloch_z(2));
}
if (_Z == {{ dims[2] }}) {
    _Z = 0;
    bloch_phaseYZ_x *= bloch_z(0);
    bloch_phaseYZ_y *= bloch_z(1);
    bloch_phaseYZ_z *= bloch_z(2);
}

// Some definitions for shared memory.
// Used to get unpadded thread indices.
#define s_ty (_ty + 1)
#define s_tz (_tz + 1)
#define s_tyy (_tyy + 2)
#define s_tzz (_tzz + 2)

// Helper definitions.
#define s_next_field (s_tyy * s_tzz)
#define s_to_local (s_ty * s_tzz + (s_tz))
#define s_zp +1
#define s_zn -1
#define s_yp +s_tzz
#define s_yn -s_tzz

{{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;
{{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local;

// Local memory.
{{ type }} Ey_p, Ez_p, Hy_n, Hz_n;
{{ type }} vx, vy, vz;
{{ type }} px, py, pz, py_p, pz_p;

int xn, xp;
{{ type }} bloch_phaseX_x = 1;
{{ type }} bloch_phaseX_y = 1;
{{ type }} bloch_phaseX_z = 1;
if (_X == 0) {
    bloch_phaseX_x = conj(bloch_x(0));
    bloch_phaseX_y = conj(bloch_x(1));
    bloch_phaseX_z = conj(bloch_x(2));
    xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction.
} else {
    xn = -1;
}

// Load E-fields into shared memory.
if (adj_dims) {
    // Load in the X field values (with Bloch phases and diagonal scaling applied).
    Ex_0[0] = bloch_phaseX_x * bloch_phaseYZ_x * (Xx(-1,0,0)) *
              (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z));
    Ey_0[0] = bloch_phaseX_y * bloch_phaseYZ_y * (Xy(-1,0,0)) *
              (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z));
    Ez_0[0] = bloch_phaseX_z * bloch_phaseYZ_z * (Xz(-1,0,0)) *
              (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z));

    // Ey_p = Xy(0,0,0), scaled.
    py_p = Xy(0,0,0);
    Ey_p = bloch_phaseYZ_y * (py_p) * (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z));

    // Ez_p = Xz(0,0,0), scaled.
    pz_p = Xz(0,0,0);
    Ez_p = bloch_phaseYZ_z * (pz_p) * (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z));
}
__syncthreads();

// Calculate H-fields and store in shared memory.
// Hy.
if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
    Hy_0[0] = my(-1,0,0) * (sx1(_X+xn) * (Ez_0[0] - Ez_p) -
                            sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
}

// Hz.
if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
    Hz_0[0] = mz(-1,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) -
                            sx1(_X+xn) * (Ey_0[0] - Ey_p));
}
__syncthreads();

for (; _X < _x_end ; _X += _txx) {
    // We've moved ahead in X, so transfer appropriate field values.
    Ey_0[0] = Ey_p;
    Ez_0[0] = Ez_p;
    Hy_n = Hy_0[0];
    Hz_n = Hz_0[0];

    py = py_p;
    pz = pz_p;

    // Load E-fields into shared memory.
    if (_X == {{ dims[0]-1 }}) {
        bloch_phaseX_x = bloch_x(0);
        bloch_phaseX_y = bloch_x(1);
        bloch_phaseX_z = bloch_x(2);
        xp = {{ -(dims[0]-1) }};
    } else {
        xp = +1;
        bloch_phaseX_x = 1;
        bloch_phaseX_y = 1;
        bloch_phaseX_z = 1;
    }
    if (adj_dims) {
        px = Xx(0,0,0);
        Ex_0[0] = bloch_phaseYZ_x * (px) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z));

        py_p = Xy(+1,0,0);
        Ey_p = bloch_phaseX_y * bloch_phaseYZ_y * (py_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z));

        pz_p = Xz(+1,0,0);
        Ez_p = bloch_phaseX_z * bloch_phaseYZ_z * (pz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z));
    }

    __syncthreads();

    // Calculate H-fields and store in shared memory.
    {% if mu_equals_1 == True %}
    // Hx.
    if ((_ty != _tyy) && (_tz != _tzz)) {
        Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) -
                   sy1(_Y) * (Ez_0[0] - Ez_0[s_yp]));
    }

    // Hy.
    if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
        Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) -
                   sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
    }

    // Hz.
    if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
        Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) -
                   sx1(_X) * (Ey_0[0] - Ey_p));
    }
    {% else %}
    // Hx.
    if ((_ty != _tyy) && (_tz != _tzz)) {
        Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) -
                               sy1(_Y) * (Ez_0[0] - Ez_0[s_yp]));
    }

    // Hy.
    if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
        Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) -
                               sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
    }

    // Hz.
    if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
        Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) -
                               sx1(_X) * (Ey_0[0] - Ey_p));
    }
    {% endif %}
    __syncthreads();

    // Write out the results.
    if (_in_global && _in_local) {
        {% if full_operator %}

        vx = ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) *
              (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn])
               - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn])
               - ex(0,0,0) * Ex_0[0]));
        vy = ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) *
              (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn])
               - sx0(_X) * (Hz_0[0] - Hz_n)
               - ey(0,0,0) * Ey_0[0]));
        vz = ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) *
              (sx0(_X) * (Hy_0[0] - Hy_n)
               - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn])
               - ez(0,0,0) * Ez_0[0]));

        Bx(0,0,0) = vx;
        By(0,0,0) = vy;
        Bz(0,0,0) = vz;

        {% else %}
        Bx(0,0,0) = Hx_0[0];
        By(0,0,0) = Hy_0[0];
        Bz(0,0,0) = Hz_0[0];

        {% endif %}
    }
    __syncthreads();
}
--------------------------------------------------------------------------------
/maxwell-solver/kernels/fdfd_residual.cu:
--------------------------------------------------------------------------------
// Mark the threads that need to load from global memory.
const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \
                       ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \
                       ((_Z >= -1) && (_Z <= {{ dims[2] }})));

// Bloch phase factors for the wrap-around periodic boundaries.
{{ type }} bloch_phaseYZ_x = 1.0;
{{ type }} bloch_phaseYZ_y = 1.0;
{{ type }} bloch_phaseYZ_z = 1.0;

// Set relevant field pointers to create wrap-around periodic grid.
if (_Y == -1) {
    _Y = {{ dims[1]-1 }};
    bloch_phaseYZ_x *= bloch_y(0);
    bloch_phaseYZ_y *= bloch_y(1);
    bloch_phaseYZ_z *= bloch_y(2);
}
if (_Y == {{ dims[1] }}) {
    _Y = 0;
    bloch_phaseYZ_x *= conj(bloch_y(0));
    bloch_phaseYZ_y *= conj(bloch_y(1));
    bloch_phaseYZ_z *= conj(bloch_y(2));
}
if (_Z == -1) {
    _Z = {{ dims[2]-1 }};
    bloch_phaseYZ_x *= bloch_z(0);
    bloch_phaseYZ_y *= bloch_z(1);
    bloch_phaseYZ_z *= bloch_z(2);
}
if (_Z == {{ dims[2] }}) {
    _Z = 0;
    bloch_phaseYZ_x *= conj(bloch_z(0));
    bloch_phaseYZ_y *= conj(bloch_z(1));
    bloch_phaseYZ_z *= conj(bloch_z(2));
}

// Some definitions for shared memory.
// Used to get unpadded thread indices.
#define s_ty (_ty + 1)
#define s_tz (_tz + 1)
#define s_tyy (_tyy + 2)
#define s_tzz (_tzz + 2)

// Helper definitions.
45 | #define s_next_field (s_tyy * s_tzz) 46 | #define s_to_local (s_ty * s_tzz + (s_tz)) 47 | #define s_zp +1 48 | #define s_zn -1 49 | #define s_yp +s_tzz 50 | #define s_yn -s_tzz 51 | 52 | {{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 53 | {{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 54 | {{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 55 | {{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 56 | {{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 57 | {{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 58 | 59 | // Local memory. 60 | {{ type }} Ey_p, Ez_p, Hy_n, Hz_n; 61 | {{ type }} vx, vy, vz; 62 | {{ type }} px, py, pz, py_p, pz_p; 63 | 64 | int xn, xp; 65 | {{ type }} bloch_phaseX_x = 1; 66 | {{ type }} bloch_phaseX_y = 1; 67 | {{ type }} bloch_phaseX_z = 1; 68 | if (_X == 0) { 69 | bloch_phaseX_x = bloch_x(0); 70 | bloch_phaseX_y = bloch_x(1); 71 | bloch_phaseX_z = bloch_x(2); 72 | xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction. 73 | } else { 74 | xn = -1; 75 | } 76 | 77 | // Load E-fields into shared memory. 78 | if (adj_dims) { 79 | // Load x, scaled by the Bloch phases and the sqrt(s) weights. 80 | Ex_0[0] = bloch_phaseX_x * bloch_phaseYZ_x * (Xx(-1,0,0)) * 81 | (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 82 | Ey_0[0] = bloch_phaseX_y * bloch_phaseYZ_y * (Xy(-1,0,0)) * 83 | (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 84 | Ez_0[0] = bloch_phaseX_z * bloch_phaseYZ_z * (Xz(-1,0,0)) * 85 | (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 86 | 87 | // Ey_p holds the y-component of x at the current x-slice. 88 | py_p = Xy(0,0,0); 89 | Ey_p = bloch_phaseYZ_y * (py_p) * (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 90 | 91 | // Ez_p holds the z-component of x at the current x-slice. 92 | pz_p = Xz(0,0,0); 93 | Ez_p = bloch_phaseYZ_z * (pz_p) * (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 94 | } 95 | __syncthreads(); 96 | 97 | // Calculate H-fields and store in shared_memory. 98 | // Hy. 99 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 100 | Hy_0[0] = my(-1,0,0) * (sx1(_X+xn) * (Ez_0[0] - Ez_p) - 101 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 102 | } 103 | 104 | // Hz. 105 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 106 | Hz_0[0] = mz(-1,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 107 | sx1(_X+xn) * (Ey_0[0] - Ey_p)); 108 | } 109 | __syncthreads(); 110 | 111 | for (; _X < _x_end ; _X += _txx) { 112 | // We've moved ahead in X, so transfer appropriate field values. 113 | Ey_0[0] = Ey_p; 114 | Ez_0[0] = Ez_p; 115 | Hy_n = Hy_0[0]; 116 | Hz_n = Hz_0[0]; 117 | 118 | py = py_p; 119 | pz = pz_p; 120 | 121 | // Load E-fields into shared memory. 122 | if (_X == {{ dims[0]-1 }}) { 123 | bloch_phaseX_x = conj(bloch_x(0)); 124 | bloch_phaseX_y = conj(bloch_x(1)); 125 | bloch_phaseX_z = conj(bloch_x(2)); 126 | xp = {{ -(dims[0]-1) }}; 127 | } else { 128 | xp = +1; 129 | bloch_phaseX_x = 1; 130 | bloch_phaseX_y = 1; 131 | bloch_phaseX_z = 1; 132 | } 133 | if (adj_dims) { 134 | px = Xx(0,0,0); 135 | Ex_0[0] = bloch_phaseYZ_x * (px) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 136 | 137 | py_p = Xy(+1,0,0); 138 | Ey_p = bloch_phaseX_y * bloch_phaseYZ_y * (py_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 139 | 140 | pz_p = Xz(+1,0,0); 141 | Ez_p = bloch_phaseX_z * bloch_phaseYZ_z * (pz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 142 | } 143 | 144 | __syncthreads(); 145 | 146 | // Calculate H-fields and store in shared_memory.
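The template branch that follows computes H as the discrete curl of E on the staggered Yee grid, scaled by 1/mu unless `mu_equals_1` removes the multiply; `sy1`/`sz1` carry the (PML-stretched) inverse spacings. A NumPy sketch of the Hx stencil using the kernel's field-minus-its-plus-one-neighbor differencing (a 2-D y-z patch stands in for one x-slice):

    import numpy as np

    def hx_slice(ey, ez, sy1, sz1, mx_inv=1.0):
        # Matches Hx = mx * (sz1*(Ey[0] - Ey[z+1]) - sy1*(Ez[0] - Ez[y+1])).
        return mx_inv * (sz1 * (ey[:-1, :-1] - ey[:-1, 1:])
                         - sy1 * (ez[:-1, :-1] - ez[1:, :-1]))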
147 | {% if mu_equals_1 == True %} 148 | // Hx. 149 | if ((_ty != _tyy) && (_tz != _tzz)) { 150 | Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 151 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 152 | } 153 | 154 | // Hy. 155 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 156 | Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) - 157 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 158 | } 159 | 160 | // Hz. 161 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 162 | Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 163 | sx1(_X) * (Ey_0[0] - Ey_p)); 164 | } 165 | {% else %} 166 | // Hx. 167 | if ((_ty != _tyy) && (_tz != _tzz)) { 168 | Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 169 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 170 | } 171 | 172 | // Hy. 173 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 174 | Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) - 175 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 176 | } 177 | 178 | // Hz. 179 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 180 | Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 181 | sx1(_X) * (Ey_0[0] - Ey_p)); 182 | } 183 | {% endif %} 184 | __syncthreads(); 185 | 186 | // Write out the results. 187 | if (_in_global && _in_local) { 188 | {% if full_operator %} 189 | 190 | vx = Bx(0,0,0) - ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) * 191 | (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn]) 192 | - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn]) 193 | - ex(0,0,0) * Ex_0[0])); 194 | vy = By(0,0,0) - ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) * 195 | (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn]) 196 | - sx0(_X) * (Hz_0[0] - Hz_n) 197 | - ey(0,0,0) * Ey_0[0])); 198 | vz = Bz(0,0,0) - ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) * 199 | (sx0(_X) * (Hy_0[0] - Hy_n) 200 | - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn]) 201 | - ez(0,0,0) * Ez_0[0])); 202 | 203 | Rx(0,0,0) = vx; 204 | Ry(0,0,0) = vy; 205 | Rz(0,0,0) = vz; 206 | 207 | {% else %} 208 | Bx(0,0,0) = Hx_0[0]; 209 | By(0,0,0) = Hy_0[0]; 210 | Bz(0,0,0) = Hz_0[0]; 211 | 212 | {% endif %} 213 | } 214 | __syncthreads(); 215 | } 216 | -------------------------------------------------------------------------------- /maxwell-solver/kernels/alpha_biCGSTAB.cu: -------------------------------------------------------------------------------- 1 | // Mark the threads that need to load from global memory. 2 | const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \ 3 | ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \ 4 | ((_Z >= -1) && (_Z <= {{ dims[2] }}))); 5 | 6 | // Set relevant field pointers to create wrap-around periodic grid. 7 | {{ type }} bloch_phaseYZ_x = 1.0; 8 | {{ type }} bloch_phaseYZ_y = 1.0; 9 | {{ type }} bloch_phaseYZ_z = 1.0; 10 | 11 | // Set relevant field pointers to create wrap-around periodic grid. 12 | if (_Y == -1) { 13 | _Y = {{ dims[1]-1 }}; 14 | bloch_phaseYZ_x *= bloch_y(0); 15 | bloch_phaseYZ_y *= bloch_y(1); 16 | bloch_phaseYZ_z *= bloch_y(2); 17 | } 18 | if (_Y == {{ dims[1] }}) { 19 | _Y = 0; 20 | bloch_phaseYZ_x *= conj(bloch_y(0)); 21 | bloch_phaseYZ_y *= conj(bloch_y(1)); 22 | bloch_phaseYZ_z *= conj(bloch_y(2)); 23 | } 24 | if (_Z == -1) { 25 | _Z = {{ dims[2]-1 }}; 26 | bloch_phaseYZ_x *= bloch_z(0); 27 | bloch_phaseYZ_y *= bloch_z(1); 28 | bloch_phaseYZ_z *= bloch_z(2); 29 | } 30 | if (_Z == {{ dims[2] }}) { 31 | _Z = 0; 32 | bloch_phaseYZ_x *= conj(bloch_z(0)); 33 | bloch_phaseYZ_y *= conj(bloch_z(1)); 34 | bloch_phaseYZ_z *= conj(bloch_z(2)); 35 | } 36 | 37 | // Some definitions for shared memory. 38 | // Used to get unpadded thread indices. 
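alpha_biCGSTAB.cu below fuses three BiCGSTAB sub-steps into one pass over the grid: the direction update p = r + beta * (p - omega * v), one application of the operator to get v = A p, and an on-the-fly reduction of the alpha denominator. The same arithmetic in NumPy, as a sketch (`A` stands in for the FDFD operator; the kernel accumulates `R_hatH* * v` without an explicit conj, so any conjugation is assumed to be folded into those precomputed fields):

    import numpy as np

    def alpha_step(A, r, r_hat, p, v, beta, omega, rho):
        p_new = r + beta * (p - omega * v)    # matches Rx + beta*(Px - omega*Vx)
        v_new = A @ p_new                     # the in-kernel operator apply
        alpha = rho / np.vdot(r_hat, v_new)   # vdot conjugates its first factor
        return p_new, v_new, alpha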
39 | #define s_ty (_ty + 1) 40 | #define s_tz (_tz + 1) 41 | #define s_tyy (_tyy + 2) 42 | #define s_tzz (_tzz + 2) 43 | 44 | // Helper definitions. 45 | #define s_next_field (s_tyy * s_tzz) 46 | #define s_to_local (s_ty * s_tzz + (s_tz)) 47 | #define s_zp +1 48 | #define s_zn -1 49 | #define s_yp +s_tzz 50 | #define s_yn -s_tzz 51 | 52 | {{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 53 | {{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 54 | {{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 55 | {{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 56 | {{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 57 | {{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 58 | 59 | // Local memory. 60 | {{ type }} Ey_p, Ez_p, Hy_n, Hz_n; 61 | {{ type }} vx, vy, vz; 62 | {{ type }} px, py, pz, py_p, pz_p; 63 | 64 | int xn, xp; 65 | {{ type }} bloch_phaseX_x = 1; 66 | {{ type }} bloch_phaseX_y = 1; 67 | {{ type }} bloch_phaseX_z = 1; 68 | if (_X == 0) { 69 | bloch_phaseX_x = bloch_x(0); 70 | bloch_phaseX_y = bloch_x(1); 71 | bloch_phaseX_z = bloch_x(2); 72 | xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction. 73 | } else { 74 | xn = -1; 75 | } 76 | 77 | // Load E-fields into shared memory. 78 | if (adj_dims) { 79 | // Load in p = r + beta * p. 80 | Ex_0[0] = bloch_phaseX_x * bloch_phaseYZ_x * (Rx(-1,0,0) + beta * (Px(-1,0,0) - omega * Vx(-1,0,0))) * 81 | (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 82 | Ey_0[0] = bloch_phaseX_y * bloch_phaseYZ_y * (Ry(-1,0,0) + beta * (Py(-1,0,0) - omega * Vy(-1,0,0))) * 83 | (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 84 | Ez_0[0] = bloch_phaseX_z * bloch_phaseYZ_z * (Rz(-1,0,0) + beta * (Pz(-1,0,0) - omega * Vz(-1,0,0))) * 85 | (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 86 | 87 | // Ey_p = Ry(0,0,0) + beta * Ey(0,0,0); 88 | py_p = Ry(0,0,0) + beta * (Py(0,0,0) - omega * Vy(0,0,0)); 89 | Ey_p = bloch_phaseYZ_y * (py_p) * (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 90 | 91 | // Ez_p = Rz(0,0,0) + beta * Ez(0,0,0); 92 | pz_p = Rz(0,0,0) + beta * (Pz(0,0,0) - omega * Vz(0,0,0)); 93 | Ez_p = bloch_phaseYZ_z * (pz_p) * (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 94 | } 95 | __syncthreads(); 96 | 97 | // Calculate H-fields and store in shared_memory. 98 | // Hy. 99 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 100 | Hy_0[0] = my(-1,0,0) * (sx1(_X+xn) * (Ez_0[0] - Ez_p) - 101 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 102 | } 103 | 104 | // Hz. 105 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 106 | Hz_0[0] = mz(-1,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 107 | sx1(_X+xn) * (Ey_0[0] - Ey_p)); 108 | } 109 | __syncthreads(); 110 | 111 | for (; _X < _x_end ; _X += _txx) { 112 | // We've moved ahead in X, so transfer appropriate field values. 113 | Ey_0[0] = Ey_p; 114 | Ez_0[0] = Ez_p; 115 | Hy_n = Hy_0[0]; 116 | Hz_n = Hz_0[0]; 117 | 118 | py = py_p; 119 | pz = pz_p; 120 | 121 | // Load E-fields into shared memory. 
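The `_X == {{ dims[0]-1 }}` block below (and the matching `_X == 0` block above it) implements Bloch-periodic wrap-around: stepping one cell past either x-boundary re-reads the opposite edge of the domain multiplied by a phase factor, with the conjugate phase applied at the opposite wall. The rule in isolation (the sign convention here is illustrative only):

    import numpy as np

    def bloch_read(field, i, k, period):
        # Out-of-range reads fold back into the domain and pick up the
        # Bloch phase exp(+/- 1j * k * period); interior reads are direct.
        n = len(field)
        if i < 0:
            return field[i + n] * np.exp(-1j * k * period)
        if i >= n:
            return field[i - n] * np.exp(+1j * k * period)
        return field[i]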
122 | if (_X == {{ dims[0]-1 }}) { 123 | bloch_phaseX_x = conj(bloch_x(0)); 124 | bloch_phaseX_y = conj(bloch_x(1)); 125 | bloch_phaseX_z = conj(bloch_x(2)); 126 | xp = {{ -(dims[0]-1) }}; 127 | } else { 128 | xp = +1; 129 | bloch_phaseX_x = 1; 130 | bloch_phaseX_y = 1; 131 | bloch_phaseX_z = 1; 132 | } 133 | if (adj_dims) { 134 | px = Rx(0,0,0) + beta * (Px(0,0,0) - omega * Vx(0,0,0)); 135 | Ex_0[0] = bloch_phaseYZ_x * (px) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 136 | 137 | py_p = Ry(+1,0,0) + beta * (Py(+1,0,0) - omega * Vy(+1,0,0)); 138 | Ey_p = bloch_phaseX_y * bloch_phaseYZ_y * (py_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 139 | 140 | pz_p = Rz(+1,0,0) + beta * (Pz(+1,0,0) - omega * Vz(+1,0,0)); 141 | Ez_p = bloch_phaseX_z * bloch_phaseYZ_z * (pz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 142 | } 143 | 144 | __syncthreads(); 145 | 146 | // Calculate H-fields and store in shared_memory. 147 | {% if mu_equals_1 == True %} 148 | // Hx. 149 | if ((_ty != _tyy) && (_tz != _tzz)) { 150 | Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 151 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 152 | } 153 | 154 | // Hy. 155 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 156 | Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) - 157 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 158 | } 159 | 160 | // Hz. 161 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 162 | Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 163 | sx1(_X) * (Ey_0[0] - Ey_p)); 164 | } 165 | {% else %} 166 | // Hx. 167 | if ((_ty != _tyy) && (_tz != _tzz)) { 168 | Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 169 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 170 | } 171 | 172 | // Hy. 173 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 174 | Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) - 175 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 176 | } 177 | 178 | // Hz. 179 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 180 | Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 181 | sx1(_X) * (Ey_0[0] - Ey_p)); 182 | } 183 | {% endif %} 184 | __syncthreads(); 185 | 186 | // Write out the results. 187 | if (_in_global && _in_local) { 188 | {% if full_operator %} 189 | P1x(0,0,0) = px; 190 | P1y(0,0,0) = py; 191 | P1z(0,0,0) = pz; 192 | 193 | vx = ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) * 194 | (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn]) 195 | - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn]) 196 | - ex(0,0,0) * Ex_0[0])); 197 | vy = ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) * 198 | (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn]) 199 | - sx0(_X) * (Hz_0[0] - Hz_n) 200 | - ey(0,0,0) * Ey_0[0])); 201 | vz = ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) * 202 | (sx0(_X) * (Hy_0[0] - Hy_n) 203 | - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn]) 204 | - ez(0,0,0) * Ez_0[0])); 205 | 206 | V1x(0,0,0) = vx; 207 | V1y(0,0,0) = vy; 208 | V1z(0,0,0) = vz; 209 | 210 | alpha_denom += (R_hatHx(0,0,0) * vx) + (R_hatHy(0,0,0) * vy) + (R_hatHz(0,0,0) * vz); 211 | 212 | {% else %} 213 | V1x(0,0,0) = Hx_0[0]; 214 | V1y(0,0,0) = Hy_0[0]; 215 | V1z(0,0,0) = Hz_0[0]; 216 | 217 | {% endif %} 218 | } 219 | __syncthreads(); 220 | } 221 | -------------------------------------------------------------------------------- /maxwell-solver/kernels/omega_bloch_allpre.cu: -------------------------------------------------------------------------------- 1 | // Mark the threads that need to load from global memory. 
2 | const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \ 3 | ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \ 4 | ((_Z >= -1) && (_Z <= {{ dims[2] }}))); 5 | 6 | // Set relevant field pointers to create wrap-around periodic grid. 7 | {{ type }} bloch_phaseYZ_x = 1.0; 8 | {{ type }} bloch_phaseYZ_y = 1.0; 9 | {{ type }} bloch_phaseYZ_z = 1.0; 10 | if (_Y == -1) { 11 | _Y = {{ dims[1]-1 }}; 12 | bloch_phaseYZ_x *= conj(bloch_y(0)); 13 | bloch_phaseYZ_y *= conj(bloch_y(1)); 14 | bloch_phaseYZ_z *= conj(bloch_y(2)); 15 | } 16 | if (_Y == {{ dims[1] }}) { 17 | _Y = 0; 18 | bloch_phaseYZ_x *= bloch_y(0); 19 | bloch_phaseYZ_y *= bloch_y(1); 20 | bloch_phaseYZ_z *= bloch_y(2); 21 | } 22 | if (_Z == -1) { 23 | _Z = {{ dims[2]-1 }}; 24 | bloch_phaseYZ_x *= conj(bloch_z(0)); 25 | bloch_phaseYZ_y *= conj(bloch_z(1)); 26 | bloch_phaseYZ_z *= conj(bloch_z(2)); 27 | } 28 | if (_Z == {{ dims[2] }}) { 29 | _Z = 0; 30 | bloch_phaseYZ_x *= bloch_z(0); 31 | bloch_phaseYZ_y *= bloch_z(1); 32 | bloch_phaseYZ_z *= bloch_z(2); 33 | } 34 | 35 | // Some definitions for shared memory. 36 | // Used to get unpadded thread indices. 37 | #define s_ty (_ty + 1) 38 | #define s_tz (_tz + 1) 39 | #define s_tyy (_tyy + 2) 40 | #define s_tzz (_tzz + 2) 41 | 42 | // Helper definitions. 43 | #define s_next_field (s_tyy * s_tzz) 44 | #define s_to_local (s_ty * s_tzz + (s_tz)) 45 | #define s_zp +1 46 | #define s_zn -1 47 | #define s_yp +s_tzz 48 | #define s_yn -s_tzz 49 | 50 | {{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 51 | {{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 52 | {{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 53 | {{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 54 | {{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 55 | {{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 56 | 57 | // Local memory. 58 | {{ type }} Ey_p, Ez_p, Hy_n, Hz_n; 59 | {{ type }} tx, ty, tz; 60 | {{ type }} sx, sy, sz, sy_p, sz_p; 61 | 62 | int xn, xp; 63 | {{ type }} bloch_phaseX_x = 1; 64 | {{ type }} bloch_phaseX_y = 1; 65 | {{ type }} bloch_phaseX_z = 1; 66 | if (_X == 0) { 67 | xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction. 68 | bloch_phaseX_x = conj(bloch_x(0)); 69 | bloch_phaseX_y = conj(bloch_x(1)); 70 | bloch_phaseX_z = conj(bloch_x(2));} 71 | else { 72 | xn = -1;} 73 | 74 | // Load E-fields into shared memory. 75 | if (adj_dims) { 76 | // Load in s = r - alpha * v. 77 | Ex_0[0] = bloch_phaseX_x*bloch_phaseYZ_x*(Rx(-1,0,0) - alpha * Vx(-1,0,0)) * 78 | (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 79 | Ey_0[0] = bloch_phaseX_y*bloch_phaseYZ_y*(Ry(-1,0,0) - alpha * Vy(-1,0,0)) * 80 | (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 81 | Ez_0[0] = bloch_phaseX_z*bloch_phaseYZ_z*(Rz(-1,0,0) - alpha * Vz(-1,0,0)) * 82 | (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 83 | 84 | // Ey_p = Ry(0,0,0) - alpha * Vy(0,0,0); 85 | sy_p = Ry(0,0,0) - alpha * Vy(0,0,0); 86 | Ey_p = bloch_phaseYZ_y * (sy_p) * (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 87 | 88 | // Ez_p = Rz(0,0,0) - alpha * Vz(0,0,0); 89 | sz_p = Rz(0,0,0) - alpha * Vz(0,0,0); 90 | Ez_p = bloch_phaseYZ_z * (sz_p) * (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 91 | } 92 | __syncthreads(); 93 | 94 | // Calculate H-fields and store in shared_memory. 95 | // Hy. 
96 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 97 | Hy_0[0] = my(-1,0,0) * (sx1(_X+xn) * (Ez_0[0] - Ez_p) - 98 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 99 | } 100 | 101 | // Hz. 102 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 103 | Hz_0[0] = mz(-1,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 104 | sx1(_X+xn) * (Ey_0[0] - Ey_p)); 105 | } 106 | __syncthreads(); 107 | 108 | for (; _X < _x_end ; _X += _txx) { 109 | // We've moved ahead in X, so transfer appropriate field values. 110 | Ey_0[0] = Ey_p; 111 | Ez_0[0] = Ez_p; 112 | Hy_n = Hy_0[0]; 113 | Hz_n = Hz_0[0]; 114 | 115 | sy = sy_p; 116 | sz = sz_p; 117 | 118 | // Load E-fields into shared memory. 119 | if (_X == {{ dims[0]-1 }}){ 120 | xp = {{ -(dims[0]-1) }}; 121 | bloch_phaseX_x = bloch_x(0); 122 | bloch_phaseX_y = bloch_x(1); 123 | bloch_phaseX_z = bloch_x(2);} 124 | else { 125 | xp = +1; 126 | bloch_phaseX_x = 1; 127 | bloch_phaseX_y = 1; 128 | bloch_phaseX_z = 1;} 129 | 130 | if (adj_dims) { 131 | sx = Rx(0,0,0) - alpha * Vx(0,0,0); 132 | Ex_0[0] = bloch_phaseYZ_x * (sx) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 133 | 134 | sy_p = Ry(+1,0,0) - alpha * Vy(+1,0,0); 135 | Ey_p = bloch_phaseX_y * bloch_phaseYZ_y * (sy_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 136 | 137 | sz_p = Rz(+1,0,0) - alpha * Vz(+1,0,0); 138 | Ez_p = bloch_phaseX_z * bloch_phaseYZ_z * (sz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 139 | } 140 | 141 | __syncthreads(); 142 | 143 | // Calculate H-fields and store in shared_memory. 144 | {% if mu_equals_1 == True %} 145 | // Hx. 146 | if ((_ty != _tyy) && (_tz != _tzz)) { 147 | Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 148 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 149 | } 150 | 151 | // Hy. 152 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 153 | Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) - 154 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 155 | } 156 | 157 | // Hz. 158 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 159 | Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 160 | sx1(_X) * (Ey_0[0] - Ey_p)); 161 | } 162 | {% else %} 163 | // Hx. 164 | if ((_ty != _tyy) && (_tz != _tzz)) { 165 | Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 166 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 167 | } 168 | 169 | // Hy. 170 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 171 | Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) - 172 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 173 | } 174 | 175 | // Hz. 176 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 177 | Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 178 | sx1(_X) * (Ey_0[0] - Ey_p)); 179 | } 180 | {% endif %} 181 | __syncthreads(); 182 | 183 | // Write out the results. 
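The write-out that follows accumulates the two scalars of the BiCGSTAB omega step: with t = A s, omega = <t, s> / <t, t> is the least-squares minimizer of ||s - omega * t||. The reduction the kernel performs cell by cell, in NumPy:

    import numpy as np

    def omega_step(t, s):
        num = np.vdot(t, s)         # sum of conj(t_i) * s_i, as in omega_num
        denom = np.vdot(t, t).real  # sum of |t_i|^2, as in omega_denom
        return num / denom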
184 | if (_in_global && _in_local) { 185 | {% if full_operator %} 186 | Sx(0,0,0) = sx; 187 | Sy(0,0,0) = sy; 188 | Sz(0,0,0) = sz; 189 | 190 | tx = ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) * 191 | (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn]) 192 | - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn]) 193 | - ex(0,0,0) * Ex_0[0])); 194 | ty = ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) * 195 | (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn]) 196 | - sx0(_X) * (Hz_0[0] - Hz_n) 197 | - ey(0,0,0) * Ey_0[0])); 198 | tz = ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) * 199 | (sx0(_X) * (Hy_0[0] - Hy_n) 200 | - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn]) 201 | - ez(0,0,0) * Ez_0[0])); 202 | 203 | Tx(0,0,0) = tx; 204 | Ty(0,0,0) = ty; 205 | Tz(0,0,0) = tz; 206 | 207 | omega_num += conj(tx) * sx + conj(ty) * sy + conj(tz) * sz; 208 | omega_denom += (real(tx) * real(tx)) + (imag(tx) * imag(tx)) + 209 | (real(ty) * real(ty)) + (imag(ty) * imag(ty)) + 210 | (real(tz) * real(tz)) + (imag(tz) * imag(tz)); 211 | //omega_num += tx * sx + ty * sy + tz * sz; 212 | //omega_denom += tx * tx + ty * ty + tz * tz; 213 | 214 | {% else %} 215 | Vx(0,0,0) = Hx_0[0]; 216 | Vy(0,0,0) = Hy_0[0]; 217 | Vz(0,0,0) = Hz_0[0]; 218 | 219 | {% endif %} 220 | } 221 | __syncthreads(); 222 | } 223 | -------------------------------------------------------------------------------- /maxwell-solver/gce/grid.py: -------------------------------------------------------------------------------- 1 | """ Defines the Grid class for GCE. """ 2 | 3 | from pycuda import gpuarray as ga 4 | from pycuda import driver as drv 5 | from gce.space import get_space_info 6 | from gce.data import Data 7 | import numpy as np 8 | from mpi4py.MPI import COMM_WORLD as comm 9 | import threading 10 | 11 | 12 | 13 | class Grid(Data): 14 | """ Grid class for GCE. 15 | 16 | Grids store modifiable information on a 3D rectangular grid. 17 | 18 | Grids may be split up along the x-axis for parallel processing. 19 | If a particular Grid requires adjacent values in the x-direction, 20 | then the needed adjacent cells can be synchronized through use of the 21 | x_overlap option and the synchronize(), synchronize_start(), and 22 | synchronize_isdone() functions. 23 | 24 | Derives from the Data class. 25 | 26 | New methods: 27 | __init__ -- Loads a (possibly empty) array onto the GPU. 28 | synchronize -- Used to synchronize Grids with non-zero x_overlap (blocking). 29 | synchronize_start -- Initiate non-blocking synchronization. 30 | synchronize_isdone -- Advance and complete non-blocking synchronization. 31 | 32 | New variables: 33 | none 34 | 35 | """ 36 | 37 | 38 | def __init__(self, array_or_dtype, x_overlap=0): 39 | """ Create a spatial grid on the GPU(s). 40 | 41 | Input variables 42 | array_or_dtype -- can either be a numpy array of the same shape as 43 | the global space, or a numpy dtype. If a valid array is passed, 44 | it will be loaded on to the GPU. If a dtype is passed, then 45 | an array of zeros, of that dtype will be loaded onto the GPU. 46 | 47 | Optional variables 48 | x_overlap -- the number of adjacent cells in either the negative or 49 | positive x-direction that need to simultaneously be accessed along 50 | with the current cell. Must be a non-negative integer. Default 51 | value is 0. 52 | 53 | """ 54 | 55 | shape = get_space_info()['shape'] # Get the shape of the space. 56 | xr = get_space_info()['x_range'] # Get the local x_range. 57 | all_x_ranges = get_space_info()['all_x_ranges'] # Get the local x_range. 
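The constructor code just below splits the global array into per-rank x-slabs, scatters them, and pads each slab with `x_overlap` ghost slabs on both ends. A standalone sketch of that decomposition (the two-rank split and overlap of 1 are example values):

    import numpy as np

    shape = (8, 4, 4)
    all_x_ranges = [(0, 4), (4, 8)]       # example split over two ranks
    x_overlap = 1

    global_array = np.arange(np.prod(shape)).reshape(shape)
    slabs = [global_array[r0:r1] for r0, r1 in all_x_ranges]

    def pad(s):
        ghost = np.zeros((x_overlap,) + s.shape[1:], s.dtype)
        return np.concatenate((ghost, s, ghost))

    print([pad(s).shape for s in slabs])  # [(6, 4, 4), (6, 4, 4)]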
58 | local_shape = (xr[1]-xr[0], shape[1], shape[2]) 59 | 60 | self._set_gce_type('grid') # Set the gce type to grid. 61 | 62 | # Make sure overlap option is valid. 63 | if type(x_overlap) is not int: 64 | raise TypeError('x_overlap must be an integer.') 65 | elif x_overlap < 0: 66 | raise TypeError('x_overlap must be a non-negative integer.') 67 | 68 | if comm.rank == 0: 69 | # Process the array_or_dtype input variable. 70 | if type(array_or_dtype) is np.ndarray: # Input is an array. 71 | array = array_or_dtype 72 | 73 | # Make sure the array is of the correct shape. 74 | if array.shape != shape: 75 | raise TypeError('Shape of array does not match shape of space.') 76 | 77 | # Make sure the array is of a valid datatype. 78 | self._get_dtype(array.dtype.type) 79 | 80 | 81 | elif type(array_or_dtype) is type: # Input is a datatype. 82 | self._get_dtype(array_or_dtype) # Validate the dtype. 83 | array = np.zeros(shape, dtype=self.dtype) # Make a zeros array. 84 | 85 | else: # Invalid input. 86 | raise TypeError('Input variable must be a numpy array or dtype.') 87 | 88 | # Prepare array to be scattered. 89 | array = [array[r[0]:r[1],:,:] for r in all_x_ranges] 90 | 91 | else: 92 | array = None 93 | 94 | array = comm.scatter(array) 95 | self._get_dtype(array.dtype.type) 96 | 97 | # # Narrow down the array to local x_range. 98 | # array = array[xr[0]:xr[1],:,:] 99 | 100 | # Add padding to array, if needed. 101 | self._xlap = x_overlap 102 | if self._xlap != 0: 103 | padding = np.empty((self._xlap,) + shape[1:3], dtype=array.dtype) 104 | array = np.concatenate((padding, array, padding), axis=0) 105 | 106 | self.to_gpu(array) # Load onto device. 107 | 108 | # Determine information needed for synchronization. 109 | if self._xlap != 0: 110 | # Calculates the pointer to the x offset in a grid. 111 | ptr_dx = lambda x_pos: self.data.ptr + self.data.dtype.itemsize * \ 112 | x_pos * shape[1] * shape[2] 113 | 114 | # Pointers to different sections of the grid that are relevant 115 | # for synchronization. 116 | self._sync_ptrs = { 'forw_src': ptr_dx(xr[1]-xr[0]), \ 117 | 'back_dest': ptr_dx(0), \ 118 | 'back_src': ptr_dx(self._xlap), \ 119 | 'forw_dest': ptr_dx(xr[1]-xr[0] + self._xlap)} 120 | 121 | # Buffers used during synchronization. 122 | self._sync_buffers = [drv.pagelocked_empty( \ 123 | (self._xlap, shape[1], shape[2]), \ 124 | self.dtype) for k in range(4)] 125 | 126 | # Streams used during synchronization. 127 | self._sync_streams = [drv.Stream() for k in range(4)] 128 | 129 | # Used to identify neighboring MPI nodes with whom to synchronize. 130 | self._sync_adj = get_space_info()['mpi_adj'] 131 | 132 | # Offset in bytes to the true start of the grid. 133 | # This is used to "hide" overlap areas from the kernel. 134 | self._xlap_offset = self.data.dtype.itemsize * \ 135 | self._xlap * shape[1] * shape[2] 136 | 137 | self.synchronize() # Synchronize the grid. 138 | comm.Barrier() # Wait for all grids to synchronize before proceeding. 139 | 140 | def get(self): 141 | """ Redefined so that we don't get overlap data. """ 142 | # Get our section of the grid (excluding overlap). 143 | if self._xlap == 0: 144 | data = self.data.get() 145 | else: 146 | data = self.data.get()[self._xlap:-self._xlap,:,:] 147 | 148 | # return np.concatenate(comm.allgather(data), axis=0) # Super-simple. 149 | 150 | result = comm.gather(data) # Gather all pieces to root. 151 | if comm.Get_rank() == 0: 152 | # Root node glues everything together.
153 | return np.concatenate(result, axis=0) 154 | else: 155 | return None 156 | 157 | def _get_raw(self): 158 | """ Output even the overlap data. Just for debugging/testing. """ 159 | return self.data.get() 160 | 161 | def synchronize(self): 162 | """ Blocking synchronization. """ 163 | 164 | if self._xlap == 0: 165 | raise TypeError('No need to synchronize Grid with no overlaps.') 166 | 167 | self.synchronize_start() 168 | while not self.synchronize_isdone(): 169 | pass 170 | 171 | def synchronize_start(self): 172 | """ Start the synchronization process. """ 173 | 174 | # Use shorter, easier names for class variables. 175 | bufs = self._sync_buffers 176 | ptrs = self._sync_ptrs 177 | streams = self._sync_streams 178 | adj = self._sync_adj 179 | 180 | # Start the transfer operations needed. 181 | self._sync_tags = [mpi_tag() for k in range(2)] # MPI message tags. 182 | 183 | # Forward send. 184 | drv.memcpy_dtoh_async(bufs[0], ptrs['forw_src'], stream=streams[0]) 185 | 186 | # Backward send. 187 | drv.memcpy_dtoh_async(bufs[1], ptrs['back_src'], stream=streams[1]) 188 | 189 | # Forward receive. 190 | self._sync_req_forw = comm.Irecv(bufs[2], source=adj['back'], \ 191 | tag=self._sync_tags[0]) 192 | 193 | # Backward receive. 194 | self._sync_req_back = comm.Irecv(bufs[3], source=adj['forw'], \ 195 | tag=self._sync_tags[1]) 196 | 197 | # Signalling variables needed to complete transfers. 198 | self._sync_part2_start = [False, False, False, False] 199 | 200 | 201 | def synchronize_isdone(self): 202 | """ Complete synchronization process. """ 203 | 204 | # Use shorter, easier names for class variables. 205 | bufs = self._sync_buffers 206 | ptrs = self._sync_ptrs 207 | streams = self._sync_streams 208 | adj = self._sync_adj 209 | part2_start = self._sync_part2_start 210 | is_done = [False, False, False, False] 211 | 212 | # Forward send. 213 | if streams[0].is_done(): # Device-to-host copy completed. 214 | if not part2_start[0]: # Initialize MPI send. 215 | comm.Isend(bufs[0], dest=adj['forw'], tag=self._sync_tags[0]) 216 | part2_start[0] = True 217 | is_done[0] = True 218 | else: # No more work to do. 219 | is_done[0] = True 220 | 221 | # Backward send. 222 | if streams[1].is_done(): # Device-to-host copy completed. 223 | if not part2_start[1]: # Initialize MPI send. 224 | comm.Isend(bufs[1], dest=adj['back'], tag=self._sync_tags[1]) 225 | part2_start[1] = True 226 | is_done[1] = True 227 | else: # No more work to do. 228 | is_done[1] = True 229 | 230 | # Forward receive. 231 | if self._sync_req_forw.Test(): # MPI receive completed. 232 | if not part2_start[2]: # Initialize host-to-device copy. 233 | drv.memcpy_htod_async(ptrs['back_dest'], bufs[2], \ 234 | stream=streams[2]) # Host-to-device. 235 | part2_start[2] = True 236 | elif streams[2].is_done(): # Host-to-device copy completed. 237 | is_done[2] = True 238 | 239 | # Backward receive. 240 | if self._sync_req_back.Test(): # MPI receive completed. 241 | if not part2_start[3]: # Initialize host-to-device copy. 242 | drv.memcpy_htod_async(ptrs['forw_dest'], bufs[3], \ 243 | stream=streams[3]) # Host-to-device. 244 | part2_start[3] = True 245 | elif streams[3].is_done(): # Host-to-device copy completed. 246 | is_done[3] = True 247 | # print '~', is_done[0:4], 248 | # Return true only when all four transfers are complete. 249 | return all(is_done) 250 | 251 | 252 | __MPI_TAG_NUM = 0 # Global variable used to generate unique mpi tags. 253 | 254 | def mpi_tag(): 255 | """ Get a new, unique mpi tag number.
""" 256 | global __MPI_TAG_NUM # Get the global variable. 257 | tag = __MPI_TAG_NUM # The variable to return. 258 | __MPI_TAG_NUM += 1 259 | return tag 260 | -------------------------------------------------------------------------------- /maxwell-solver/kernels/fdfd_matrix_multiplication_pec_pmc.cu: -------------------------------------------------------------------------------- 1 | // Mark the threads that need to load from global memory. 2 | const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \ 3 | ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \ 4 | ((_Z >= -1) && (_Z <= {{ dims[2] }}))); 5 | 6 | // Set relevant field pointers to create wrap-around periodic grid. 7 | {{ type }} bloch_phaseYZ_x = 1.0; 8 | {{ type }} bloch_phaseYZ_y = 1.0; 9 | {{ type }} bloch_phaseYZ_z = 1.0; 10 | 11 | {{ type }} pc_yz_factor[3] = { 1.0, 1.0, 1.0 }; 12 | int pc_iy_Ex = 0; 13 | int pc_iz_Ex = 0; 14 | int pc_iy_Ey = 0; 15 | int pc_iz_Ey = 0; 16 | int pc_iy_Ez = 0; 17 | int pc_iz_Ez = 0; 18 | 19 | if (_Y == -1) { 20 | _Y = {{ dims[1]-1 }}; 21 | bloch_phaseYZ_x *= conj(bloch_y(0)); 22 | bloch_phaseYZ_y *= conj(bloch_y(1)); 23 | bloch_phaseYZ_z *= conj(bloch_y(2)); 24 | if ( pemc(2) == 1 ) { 25 | //PEC (anti-symmetric) 26 | _Y = 0; 27 | pc_yz_factor[0] = -1.0; 28 | pc_yz_factor[2] = -1.0; 29 | pc_iy_Ex = 1; 30 | pc_iy_Ey = 0; 31 | pc_iy_Ez = 1; 32 | } else if ( pemc(2) == 2 ) { 33 | //PMC (symmetric) 34 | _Y = 0; 35 | pc_yz_factor[1] = -1.0; 36 | pc_iy_Ex = 1; 37 | pc_iy_Ey = 0; 38 | pc_iy_Ez = 1; 39 | } 40 | } 41 | if (_Y == {{ dims[1]-1 }}) { 42 | if ( pemc(3) == 1 ) { 43 | pc_iy_Ey = -1; 44 | } 45 | if ( pemc(3) == 2 ) { 46 | pc_iy_Ey = -1; 47 | pc_yz_factor[1] = -1.0; 48 | } 49 | } 50 | if (_Y == {{ dims[1] }}) { 51 | _Y = 0; 52 | bloch_phaseYZ_x *= bloch_y(0); 53 | bloch_phaseYZ_y *= bloch_y(1); 54 | bloch_phaseYZ_z *= bloch_y(2); 55 | if ( pemc(3) == 1 ) { 56 | //PEC 57 | _Y = {{ dims[1]-1 }}; 58 | pc_yz_factor[0] = -1.0; 59 | pc_yz_factor[2] = -1.0; 60 | pc_iy_Ex = -1; 61 | pc_iy_Ey = 0; 62 | pc_iy_Ez = -1; 63 | } 64 | if ( pemc(3) == 2 ) { 65 | //PMC 66 | _Y = {{ dims[1]-1 }}; 67 | pc_yz_factor[1] = -1.0; 68 | pc_iy_Ex = -1; 69 | pc_iy_Ey = 0; 70 | pc_iy_Ez = -1; 71 | } 72 | } 73 | if (_Z == -1) { 74 | _Z = {{ dims[2]-1 }}; 75 | bloch_phaseYZ_x *= conj(bloch_z(0)); 76 | bloch_phaseYZ_y *= conj(bloch_z(1)); 77 | bloch_phaseYZ_z *= conj(bloch_z(2)); 78 | if ( pemc(4) == 1 ) { 79 | _Z = 0; 80 | pc_yz_factor[0] = -1.0; 81 | pc_yz_factor[1] = -1.0; 82 | pc_iz_Ex = 1; 83 | pc_iz_Ey = 1; 84 | pc_iz_Ez = 0; 85 | } else if ( pemc(4) == 2 ) { 86 | _Z = 0; 87 | pc_yz_factor[2] = -1.0; 88 | pc_iz_Ex = 1; 89 | pc_iz_Ey = 1; 90 | pc_iz_Ez = 0; 91 | } 92 | } 93 | if (_Z == {{ dims[2]-1 }}) { 94 | 95 | if ( pemc(5) == 1 ) { 96 | pc_iz_Ez = -1; 97 | } 98 | if ( pemc(5) == 2 ) { 99 | pc_iz_Ez = -1; 100 | pc_yz_factor[2] = -1.0; 101 | } 102 | } 103 | if (_Z == {{ dims[2] }}) { 104 | _Z = 0; 105 | bloch_phaseYZ_x *= bloch_z(0); 106 | bloch_phaseYZ_y *= bloch_z(1); 107 | bloch_phaseYZ_z *= bloch_z(2); 108 | if ( pemc(5) == 1 ) { 109 | _Z = {{ dims[2]-1 }}; 110 | pc_yz_factor[0] = -1.0; 111 | pc_yz_factor[1] = -1.0; 112 | pc_iz_Ex = -1; 113 | pc_iz_Ey = -1; 114 | pc_iz_Ez = 0; 115 | } else if ( pemc(5) == 2 ) { 116 | _Z = {{ dims[2]-1 }}; 117 | pc_yz_factor[2] = 0.0; 118 | pc_iz_Ex = -1; 119 | pc_iz_Ey = -1; 120 | pc_iz_Ez = 0; 121 | } 122 | } 123 | 124 | // Some definitions for shared memory. 125 | // Used to get unpadded thread indices. 
126 | #define s_ty (_ty + 1) 127 | #define s_tz (_tz + 1) 128 | #define s_tyy (_tyy + 2) 129 | #define s_tzz (_tzz + 2) 130 | 131 | // Helper definitions. 132 | #define s_next_field (s_tyy * s_tzz) 133 | #define s_to_local (s_ty * s_tzz + (s_tz)) 134 | #define s_zp +1 135 | #define s_zn -1 136 | #define s_yp +s_tzz 137 | #define s_yn -s_tzz 138 | 139 | {{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 140 | {{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 141 | {{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 142 | {{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 143 | {{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 144 | {{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 145 | 146 | // Local memory. 147 | {{ type }} Ey_p, Ez_p, Hy_n, Hz_n; 148 | {{ type }} bx, by, bz; 149 | {{ type }} px, py, pz, py_p, pz_p; 150 | 151 | int xn, xp; 152 | {{ type }} bloch_phaseX_x = 1; 153 | {{ type }} bloch_phaseX_y = 1; 154 | {{ type }} bloch_phaseX_z = 1; 155 | {{ type }} pc_x_factor[3] = { 1.0, 1.0, 1.0 }; 156 | int pc_ix_Ex = -1; 157 | int pc_ix_Ey = -1; 158 | int pc_ix_Ez = -1; 159 | if (_X == 0) { 160 | bloch_phaseX_x = conj(bloch_x(0)); 161 | bloch_phaseX_y = conj(bloch_x(1)); 162 | bloch_phaseX_z = conj(bloch_x(2)); 163 | if ( pemc(0) == 1 ) { 164 | pc_x_factor[1] = -1.0; 165 | pc_x_factor[2] = -1.0; 166 | pc_ix_Ex = 0; 167 | pc_ix_Ey = 1; 168 | pc_ix_Ez = 1; 169 | xn = 0; 170 | } else if ( pemc(0) == 2 ) { 171 | pc_x_factor[0] = -1.0; 172 | pc_ix_Ex = 0; 173 | pc_ix_Ey = 1; 174 | pc_ix_Ez = 1; 175 | xn = 0; 176 | } else { 177 | pc_ix_Ex = -1; 178 | pc_ix_Ey = -1; 179 | pc_ix_Ez = -1; 180 | xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction. 181 | } 182 | } else { 183 | xn = -1;} 184 | 185 | // Load E-fields into shared memory. 186 | if (adj_dims) { 187 | // Load in p = x 188 | Ex_0[0] = bloch_phaseX_x * bloch_phaseYZ_x * pc_x_factor[0] * pc_yz_factor[0] * 189 | (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z)) * 190 | Xx(pc_ix_Ex, pc_iy_Ex, pc_iz_Ex); 191 | Ey_0[0] = bloch_phaseX_y * bloch_phaseYZ_y * pc_x_factor[1] * pc_yz_factor[1] * 192 | (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z)) * 193 | Xy(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey); 194 | Ez_0[0] = bloch_phaseX_z * bloch_phaseYZ_z * pc_x_factor[2] * pc_yz_factor[2] * 195 | (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z)) * 196 | Xz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez); 197 | 198 | py_p = Xy(0, pc_iy_Ey, pc_iz_Ey); 199 | Ey_p = bloch_phaseYZ_y * pc_yz_factor[1] * (py_p) * 200 | (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 201 | 202 | pz_p = Xz(0, pc_iy_Ez, pc_iz_Ez); 203 | Ez_p = bloch_phaseYZ_z * pc_yz_factor[2] * (pz_p) * 204 | (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 205 | } 206 | __syncthreads(); 207 | 208 | // Calculate H-fields and store in shared_memory. 209 | // Hy. 210 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 211 | Hy_0[0] = my(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey) * 212 | (sx1(_X+xn) * (Ez_0[0] - Ez_p) - sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 213 | } 214 | 215 | // Hz. 
216 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 217 | Hz_0[0] = mz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez) * 218 | (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - sx1(_X+xn) * (Ey_0[0] - Ey_p)); 219 | } 220 | __syncthreads(); 221 | 222 | // reset the pemc factors and ix's 223 | pc_x_factor[0] = 1.0; 224 | pc_x_factor[1] = 1.0; 225 | pc_x_factor[2] = 1.0; 226 | pc_ix_Ex = 0; 227 | pc_ix_Ey = 1; 228 | pc_ix_Ez = 1; 229 | // start loop in x direction 230 | for (; _X < _x_end ; _X += _txx) { 231 | // We've moved ahead in X, so transfer appropriate field values. 232 | Ey_0[0] = Ey_p; 233 | Ez_0[0] = Ez_p; 234 | Hy_n = Hy_0[0]; 235 | Hz_n = Hz_0[0]; 236 | 237 | py = py_p; 238 | pz = pz_p; 239 | 240 | // Load E-fields into shared memory. 241 | if (_X == {{ dims[0]-1 }}){ 242 | if ( pemc(1) == 1 ) { 243 | // PEC 244 | pc_x_factor[1] = -1.0; 245 | pc_x_factor[2] = -1.0; 246 | pc_ix_Ex = -1; 247 | pc_ix_Ey = -1; 248 | pc_ix_Ez = -1; 249 | xp = 0; 250 | } else if ( pemc(1) == 2 ) { 251 | // PMC 252 | pc_x_factor[0] = -1.0; 253 | pc_ix_Ex = -1; 254 | pc_ix_Ey = -1; 255 | pc_ix_Ez = -1; 256 | xp = 0; 257 | } else { 258 | // bloch 259 | bloch_phaseX_x = bloch_x(0); 260 | bloch_phaseX_y = bloch_x(1); 261 | bloch_phaseX_z = bloch_x(2); 262 | xp = {{ -(dims[0]-1) }}; // Wrap-around step in the negative direction. 263 | } 264 | } else { 265 | xp = +1; 266 | bloch_phaseX_x = 1; 267 | bloch_phaseX_y = 1; 268 | bloch_phaseX_z = 1; 269 | } 270 | 271 | if (adj_dims) { 272 | px = Xx(pc_ix_Ex,pc_iy_Ex,pc_iz_Ex); 273 | Ex_0[0] = bloch_phaseYZ_x * pc_x_factor[0] * pc_yz_factor[0] * 274 | (px) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 275 | 276 | py_p = Xy(pc_ix_Ey,pc_iy_Ey,pc_iz_Ey); 277 | Ey_p = bloch_phaseX_y * bloch_phaseYZ_y * pc_x_factor[1] * pc_yz_factor[1] * 278 | (py_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 279 | 280 | pz_p = Xz(pc_ix_Ez,pc_iy_Ez,pc_iz_Ez); 281 | Ez_p = bloch_phaseX_z * bloch_phaseYZ_z * pc_x_factor[2] * pc_yz_factor[2] * 282 | (pz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 283 | } 284 | 285 | __syncthreads(); 286 | 287 | // Calculate H-fields and store in shared_memory. 288 | {% if mu_equals_1 == True %} 289 | // Hx. 290 | if ((_ty != _tyy) && (_tz != _tzz)) { 291 | Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 292 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 293 | } 294 | 295 | // Hy. 296 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 297 | Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) - 298 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 299 | } 300 | 301 | // Hz. 302 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 303 | Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 304 | sx1(_X) * (Ey_0[0] - Ey_p)); 305 | } 306 | {% else %} 307 | // Hx. 308 | if ((_ty != _tyy) && (_tz != _tzz)) { 309 | Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 310 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 311 | } 312 | 313 | // Hy. 314 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 315 | Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) - 316 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 317 | } 318 | 319 | // Hz. 320 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 321 | Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 322 | sx1(_X) * (Ey_0[0] - Ey_p)); 323 | } 324 | {% endif %} 325 | __syncthreads(); 326 | 327 | // Write out the results. 
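In the full-operator write-out that follows, the `sqrt_s*` weights are applied once when the fields are loaded and once, inverted, when the result is stored. Read as linear algebra this is a diagonal similarity transform, the usual trick for keeping the discretized FDFD system complex-symmetric. Schematically (a sketch; `apply_A` stands in for the raw curl-curl-minus-epsilon operator):

    import numpy as np

    def apply_symmetrized(apply_A, w, x):
        # Load: weight by sqrt(s); apply the raw operator; store: un-weight.
        # Equivalent to W^-1 A W with W = diag(w).
        return apply_A(w * x) / w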
328 | if (_in_global && _in_local) { 329 | {% if full_operator %} 330 | bx = ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) * 331 | (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn]) 332 | - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn]) 333 | - ex(0,0,0) * Ex_0[0])); 334 | by = ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) * 335 | (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn]) 336 | - sx0(_X) * (Hz_0[0] - Hz_n) 337 | - ey(0,0,0) * Ey_0[0])); 338 | bz = ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) * 339 | (sx0(_X) * (Hy_0[0] - Hy_n) 340 | - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn]) 341 | - ez(0,0,0) * Ez_0[0])); 342 | 343 | Bx(0,0,0) = bx; 344 | By(0,0,0) = by; 345 | Bz(0,0,0) = bz; 346 | 347 | {% else %} 348 | Bx(0,0,0) = Hx_0[0]; 349 | By(0,0,0) = Hy_0[0]; 350 | Bz(0,0,0) = Hz_0[0]; 351 | 352 | {% endif %} 353 | } 354 | __syncthreads(); 355 | } 356 | -------------------------------------------------------------------------------- /maxwell-solver/kernels/fdfd_residual_pec_pmc.cu: -------------------------------------------------------------------------------- 1 | // Mark the threads that need to load from global memory. 2 | const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \ 3 | ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \ 4 | ((_Z >= -1) && (_Z <= {{ dims[2] }}))); 5 | 6 | // Set relevant field pointers to create wrap-around periodic grid. 7 | {{ type }} bloch_phaseYZ_x = 1.0; 8 | {{ type }} bloch_phaseYZ_y = 1.0; 9 | {{ type }} bloch_phaseYZ_z = 1.0; 10 | 11 | {{ type }} pc_yz_factor[3] = { 1.0, 1.0, 1.0 }; 12 | int pc_iy_Ex = 0; 13 | int pc_iz_Ex = 0; 14 | int pc_iy_Ey = 0; 15 | int pc_iz_Ey = 0; 16 | int pc_iy_Ez = 0; 17 | int pc_iz_Ez = 0; 18 | 19 | if (_Y == -1) { 20 | _Y = {{ dims[1]-1 }}; 21 | bloch_phaseYZ_x *= conj(bloch_y(0)); 22 | bloch_phaseYZ_y *= conj(bloch_y(1)); 23 | bloch_phaseYZ_z *= conj(bloch_y(2)); 24 | if ( pemc(2) == 1 ) { 25 | //PEC (anti-symmetric) 26 | _Y = 0; 27 | pc_yz_factor[0] = -1.0; 28 | pc_yz_factor[2] = -1.0; 29 | pc_iy_Ex = 1; 30 | pc_iy_Ey = 0; 31 | pc_iy_Ez = 1; 32 | } else if ( pemc(2) == 2 ) { 33 | //PMC (symmetric) 34 | _Y = 0; 35 | pc_yz_factor[1] = -1.0; 36 | pc_iy_Ex = 1; 37 | pc_iy_Ey = 0; 38 | pc_iy_Ez = 1; 39 | } 40 | } 41 | if (_Y == {{ dims[1]-1 }}) { 42 | if ( pemc(3) == 1 ) { 43 | pc_iy_Ey = -1; 44 | } 45 | if ( pemc(3) == 2 ) { 46 | pc_iy_Ey = -1; 47 | pc_yz_factor[1] = -1.0; 48 | } 49 | } 50 | if (_Y == {{ dims[1] }}) { 51 | _Y = 0; 52 | bloch_phaseYZ_x *= bloch_y(0); 53 | bloch_phaseYZ_y *= bloch_y(1); 54 | bloch_phaseYZ_z *= bloch_y(2); 55 | if ( pemc(3) == 1 ) { 56 | //PEC 57 | _Y = {{ dims[1]-1 }}; 58 | pc_yz_factor[0] = -1.0; 59 | pc_yz_factor[2] = -1.0; 60 | pc_iy_Ex = -1; 61 | pc_iy_Ey = 0; 62 | pc_iy_Ez = -1; 63 | } 64 | if ( pemc(3) == 2 ) { 65 | //PMC 66 | _Y = {{ dims[1]-1 }}; 67 | pc_yz_factor[1] = -1.0; //this value does not matter 68 | pc_iy_Ex = -1; 69 | pc_iy_Ey = 0; 70 | pc_iy_Ez = -1; 71 | } 72 | } 73 | if (_Z == -1) { 74 | _Z = {{ dims[2]-1 }}; 75 | bloch_phaseYZ_x *= conj(bloch_z(0)); 76 | bloch_phaseYZ_y *= conj(bloch_z(1)); 77 | bloch_phaseYZ_z *= conj(bloch_z(2)); 78 | if ( pemc(4) == 1 ) { 79 | //PEC (anti-symmetric) 80 | _Z = 0; 81 | pc_yz_factor[0] = -1.0; 82 | pc_yz_factor[1] = -1.0; 83 | pc_iz_Ex = 1; 84 | pc_iz_Ey = 1; 85 | pc_iz_Ez = 0; 86 | } else if ( pemc(4) == 2 ) { 87 | //PMC (symmetric) 88 | _Z = 0; 89 | pc_yz_factor[2] = -1.0; 90 | pc_iz_Ex = 1; 91 | pc_iz_Ey = 1; 92 | pc_iz_Ez = 0; 93 | } 94 | } 95 | if (_Z == {{ dims[2]-1 }}) { 96 | 97 | if ( pemc(5) == 1 ) { 98 | pc_iz_Ez = -1; 99 | } 100 | if ( pemc(5) == 2 ) { 
101 | pc_iz_Ez = -1; 102 | pc_yz_factor[2] = -1.0; 103 | } 104 | } 105 | if (_Z == {{ dims[2] }}) { 106 | _Z = 0; 107 | bloch_phaseYZ_x *= bloch_z(0); 108 | bloch_phaseYZ_y *= bloch_z(1); 109 | bloch_phaseYZ_z *= bloch_z(2); 110 | if ( pemc(5) == 1 ) { 111 | //PEC 112 | _Z = {{ dims[2]-1 }}; 113 | pc_yz_factor[0] = -1.0; 114 | pc_yz_factor[1] = -1.0; 115 | pc_iz_Ex = -1; 116 | pc_iz_Ey = -1; 117 | pc_iz_Ez = 0; 118 | } else if ( pemc(5) == 2 ) { 119 | //PMC 120 | _Z = {{ dims[2]-1 }}; 121 | pc_yz_factor[2] = 0.0; //this value does not matter 122 | pc_iz_Ex = -1; 123 | pc_iz_Ey = -1; 124 | pc_iz_Ez = 0; 125 | } 126 | } 127 | 128 | // Some definitions for shared memory. 129 | // Used to get unpadded thread indices. 130 | #define s_ty (_ty + 1) 131 | #define s_tz (_tz + 1) 132 | #define s_tyy (_tyy + 2) 133 | #define s_tzz (_tzz + 2) 134 | 135 | // Helper definitions. 136 | #define s_next_field (s_tyy * s_tzz) 137 | #define s_to_local (s_ty * s_tzz + (s_tz)) 138 | #define s_zp +1 139 | #define s_zn -1 140 | #define s_yp +s_tzz 141 | #define s_yn -s_tzz 142 | 143 | {{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 144 | {{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 145 | {{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 146 | {{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 147 | {{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 148 | {{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 149 | 150 | // Local memory. 151 | {{ type }} Ey_p, Ez_p, Hy_n, Hz_n; 152 | {{ type }} rx, ry, rz; 153 | {{ type }} px, py, pz, py_p, pz_p; 154 | 155 | int xn, xp; 156 | {{ type }} bloch_phaseX_x = 1; 157 | {{ type }} bloch_phaseX_y = 1; 158 | {{ type }} bloch_phaseX_z = 1; 159 | {{ type }} pc_x_factor[3] = { 1.0, 1.0, 1.0 }; 160 | int pc_ix_Ex = -1; 161 | int pc_ix_Ey = -1; 162 | int pc_ix_Ez = -1; 163 | if (_X == 0) { 164 | bloch_phaseX_x = conj(bloch_x(0)); 165 | bloch_phaseX_y = conj(bloch_x(1)); 166 | bloch_phaseX_z = conj(bloch_x(2)); 167 | if ( pemc(0) == 1 ) { 168 | pc_x_factor[1] = -1.0; 169 | pc_x_factor[2] = -1.0; 170 | pc_ix_Ex = 0; 171 | pc_ix_Ey = 1; 172 | pc_ix_Ez = 1; 173 | xn = 0; 174 | } else if ( pemc(0) == 2 ) { 175 | pc_x_factor[0] = -1.0; 176 | pc_ix_Ex = 0; 177 | pc_ix_Ey = 1; 178 | pc_ix_Ez = 1; 179 | xn = 0; 180 | } else { 181 | pc_ix_Ex = -1; 182 | pc_ix_Ey = -1; 183 | pc_ix_Ez = -1; 184 | xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction. 185 | } 186 | } else { 187 | xn = -1;} 188 | 189 | // Load E-fields into shared memory. 
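The loads that follow feed the residual this kernel ultimately writes, r = b - A x, evaluated with the same symmetrized operator; a host-side sketch of the quantity and of the relative norm a solver would typically monitor:

    import numpy as np

    def residual(apply_A, b, x):
        r = b - apply_A(x)
        return r, np.linalg.norm(r) / np.linalg.norm(b)   # relative residual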
190 | if (adj_dims) { 191 | // Load in p = x 192 | Ex_0[0] = bloch_phaseX_x * bloch_phaseYZ_x * pc_x_factor[0] * pc_yz_factor[0] * 193 | (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z)) * 194 | Xx(pc_ix_Ex, pc_iy_Ex, pc_iz_Ex); 195 | Ey_0[0] = bloch_phaseX_y * bloch_phaseYZ_y * pc_x_factor[1] * pc_yz_factor[1] * 196 | (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z)) * 197 | Xy(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey); 198 | Ez_0[0] = bloch_phaseX_z * bloch_phaseYZ_z * pc_x_factor[2] * pc_yz_factor[2] * 199 | (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z)) * 200 | Xz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez); 201 | 202 | py_p = Xy(0, pc_iy_Ey, pc_iz_Ey); 203 | Ey_p = bloch_phaseYZ_y * pc_yz_factor[1] * (py_p) * 204 | (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 205 | 206 | pz_p = Xz(0, pc_iy_Ez, pc_iz_Ez); 207 | Ez_p = bloch_phaseYZ_z * pc_yz_factor[2] * (pz_p) * 208 | (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 209 | } 210 | __syncthreads(); 211 | 212 | // Calculate H-fields and store in shared_memory. 213 | // Hy. 214 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 215 | Hy_0[0] = my(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey) * 216 | (sx1(_X+xn) * (Ez_0[0] - Ez_p) - sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 217 | } 218 | 219 | // Hz. 220 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 221 | Hz_0[0] = mz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez) * 222 | (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - sx1(_X+xn) * (Ey_0[0] - Ey_p)); 223 | } 224 | __syncthreads(); 225 | 226 | // reset the pemc factors and ix's 227 | pc_x_factor[0] = 1.0; 228 | pc_x_factor[1] = 1.0; 229 | pc_x_factor[2] = 1.0; 230 | pc_ix_Ex = 0; 231 | pc_ix_Ey = 1; 232 | pc_ix_Ez = 1; 233 | // start loop in x direction 234 | for (; _X < _x_end ; _X += _txx) { 235 | // We've moved ahead in X, so transfer appropriate field values. 236 | Ey_0[0] = Ey_p; 237 | Ez_0[0] = Ez_p; 238 | Hy_n = Hy_0[0]; 239 | Hz_n = Hz_0[0]; 240 | 241 | py = py_p; 242 | pz = pz_p; 243 | 244 | // Load E-fields into shared memory. 245 | if (_X == {{ dims[0]-1 }}){ 246 | if ( pemc(1) == 1 ) { 247 | // PEC 248 | pc_x_factor[1] = -1.0; 249 | pc_x_factor[2] = -1.0; 250 | pc_ix_Ex = -1; 251 | pc_ix_Ey = -1; 252 | pc_ix_Ez = -1; 253 | xp = 0; 254 | } else if ( pemc(1) == 2 ) { 255 | // PMC 256 | pc_x_factor[0] = -1.0; 257 | pc_ix_Ex = -1; 258 | pc_ix_Ey = -1; 259 | pc_ix_Ez = -1; 260 | xp = 0; 261 | } else { 262 | // bloch 263 | bloch_phaseX_x = bloch_x(0); 264 | bloch_phaseX_y = bloch_x(1); 265 | bloch_phaseX_z = bloch_x(2); 266 | xp = {{ -(dims[0]-1) }}; // Wrap-around step in the negative direction. 267 | } 268 | } else { 269 | xp = +1; 270 | bloch_phaseX_x = 1; 271 | bloch_phaseX_y = 1; 272 | bloch_phaseX_z = 1; 273 | } 274 | 275 | if (adj_dims) { 276 | px = Xx(pc_ix_Ex,pc_iy_Ex,pc_iz_Ex); 277 | Ex_0[0] = bloch_phaseYZ_x * pc_x_factor[0] * pc_yz_factor[0] * 278 | (px) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 279 | 280 | py_p = Xy(pc_ix_Ey,pc_iy_Ey,pc_iz_Ey); 281 | Ey_p = bloch_phaseX_y * bloch_phaseYZ_y * pc_x_factor[1] * pc_yz_factor[1] * 282 | (py_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 283 | 284 | pz_p = Xz(pc_ix_Ez,pc_iy_Ez,pc_iz_Ez); 285 | Ez_p = bloch_phaseX_z * bloch_phaseYZ_z * pc_x_factor[2] * pc_yz_factor[2] * 286 | (pz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 287 | } 288 | 289 | __syncthreads(); 290 | 291 | // Calculate H-fields and store in shared_memory. 292 | {% if mu_equals_1 == True %} 293 | // Hx. 
294 | if ((_ty != _tyy) && (_tz != _tzz)) { 295 | Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 296 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 297 | } 298 | 299 | // Hy. 300 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 301 | Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) - 302 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 303 | } 304 | 305 | // Hz. 306 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 307 | Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 308 | sx1(_X) * (Ey_0[0] - Ey_p)); 309 | } 310 | {% else %} 311 | // Hx. 312 | if ((_ty != _tyy) && (_tz != _tzz)) { 313 | Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 314 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 315 | } 316 | 317 | // Hy. 318 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 319 | Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) - 320 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 321 | } 322 | 323 | // Hz. 324 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 325 | Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 326 | sx1(_X) * (Ey_0[0] - Ey_p)); 327 | } 328 | {% endif %} 329 | __syncthreads(); 330 | 331 | // Write out the results. 332 | if (_in_global && _in_local) { 333 | {% if full_operator %} 334 | rx = Bx(0,0,0) - ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) * 335 | (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn]) 336 | - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn]) 337 | - ex(0,0,0) * Ex_0[0])); 338 | ry = By(0,0,0) - ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) * 339 | (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn]) 340 | - sx0(_X) * (Hz_0[0] - Hz_n) 341 | - ey(0,0,0) * Ey_0[0])); 342 | rz = Bz(0,0,0) - ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) * 343 | (sx0(_X) * (Hy_0[0] - Hy_n) 344 | - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn]) 345 | - ez(0,0,0) * Ez_0[0])); 346 | 347 | Rx(0,0,0) = rx; 348 | Ry(0,0,0) = ry; 349 | Rz(0,0,0) = rz; 350 | 351 | {% else %} 352 | Rx(0,0,0) = Hx_0[0]; 353 | Ry(0,0,0) = Hy_0[0]; 354 | Rz(0,0,0) = Hz_0[0]; 355 | 356 | {% endif %} 357 | } 358 | __syncthreads(); 359 | } 360 | -------------------------------------------------------------------------------- /maxwell-solver/kernels/omega_bloch_pmc_pec.cu: -------------------------------------------------------------------------------- 1 | // Mark the threads that need to load from global memory. 2 | const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \ 3 | ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \ 4 | ((_Z >= -1) && (_Z <= {{ dims[2] }}))); 5 | 6 | // Set relevant field pointers to create wrap-around periodic grid. 
7 | {{ type }} bloch_phaseYZ_x = 1.0; 8 | {{ type }} bloch_phaseYZ_y = 1.0; 9 | {{ type }} bloch_phaseYZ_z = 1.0; 10 | 11 | {{ type }} pc_yz_factor[3] = { 1.0, 1.0, 1.0 }; 12 | int pc_iy_Ex = 0; 13 | int pc_iz_Ex = 0; 14 | int pc_iy_Ey = 0; 15 | int pc_iz_Ey = 0; 16 | int pc_iy_Ez = 0; 17 | int pc_iz_Ez = 0; 18 | 19 | if (_Y == -1) { 20 | _Y = {{ dims[1]-1 }}; 21 | bloch_phaseYZ_x *= bloch_y(0); 22 | bloch_phaseYZ_y *= bloch_y(1); 23 | bloch_phaseYZ_z *= bloch_y(2); 24 | if ( pemc(2) == 1 ) { 25 | //PEC (anti-symmetric) 26 | _Y = 0; 27 | pc_yz_factor[0] = -1.0; 28 | pc_yz_factor[2] = -1.0; 29 | pc_iy_Ex = 1; 30 | pc_iy_Ey = 0; 31 | pc_iy_Ez = 1; 32 | } else if ( pemc(2) == 2 ) { 33 | //PMC (symmetric) 34 | _Y = 0; 35 | pc_yz_factor[1] = -1.0; 36 | pc_iy_Ex = 1; 37 | pc_iy_Ey = 0; 38 | pc_iy_Ez = 1; 39 | } 40 | } 41 | if (_Y == {{ dims[1]-1 }}) { 42 | if ( pemc(3) == 1 ) { 43 | pc_iy_Ey = -1; 44 | } 45 | if ( pemc(3) == 2 ) { 46 | pc_iy_Ey = -1; 47 | pc_yz_factor[1] = -1.0; 48 | } 49 | } 50 | if (_Y == {{ dims[1] }}) { 51 | _Y = 0; 52 | bloch_phaseYZ_x *= conj(bloch_y(0)); 53 | bloch_phaseYZ_y *= conj(bloch_y(1)); 54 | bloch_phaseYZ_z *= conj(bloch_y(2)); 55 | if ( pemc(3) == 1 ) { 56 | //PEC 57 | _Y = {{ dims[1]-1 }}; 58 | pc_yz_factor[0] = -1.0; 59 | pc_yz_factor[2] = -1.0; 60 | pc_iy_Ex = -1; 61 | pc_iy_Ey = 0; 62 | pc_iy_Ez = -1; 63 | } 64 | if ( pemc(3) == 2 ) { 65 | //PMC 66 | _Y = {{ dims[1]-1 }}; 67 | pc_yz_factor[1] = -1.0; 68 | pc_iy_Ex = -1; 69 | pc_iy_Ey = 0; 70 | pc_iy_Ez = -1; 71 | } 72 | } 73 | if (_Z == -1) { 74 | _Z = {{ dims[2]-1 }}; 75 | bloch_phaseYZ_x *= bloch_z(0); 76 | bloch_phaseYZ_y *= bloch_z(1); 77 | bloch_phaseYZ_z *= bloch_z(2); 78 | if ( pemc(4) == 1 ) { 79 | //PEC (anti-symmetric) 80 | _Z = 0; 81 | pc_yz_factor[0] = -1.0; 82 | pc_yz_factor[1] = -1.0; 83 | pc_iz_Ex = 1; 84 | pc_iz_Ey = 1; 85 | pc_iz_Ez = 0; 86 | } else if ( pemc(4) == 2 ) { 87 | //PMC (symmetric) 88 | _Z = 0; 89 | pc_yz_factor[2] = -1.0; 90 | pc_iz_Ex = 1; 91 | pc_iz_Ey = 1; 92 | pc_iz_Ez = 0; 93 | } 94 | } 95 | if (_Z == {{ dims[2]-1 }}) { 96 | if ( pemc(5) == 1 ) { 97 | pc_iz_Ez = -1; 98 | } 99 | if ( pemc(5) == 2 ) { 100 | pc_iz_Ez = -1; 101 | pc_yz_factor[2] = -1.0; 102 | } 103 | } 104 | if (_Z == {{ dims[2] }}) { 105 | _Z = 0; 106 | bloch_phaseYZ_x *= conj(bloch_z(0)); 107 | bloch_phaseYZ_y *= conj(bloch_z(1)); 108 | bloch_phaseYZ_z *= conj(bloch_z(2)); 109 | if ( pemc(5) == 1 ) { 110 | //PEC 111 | _Z = {{ dims[2]-1 }}; 112 | pc_yz_factor[0] = -1.0; 113 | pc_yz_factor[1] = -1.0; 114 | pc_iz_Ex = -1; 115 | pc_iz_Ey = -1; 116 | pc_iz_Ez = 0; 117 | } else if ( pemc(5) == 2 ) { 118 | //PMC 119 | _Z = {{ dims[2]-1 }}; 120 | pc_yz_factor[2] = 0.0; //this value does not matter 121 | pc_iz_Ex = -1; 122 | pc_iz_Ey = -1; 123 | pc_iz_Ez = 0; 124 | } 125 | } 126 | 127 | // Some definitions for shared memory. 128 | // Used to get unpadded thread indices. 129 | #define s_ty (_ty + 1) 130 | #define s_tz (_tz + 1) 131 | #define s_tyy (_tyy + 2) 132 | #define s_tzz (_tzz + 2) 133 | 134 | // Helper definitions. 
135 | #define s_next_field (s_tyy * s_tzz) 136 | #define s_to_local (s_ty * s_tzz + (s_tz)) 137 | #define s_zp +1 138 | #define s_zn -1 139 | #define s_yp +s_tzz 140 | #define s_yn -s_tzz 141 | 142 | {{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 143 | {{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 144 | {{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 145 | {{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 146 | {{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 147 | {{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 148 | 149 | // Local memory. 150 | {{ type }} Ey_p, Ez_p, Hy_n, Hz_n; 151 | {{ type }} tx, ty, tz; 152 | {{ type }} sx, sy, sz, sy_p, sz_p; 153 | 154 | int xn, xp; 155 | {{ type }} bloch_phaseX_x = 1; 156 | {{ type }} bloch_phaseX_y = 1; 157 | {{ type }} bloch_phaseX_z = 1; 158 | {{ type }} pc_x_factor[3] = { 1.0, 1.0, 1.0 }; 159 | int pc_ix_Ex = -1; 160 | int pc_ix_Ey = -1; 161 | int pc_ix_Ez = -1; 162 | if (_X == 0) { 163 | bloch_phaseX_x = bloch_x(0); 164 | bloch_phaseX_y = bloch_x(1); 165 | bloch_phaseX_z = bloch_x(2); 166 | if ( pemc(0) == 1 ) { 167 | pc_x_factor[1] = -1.0; 168 | pc_x_factor[2] = -1.0; 169 | pc_ix_Ex = 0; 170 | pc_ix_Ey = 1; 171 | pc_ix_Ez = 1; 172 | xn = 0; 173 | } else if ( pemc(0) == 2 ) { 174 | pc_x_factor[0] = -1.0; 175 | pc_ix_Ex = 0; 176 | pc_ix_Ey = 1; 177 | pc_ix_Ez = 1; 178 | xn = 0; 179 | } else { 180 | pc_ix_Ex = -1; 181 | pc_ix_Ey = -1; 182 | pc_ix_Ez = -1; 183 | xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction. 184 | } 185 | } else { 186 | xn = -1;} 187 | 188 | // Load E-fields into shared memory. 189 | if (adj_dims) { 190 | // Load in s = r - alpha * v. 191 | Ex_0[0] = bloch_phaseX_x * bloch_phaseYZ_x * pc_x_factor[0] * pc_yz_factor[0] * 192 | (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z)) * 193 | ( Rx(pc_ix_Ex, pc_iy_Ex, pc_iz_Ex) 194 | - alpha * Vx(pc_ix_Ex, pc_iy_Ex, pc_iz_Ex)); 195 | Ey_0[0] = bloch_phaseX_y * bloch_phaseYZ_y * pc_x_factor[1] * pc_yz_factor[1] * 196 | (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z)) * 197 | ( Ry(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey) 198 | - alpha * Vy(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey)); 199 | Ez_0[0] = bloch_phaseX_z * bloch_phaseYZ_z * pc_x_factor[2] * pc_yz_factor[2] * 200 | (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z)) * 201 | ( Rz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez) 202 | - alpha * Vz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez)); 203 | 204 | 205 | // Ey_p = Ry(0,0,0) - alpha * Vy(0,0,0); 206 | sy_p = Ry(0, pc_iy_Ey, pc_iz_Ey) 207 | - alpha * Vy(0, pc_iy_Ey, pc_iz_Ey); 208 | Ey_p = bloch_phaseYZ_y * pc_yz_factor[1] * (sy_p) * 209 | (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 210 | 211 | // Ez_p = Rz(0,0,0) - alpha * Vz(0,0,0); 212 | sz_p = Rz(0, pc_iy_Ez, pc_iz_Ez) 213 | - alpha * Vz(0, pc_iy_Ez, pc_iz_Ez); 214 | Ez_p = bloch_phaseYZ_z * pc_yz_factor[2] * (sz_p) * 215 | (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 216 | } 217 | __syncthreads(); 218 | 219 | // Calculate H-fields and store in shared_memory. 220 | // Hy. 221 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 222 | Hy_0[0] = my(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey) * 223 | (sx1(_X+xn) * (Ez_0[0] - Ez_p) - sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 224 | } 225 | 226 | // Hz. 
227 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 228 | Hz_0[0] = mz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez) * 229 | (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - sx1(_X+xn) * (Ey_0[0] - Ey_p)); 230 | } 231 | __syncthreads(); 232 | 233 | // reset the pemc factors and ix's 234 | pc_x_factor[0] = 1.0; 235 | pc_x_factor[1] = 1.0; 236 | pc_x_factor[2] = 1.0; 237 | pc_ix_Ex = 0; 238 | pc_ix_Ey = 1; 239 | pc_ix_Ez = 1; 240 | // start loop in x direction 241 | for (; _X < _x_end ; _X += _txx) { 242 | // We've moved ahead in X, so transfer appropriate field values. 243 | Ey_0[0] = Ey_p; 244 | Ez_0[0] = Ez_p; 245 | Hy_n = Hy_0[0]; 246 | Hz_n = Hz_0[0]; 247 | 248 | sy = sy_p; 249 | sz = sz_p; 250 | 251 | // Load E-fields into shared memory. 252 | if (_X == {{ dims[0]-1 }}){ 253 | if ( pemc(1) == 1 ) { 254 | // PEC 255 | pc_x_factor[1] = -1.0; 256 | pc_x_factor[2] = -1.0; 257 | pc_ix_Ex = -1; 258 | pc_ix_Ey = -1; 259 | pc_ix_Ez = -1; 260 | xp = 0; 261 | } else if ( pemc(1) == 2 ) { 262 | // PMC 263 | pc_x_factor[0] = -1.0; 264 | pc_ix_Ex = -1; 265 | pc_ix_Ey = -1; 266 | pc_ix_Ez = -1; 267 | xp = 0; 268 | } else { 269 | // bloch 270 | bloch_phaseX_x = conj(bloch_x(0)); 271 | bloch_phaseX_y = conj(bloch_x(1)); 272 | bloch_phaseX_z = conj(bloch_x(2)); 273 | xp = {{ -(dims[0]-1) }}; // Wrap-around step in the negative direction. 274 | } 275 | } else { 276 | xp = +1; 277 | bloch_phaseX_x = 1; 278 | bloch_phaseX_y = 1; 279 | bloch_phaseX_z = 1; 280 | } 281 | 282 | if (adj_dims) { 283 | sx = Rx(pc_ix_Ex,pc_iy_Ex,pc_iz_Ex) - alpha * Vx(pc_ix_Ex,pc_iy_Ex,pc_iz_Ex); 284 | Ex_0[0] = bloch_phaseYZ_x * pc_x_factor[0] * pc_yz_factor[0] * 285 | (sx) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z)); 286 | 287 | sy_p = Ry(pc_ix_Ey,pc_iy_Ey,pc_iz_Ey) - alpha * Vy(pc_ix_Ey,pc_iy_Ey,pc_iz_Ey); 288 | Ey_p = bloch_phaseX_y * bloch_phaseYZ_y * pc_x_factor[1] * pc_yz_factor[1] * 289 | (sy_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z)); 290 | 291 | sz_p = Rz(pc_ix_Ez,pc_iy_Ez,pc_iz_Ez) - alpha * Vz(pc_ix_Ez,pc_iy_Ez,pc_iz_Ez); 292 | Ez_p = bloch_phaseX_z * bloch_phaseYZ_z * pc_x_factor[2] * pc_yz_factor[2] * 293 | (sz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z)); 294 | } 295 | 296 | __syncthreads(); 297 | 298 | // Calculate H-fields and store in shared_memory. 299 | {% if mu_equals_1 == True %} 300 | // Hx. 301 | if ((_ty != _tyy) && (_tz != _tzz)) { 302 | Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 303 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 304 | } 305 | 306 | // Hy. 307 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 308 | Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) - 309 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 310 | } 311 | 312 | // Hz. 313 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 314 | Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 315 | sx1(_X) * (Ey_0[0] - Ey_p)); 316 | } 317 | {% else %} 318 | // Hx. 319 | if ((_ty != _tyy) && (_tz != _tzz)) { 320 | Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) - 321 | sy1(_Y) * (Ez_0[0] - Ez_0[s_yp])); 322 | } 323 | 324 | // Hy. 325 | if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) { 326 | Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) - 327 | sz1(_Z) * (Ex_0[0] - Ex_0[s_zp])); 328 | } 329 | 330 | // Hz. 331 | if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) { 332 | Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - 333 | sx1(_X) * (Ey_0[0] - Ey_p)); 334 | } 335 | {% endif %} 336 | __syncthreads(); 337 | 338 | // Write out the results. 
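// (The omega_num and omega_denom sums below accumulate the local parts of
//  t'*s and t'*t; after the global reduction, the host forms
//  omega = (t'*s) / (t'*t), the least-squares weight of the biCGSTAB
//  stabilization step that minimizes ||s - omega*t||.)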
339 | if (_in_global && _in_local) { 340 | {% if full_operator %} 341 | Sx(0,0,0) = sx; 342 | Sy(0,0,0) = sy; 343 | Sz(0,0,0) = sz; 344 | 345 | tx = ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) * 346 | (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn]) 347 | - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn]) 348 | - ex(0,0,0) * Ex_0[0])); 349 | ty = ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) * 350 | (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn]) 351 | - sx0(_X) * (Hz_0[0] - Hz_n) 352 | - ey(0,0,0) * Ey_0[0])); 353 | tz = ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) * 354 | (sx0(_X) * (Hy_0[0] - Hy_n) 355 | - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn]) 356 | - ez(0,0,0) * Ez_0[0])); 357 | 358 | Tx(0,0,0) = tx; 359 | Ty(0,0,0) = ty; 360 | Tz(0,0,0) = tz; 361 | 362 | omega_num += conj(tx) * sx + conj(ty) * sy + conj(tz) * sz; 363 | omega_denom += (real(tx) * real(tx)) + (imag(tx) * imag(tx)) + 364 | (real(ty) * real(ty)) + (imag(ty) * imag(ty)) + 365 | (real(tz) * real(tz)) + (imag(tz) * imag(tz)); 366 | //omega_num += tx * sx + ty * sy + tz * sz; 367 | //omega_denom += tx * tx + ty * ty + tz * tz; 368 | 369 | {% else %} 370 | Vx(0,0,0) = Hx_0[0]; 371 | Vy(0,0,0) = Hy_0[0]; 372 | Vz(0,0,0) = Hz_0[0]; 373 | 374 | {% endif %} 375 | } 376 | __syncthreads(); 377 | } 378 | -------------------------------------------------------------------------------- /maxwell-solver/kernels/alpha_bloch_pmc_pec.cu: -------------------------------------------------------------------------------- 1 | // Mark the threads that need to load from global memory. 2 | const bool adj_dims = (((_X >= -1) && (_X <= {{ dims[0] }})) && \ 3 | ((_Y >= -1) && (_Y <= {{ dims[1] }})) && \ 4 | ((_Z >= -1) && (_Z <= {{ dims[2] }}))); 5 | 6 | // Set relevant field pointers to create wrap-around periodic grid. 7 | {{ type }} bloch_phaseYZ_x = 1.0; 8 | {{ type }} bloch_phaseYZ_y = 1.0; 9 | {{ type }} bloch_phaseYZ_z = 1.0; 10 | 11 | {{ type }} pc_yz_factor[3] = { 1.0, 1.0, 1.0 }; 12 | int pc_iy_Ex = 0; 13 | int pc_iz_Ex = 0; 14 | int pc_iy_Ey = 0; 15 | int pc_iz_Ey = 0; 16 | int pc_iy_Ez = 0; 17 | int pc_iz_Ez = 0; 18 | 19 | if (_Y == -1) { 20 | _Y = {{ dims[1]-1 }}; 21 | bloch_phaseYZ_x *= bloch_y(0); 22 | bloch_phaseYZ_y *= bloch_y(1); 23 | bloch_phaseYZ_z *= bloch_y(2); 24 | if ( pemc(2) == 1 ) { 25 | //PEC (anti-symmetric) 26 | _Y = 0; 27 | pc_yz_factor[0] = -1.0; 28 | pc_yz_factor[2] = -1.0; 29 | pc_iy_Ex = 1; 30 | pc_iy_Ey = 0; 31 | pc_iy_Ez = 1; 32 | } else if ( pemc(2) == 2 ) { 33 | //PMC (symmetric) 34 | _Y = 0; 35 | pc_yz_factor[1] = -1.0; 36 | pc_iy_Ex = 1; 37 | pc_iy_Ey = 0; 38 | pc_iy_Ez = 1; 39 | } 40 | } 41 | if (_Y == {{ dims[1]-1 }}) { 42 | if ( pemc(3) == 1 ) { 43 | pc_iy_Ey = -1; 44 | } 45 | if ( pemc(3) == 2 ) { 46 | pc_iy_Ey = -1; 47 | pc_yz_factor[1] = -1.0; 48 | } 49 | } 50 | if (_Y == {{ dims[1] }}) { 51 | _Y = 0; 52 | bloch_phaseYZ_x *= conj(bloch_y(0)); 53 | bloch_phaseYZ_y *= conj(bloch_y(1)); 54 | bloch_phaseYZ_z *= conj(bloch_y(2)); 55 | if ( pemc(3) == 1 ) { 56 | //PEC 57 | _Y = {{ dims[1]-1 }}; 58 | pc_yz_factor[0] = -1.0; 59 | pc_yz_factor[2] = -1.0; 60 | pc_iy_Ex = -1; 61 | pc_iy_Ey = 0; 62 | pc_iy_Ez = -1; 63 | } 64 | if ( pemc(3) == 2 ) { 65 | //PMC 66 | _Y = {{ dims[1]-1 }}; 67 | pc_yz_factor[1] = -1.0; 68 | pc_iy_Ex = -1; 69 | pc_iy_Ey = 0; 70 | pc_iy_Ez = -1; 71 | } 72 | } 73 | if (_Z == -1) { 74 | _Z = {{ dims[2]-1 }}; 75 | bloch_phaseYZ_x *= bloch_z(0); 76 | bloch_phaseYZ_y *= bloch_z(1); 77 | bloch_phaseYZ_z *= bloch_z(2); 78 | if ( pemc(4) == 1 ) { 79 | //PEC (anti-symmetric) 80 | _Z = 0; 81 | pc_yz_factor[0] = -1.0; 82 | 
pc_yz_factor[1] = -1.0; 83 | pc_iz_Ex = 1; 84 | pc_iz_Ey = 1; 85 | pc_iz_Ez = 0; 86 | } else if ( pemc(4) == 2 ) { 87 | //PMC (symmetric) 88 | _Z = 0; 89 | pc_yz_factor[2] = -1.0; 90 | pc_iz_Ex = 1; 91 | pc_iz_Ey = 1; 92 | pc_iz_Ez = 0; 93 | } 94 | } 95 | if (_Z == {{ dims[2]-1 }}) { 96 | if ( pemc(5) == 1 ) { 97 | pc_iz_Ez = -1; 98 | } 99 | if ( pemc(5) == 2 ) { 100 | pc_iz_Ez = -1; 101 | pc_yz_factor[2] = -1.0; 102 | } 103 | } 104 | if (_Z == {{ dims[2] }}) { 105 | _Z = 0; 106 | bloch_phaseYZ_x *= conj(bloch_z(0)); 107 | bloch_phaseYZ_y *= conj(bloch_z(1)); 108 | bloch_phaseYZ_z *= conj(bloch_z(2)); 109 | if ( pemc(5) == 1 ) { 110 | //PEC 111 | _Z = {{ dims[2]-1 }}; 112 | pc_yz_factor[0] = -1.0; 113 | pc_yz_factor[1] = -1.0; 114 | pc_iz_Ex = -1; 115 | pc_iz_Ey = -1; 116 | pc_iz_Ez = 0; 117 | } else if ( pemc(5) == 2 ) { 118 | //PMC 119 | _Z = {{ dims[2]-1 }}; 120 | pc_yz_factor[2] = 0.0; //this value does not matter 121 | pc_iz_Ex = -1; 122 | pc_iz_Ey = -1; 123 | pc_iz_Ez = 0; 124 | } 125 | } 126 | 127 | // Some definitions for shared memory. 128 | // Used to get unpadded thread indices. 129 | #define s_ty (_ty + 1) 130 | #define s_tz (_tz + 1) 131 | #define s_tyy (_tyy + 2) 132 | #define s_tzz (_tzz + 2) 133 | 134 | // Helper definitions. 135 | #define s_next_field (s_tyy * s_tzz) 136 | #define s_to_local (s_ty * s_tzz + (s_tz)) 137 | #define s_zp +1 138 | #define s_zn -1 139 | #define s_yp +s_tzz 140 | #define s_yn -s_tzz 141 | 142 | {{ type }} *Ex_0 = (0 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 143 | {{ type }} *Ey_0 = (1 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 144 | {{ type }} *Ez_0 = (2 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 145 | {{ type }} *Hx_0 = (3 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 146 | {{ type }} *Hy_0 = (4 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 147 | {{ type }} *Hz_0 = (5 * s_next_field) + (({{ type }}*) _gce_smem) + s_to_local; 148 | 149 | // Local memory. 150 | {{ type }} Ey_p, Ez_p, Hy_n, Hz_n; 151 | {{ type }} vx, vy, vz; 152 | {{ type }} px, py, pz, py_p, pz_p; 153 | 154 | int xn, xp; 155 | {{ type }} bloch_phaseX_x = 1; 156 | {{ type }} bloch_phaseX_y = 1; 157 | {{ type }} bloch_phaseX_z = 1; 158 | {{ type }} pc_x_factor[3] = { 1.0, 1.0, 1.0 }; 159 | int pc_ix_Ex = -1; 160 | int pc_ix_Ey = -1; 161 | int pc_ix_Ez = -1; 162 | if (_X == 0) { 163 | bloch_phaseX_x = bloch_x(0); 164 | bloch_phaseX_y = bloch_x(1); 165 | bloch_phaseX_z = bloch_x(2); 166 | if ( pemc(0) == 1 ) { 167 | pc_x_factor[1] = -1.0; 168 | pc_x_factor[2] = -1.0; 169 | pc_ix_Ex = 0; 170 | pc_ix_Ey = 1; 171 | pc_ix_Ez = 1; 172 | xn = 0; 173 | } else if ( pemc(0) == 2 ) { 174 | pc_x_factor[0] = -1.0; 175 | pc_ix_Ex = 0; 176 | pc_ix_Ey = 1; 177 | pc_ix_Ez = 1; 178 | xn = 0; 179 | } else { 180 | pc_ix_Ex = -1; 181 | pc_ix_Ey = -1; 182 | pc_ix_Ez = -1; 183 | xn = {{ dims[0]-1 }}; // Wrap-around step in the negative direction. 184 | } 185 | } else { 186 | xn = -1;} 187 | 188 | // Load E-fields into shared memory. 189 | if (adj_dims) { 190 | // Load in p = r + beta * p. 
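// (More precisely, the quantity loaded below is p_new = r + beta * (p - omega * v),
//  the biCGSTAB search-direction update, fused here with the sqrt_s* diagonal
//  scaling and the Bloch/PEC/PMC boundary factors.)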
191 |         Ex_0[0] = bloch_phaseX_x * bloch_phaseYZ_x * pc_x_factor[0] * pc_yz_factor[0] *
192 |                   (sqrt_sx1(_X+xn) * sqrt_sy0(_Y) * sqrt_sz0(_Z)) *
193 |                   ( Rx(pc_ix_Ex, pc_iy_Ex, pc_iz_Ex)
194 |                     + beta * ( Px(pc_ix_Ex, pc_iy_Ex, pc_iz_Ex)
195 |                                - omega * Vx(pc_ix_Ex, pc_iy_Ex, pc_iz_Ex)));
196 |         Ey_0[0] = bloch_phaseX_y * bloch_phaseYZ_y * pc_x_factor[1] * pc_yz_factor[1] *
197 |                   (sqrt_sx0(_X+xn) * sqrt_sy1(_Y) * sqrt_sz0(_Z)) *
198 |                   ( Ry(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey)
199 |                     + beta * ( Py(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey)
200 |                                - omega * Vy(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey)));
201 |         Ez_0[0] = bloch_phaseX_z * bloch_phaseYZ_z * pc_x_factor[2] * pc_yz_factor[2] *
202 |                   (sqrt_sx0(_X+xn) * sqrt_sy0(_Y) * sqrt_sz1(_Z)) *
203 |                   ( Rz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez)
204 |                     + beta * ( Pz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez)
205 |                                - omega * Vz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez)));
206 | 
207 |         // Ey_p = Ry(0,0,0) + beta * (Py(0,0,0) - omega * Vy(0,0,0));
208 |         py_p = Ry(0, pc_iy_Ey, pc_iz_Ey)
209 |                + beta * (Py(0, pc_iy_Ey, pc_iz_Ey)
210 |                          - omega * Vy(0, pc_iy_Ey, pc_iz_Ey));
211 |         Ey_p = bloch_phaseYZ_y * pc_yz_factor[1] * (py_p) *
212 |                (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z));
213 | 
214 |         // Ez_p = Rz(0,0,0) + beta * (Pz(0,0,0) - omega * Vz(0,0,0));
215 |         pz_p = Rz(0, pc_iy_Ez, pc_iz_Ez)
216 |                + beta * (Pz(0, pc_iy_Ez, pc_iz_Ez)
217 |                          - omega * Vz(0, pc_iy_Ez, pc_iz_Ez));
218 |         Ez_p = bloch_phaseYZ_z * pc_yz_factor[2] * (pz_p) *
219 |                (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z));
220 |     }
221 |     __syncthreads();
222 | 
223 |     // Calculate H-fields and store in shared_memory.
224 |     // Hy.
225 |     if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
226 |         Hy_0[0] = my(pc_ix_Ey, pc_iy_Ey, pc_iz_Ey) *
227 |                   (sx1(_X+xn) * (Ez_0[0] - Ez_p) - sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
228 |     }
229 | 
230 |     // Hz.
231 |     if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
232 |         Hz_0[0] = mz(pc_ix_Ez, pc_iy_Ez, pc_iz_Ez) *
233 |                   (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) - sx1(_X+xn) * (Ey_0[0] - Ey_p));
234 |     }
235 |     __syncthreads();
236 | 
237 |     // Reset the PEMC factors and the ix indices.
238 |     pc_x_factor[0] = 1.0;
239 |     pc_x_factor[1] = 1.0;
240 |     pc_x_factor[2] = 1.0;
241 |     pc_ix_Ex = 0;
242 |     pc_ix_Ey = 1;
243 |     pc_ix_Ez = 1;
244 |     // Start the loop in the x direction.
245 |     for (; _X < _x_end ; _X += _txx) {
246 |         // We've moved ahead in X, so transfer appropriate field values.
247 |         Ey_0[0] = Ey_p;
248 |         Ez_0[0] = Ez_p;
249 |         Hy_n = Hy_0[0];
250 |         Hz_n = Hz_0[0];
251 | 
252 |         py = py_p;
253 |         pz = pz_p;
254 | 
255 |         // Load E-fields into shared memory.
256 |         if (_X == {{ dims[0]-1 }}){
257 |             if ( pemc(1) == 1 ) {
258 |                 // PEC
259 |                 pc_x_factor[1] = -1.0;
260 |                 pc_x_factor[2] = -1.0;
261 |                 pc_ix_Ex = -1;
262 |                 pc_ix_Ey = -1;
263 |                 pc_ix_Ez = -1;
264 |                 xp = 0;
265 |             } else if ( pemc(1) == 2 ) {
266 |                 // PMC
267 |                 pc_x_factor[0] = -1.0;
268 |                 pc_ix_Ex = -1;
269 |                 pc_ix_Ey = -1;
270 |                 pc_ix_Ez = -1;
271 |                 xp = 0;
272 |             } else {
273 |                 // bloch
274 |                 bloch_phaseX_x = conj(bloch_x(0));
275 |                 bloch_phaseX_y = conj(bloch_x(1));
276 |                 bloch_phaseX_z = conj(bloch_x(2));
277 |                 xp = {{ -(dims[0]-1) }}; // Wrap-around step in the negative direction.
278 |             }
279 |         } else {
280 |             xp = +1;
281 |             bloch_phaseX_x = 1;
282 |             bloch_phaseX_y = 1;
283 |             bloch_phaseX_z = 1;
284 |         }
285 | 
286 |         if (adj_dims) {
287 |             px = Rx(pc_ix_Ex,pc_iy_Ex,pc_iz_Ex) + beta * (
288 |                  Px(pc_ix_Ex,pc_iy_Ex,pc_iz_Ex) - omega * Vx(pc_ix_Ex,pc_iy_Ex,pc_iz_Ex));
289 |             Ex_0[0] = bloch_phaseYZ_x * pc_x_factor[0] * pc_yz_factor[0] *
290 |                       (px) * (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z));
291 | 
292 |             py_p = Ry(pc_ix_Ey,pc_iy_Ey,pc_iz_Ey) + beta * (
293 |                    Py(pc_ix_Ey,pc_iy_Ey,pc_iz_Ey) - omega * Vy(pc_ix_Ey,pc_iy_Ey,pc_iz_Ey));
294 |             Ey_p = bloch_phaseX_y * bloch_phaseYZ_y * pc_x_factor[1] * pc_yz_factor[1] *
295 |                    (py_p) * (sqrt_sx0(_X+xp) * sqrt_sy1(_Y) * sqrt_sz0(_Z));
296 | 
297 |             pz_p = Rz(pc_ix_Ez,pc_iy_Ez,pc_iz_Ez) + beta * (
298 |                    Pz(pc_ix_Ez,pc_iy_Ez,pc_iz_Ez) - omega * Vz(pc_ix_Ez,pc_iy_Ez,pc_iz_Ez));
299 |             Ez_p = bloch_phaseX_z * bloch_phaseYZ_z * pc_x_factor[2] * pc_yz_factor[2] *
300 |                    (pz_p) * (sqrt_sx0(_X+xp) * sqrt_sy0(_Y) * sqrt_sz1(_Z));
301 |         }
302 | 
303 |         __syncthreads();
304 | 
305 |         // Calculate H-fields and store in shared_memory.
306 |         {% if mu_equals_1 == True %}
307 |         // Hx.
308 |         if ((_ty != _tyy) && (_tz != _tzz)) {
309 |             Hx_0[0] = (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) -
310 |                        sy1(_Y) * (Ez_0[0] - Ez_0[s_yp]));
311 |         }
312 | 
313 |         // Hy.
314 |         if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
315 |             Hy_0[0] = (sx1(_X) * (Ez_0[0] - Ez_p) -
316 |                        sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
317 |         }
318 | 
319 |         // Hz.
320 |         if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
321 |             Hz_0[0] = (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) -
322 |                        sx1(_X) * (Ey_0[0] - Ey_p));
323 |         }
324 |         {% else %}
325 |         // Hx.
326 |         if ((_ty != _tyy) && (_tz != _tzz)) {
327 |             Hx_0[0] = mx(0,0,0) * (sz1(_Z) * (Ey_0[0] - Ey_0[s_zp]) -
328 |                                    sy1(_Y) * (Ez_0[0] - Ez_0[s_yp]));
329 |         }
330 | 
331 |         // Hy.
332 |         if ((_ty != -1) && (_ty != _tyy) && (_tz != _tzz)) {
333 |             Hy_0[0] = my(0,0,0) * (sx1(_X) * (Ez_0[0] - Ez_p) -
334 |                                    sz1(_Z) * (Ex_0[0] - Ex_0[s_zp]));
335 |         }
336 | 
337 |         // Hz.
338 |         if ((_ty != _tyy) && (_tz != -1) && (_tz != _tzz)) {
339 |             Hz_0[0] = mz(0,0,0) * (sy1(_Y) * (Ex_0[0] - Ex_0[s_yp]) -
340 |                                    sx1(_X) * (Ey_0[0] - Ey_p));
341 |         }
342 |         {% endif %}
343 |         __syncthreads();
344 | 
345 |         // Write out the results.
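        // (The alpha_denom sum below accumulates the local part of
        //  r_hatH' * v; R_hatH appears to hold the already-conjugated shadow
        //  residual, so a plain product suffices. After the global reduction,
        //  the host forms alpha = rho / (r_hatH' * v) as in biCGSTAB.)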
346 | if (_in_global && _in_local) { 347 | {% if full_operator %} 348 | P1x(0,0,0) = px; 349 | P1y(0,0,0) = py; 350 | P1z(0,0,0) = pz; 351 | 352 | vx = ((1.0 / (sqrt_sx1(_X) * sqrt_sy0(_Y) * sqrt_sz0(_Z))) * 353 | (sy0(_Y) * (Hz_0[0] - Hz_0[s_yn]) 354 | - sz0(_Z) * (Hy_0[0] - Hy_0[s_zn]) 355 | - ex(0,0,0) * Ex_0[0])); 356 | vy = ((1.0 / (sqrt_sx0(_X) * sqrt_sy1(_Y) * sqrt_sz0(_Z))) * 357 | (sz0(_Z) * (Hx_0[0] - Hx_0[s_zn]) 358 | - sx0(_X) * (Hz_0[0] - Hz_n) 359 | - ey(0,0,0) * Ey_0[0])); 360 | vz = ((1.0 / (sqrt_sx0(_X) * sqrt_sy0(_Y) * sqrt_sz1(_Z))) * 361 | (sx0(_X) * (Hy_0[0] - Hy_n) 362 | - sy0(_Y) * (Hx_0[0] - Hx_0[s_yn]) 363 | - ez(0,0,0) * Ez_0[0])); 364 | 365 | V1x(0,0,0) = vx; 366 | V1y(0,0,0) = vy; 367 | V1z(0,0,0) = vz; 368 | 369 | alpha_denom += (R_hatHx(0,0,0) * vx) + (R_hatHy(0,0,0) * vy) + (R_hatHz(0,0,0) * vz); 370 | 371 | {% else %} 372 | V1x(0,0,0) = Hx_0[0]; 373 | V1y(0,0,0) = Hy_0[0]; 374 | V1z(0,0,0) = Hz_0[0]; 375 | 376 | {% endif %} 377 | } 378 | __syncthreads(); 379 | } 380 | -------------------------------------------------------------------------------- /maxwell-solver/fdfd.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import maxwell_ops_lumped 4 | from solvers import bicg 5 | from gce.grid import Grid 6 | from mpi4py.MPI import COMM_WORLD as comm 7 | import time, sys, tempfile, os 8 | 9 | from pycuda import driver 10 | 11 | 12 | def simulate(name, check_success_only=False): 13 | """ Read simulation from input file, simulate, and write out results. """ 14 | print_comm0('starting simulate') 15 | 16 | # Reset the environment variables pointing to the temporary directory. 17 | tempfile.tempdir = '/tmp' 18 | 19 | # Create the reporter function. 20 | write_status = lambda msg: open(name + '.status', 'a').write(msg) 21 | if comm.Get_rank() == 0: 22 | # write_status('EXEC initializing\n') 23 | def rep(err, info: str = None): 24 | if info is None: 25 | write_status('%e\n' % np.abs(err)) 26 | else: 27 | write_status('%s %e\n' % (info, np.abs(err))) 28 | else: # No reporting needed for non-root nodes. 29 | 30 | def rep(err, info: str = None): 31 | pass 32 | 33 | # Get input parameters. 34 | params = get_parameters(name) 35 | solver = params['solver'] 36 | if solver == 'CG': 37 | # Define operations needed for the CG operation.(bicg is CG) 38 | b, x, ops, post_cond, gpu_func = bicg.ops(params) 39 | # Solve! 40 | start_time = time.time() 41 | x, err, success, iters = bicg.solve_symm_lumped(b, x=x, \ 42 | max_iters=params['max_iters'], \ 43 | reporter=rep, \ 44 | err_thresh=params['err_thresh'], \ 45 | gpu_func=gpu_func, \ 46 | **ops) 47 | stop_time = time.time() 48 | 49 | elif solver == 'biCGSTAB': 50 | # Define operations needed for the biCGSTAB operation. 51 | b, x, r_hatH, ops, post_cond, gpu_func = bicg.ops_biCGSTAB(params) 52 | 53 | # Solve! 54 | start_time = time.time() 55 | x, err, success, iters = bicg.solve_asymm_biCGSTAB( b, r_hatH, x=x, \ 56 | max_iters=params['max_iters'], \ 57 | reporter=rep, \ 58 | err_thresh=params['err_thresh'], \ 59 | gpu_func=gpu_func,\ 60 | **ops) 61 | stop_time = time.time() 62 | 63 | elif solver == 'lgmres': 64 | from solvers import lgmres 65 | # Define operations needed for the lumped bicg operation. 66 | b, x, lgmres_functions, post_cond, gpu_func = lgmres.ops_lgmres(params) 67 | 68 | options = { 69 | 'maxiters': params['max_iters'], 70 | 'inner_m': 15, 71 | 'outer_k': 2, 72 | 'tol': params['err_thresh'] 73 | } 74 | # Solve! 
75 |         start_time = time.time()
76 |         x, err, success, iters = lgmres.solve_asymm_lgmres( b, x=x, \
77 |                                     reporter=rep, \
78 |                                     lgmres_func=lgmres_functions,\
79 |                                     options=options, \
80 |                                     gpu_func=gpu_func)
81 |         stop_time = time.time()
82 | 
83 |     elif solver == 'Jacobi-Davidson':
84 |         from solvers import lgmres, JacDav
85 |         # Check if x is all zeros and, if so, run an initial lgmres solve.
86 |         if not np.any(params['x']):
87 |             b, x, lgmres_functions, post_cond, gpu_func = lgmres.ops_lgmres(
88 |                 params)
89 |             options = {
90 |                 'maxiters': 300,  #params['max_iters'],
91 |                 'inner_m': 15,
92 |                 'outer_k': 2,
93 |                 'tol': 10 * params['err_thresh']
94 |             }
95 |             print_comm0('zero E0 - initial simulation needed')
96 |             x_start, err, success, iters = lgmres.solve_asymm_lgmres( b, x=x, \
97 |                                                 reporter=rep, \
98 |                                                 lgmres_func=lgmres_functions,\
99 |                                                 options=options, \
100 |                                                 gpu_func=gpu_func)
101 |             #b, x0, r_hatH, ops, post_cond, gpu_func = maxwell_ops_lumped.ops_lgmres(params)
102 |             #x_start, err, success, iters = bicg.solve_asymm_biCGSTAB( b, r_hatH, x=x0, \
103 |             #                           max_iters=params['max_iters'], \
104 |             #                           reporter=rep, \
105 |             #                           err_thresh=10*params['err_thresh'], \
106 |             #                           gpu_func=gpu_func,\
107 |             #                           **ops)
108 |             params['x'] = [
109 |                 E.get() for E in x_start
110 |             ]  # .get() pulls the data from the GPU and gathers it to the root.
111 |             if comm.Get_rank() == 0:
112 |                 params['x'] = post_cond(params['x'])  # Apply postconditioner.
113 |             #shp = params['x'][0].shape
114 |             #params['x']=[np.random.rand(shp[0], shp[1], shp[2]) for i in range(3)]
115 |             del x_start
116 |         else:
117 |             print_comm0('non-zero E0 - no initial simulation needed')
118 | 
119 |         # Undo the precomputation that was done on the permittivity (j and m are kept),
120 |         # i.e. undo eps = omega**2 * eps.
121 |         if comm.Get_rank() == 0:
122 |             for k in range(3):
123 |                 params['e'][k] = (params['omega']**(-2) * params['e'][k])
124 | 
125 |         # Define operations needed for the JacDav operation.
126 |         print_comm0('preparing solver')
127 |         t0, gpu_post_cond_eps_norm, post_cond, JacDav_func, gpu_func = \
128 |             JacDav.ops_JacDav(params)
129 | 
130 |         # Set the solver options.
131 |         options_JacDav = {
132 |             'maxiters': 100,
133 |             'n_eig': params['n_eig'],
134 |             'target': params['omega']**2,
135 |             'm_max': 40,
136 |             'm_min': 2,
137 |             'tol': params['err_thresh']
138 |         }
139 |         options_lgmres = {'maxiters': 25, 'inner_m': 15, 'outer_k': 3}
140 | 
141 |         # Solve!
142 |         start_time = time.time()
143 |         print_comm0('start solver')  # t cannot be 0
144 | 
145 |         q, Q, success, err, iters = \
146 |             JacDav.solve_eig_JacDav( t0 = t0, \
147 |                                      reporter = rep, \
148 |                                      JacDav_func = JacDav_func, \
149 |                                      options_lgmres = options_lgmres, \
150 |                                      options_JacDav = options_JacDav,
151 |                                      gpu_func = gpu_func)
152 |         stop_time = time.time()
153 |         print_comm0('time: ' + str(stop_time - start_time))
154 | 
155 |         # Remove the eps_norm.
156 |         for Qi in Q:
157 |             gpu_post_cond_eps_norm(Qi)
158 | 
159 |     if check_success_only:  # Don't write output, just see if we got a success.
160 |         return success
161 | 
162 |     # Gather results onto root's host memory.
163 | if solver == 'Jacobi-Davidson': 164 | Q_result = [[E.get() for E in x] for x in Q] 165 | result = { 'Q': Q_result, \ 166 | 'q': q, 167 | 'err': err, \ 168 | 'success': success, \ 169 | 'iters': iters, \ 170 | 'time': (stop_time-start_time)} 171 | else: 172 | result = { 'E': [E.get() for E in x], \ 173 | 'err': err, \ 174 | 'success': success, \ 175 | 'iters': iters, \ 176 | 'time': (stop_time-start_time)} 177 | print_comm0(result['time']) 178 | 179 | # Write results to output file. 180 | if comm.Get_rank() == 0: 181 | if solver == 'Jacobi-Davidson': 182 | for i in range(len(result['Q'])): 183 | result['Q'][i] = post_cond( 184 | result['Q'][i]) # Apply postconditioner 185 | else: 186 | result['E'] = post_cond(result['E']) # Apply postconditioner. 187 | write_results(name, result) 188 | 189 | return success 190 | 191 | 192 | def get_parameters(name): 193 | """ Reads the simulation parameters from the input hdf5 file. """ 194 | 195 | if comm.rank == 0: 196 | f = h5py.File(name + '.grid', 'r') 197 | files_to_delete = [name + '.grid'] 198 | 199 | omega = np.complex128(f['omega_r'][0] + 1j * f['omega_i'][0]) 200 | shape = tuple([int(s) for s in f['shape'][:]]) 201 | n_eig = int(f['n_eig'][0]) 202 | 203 | # bloch boundary conditions 204 | bloch_phase = f['bloch_phase'][...] 205 | 206 | # PEC or PMC boundary conditions 207 | pemc = f['pemc'][...].astype('int32') 208 | 209 | # get solver 210 | EM_solvers = ['CG', 'biCGSTAB', 'lgmres', 'Jacobi-Davidson'] 211 | solver = EM_solvers[f['solver'][...]] 212 | 213 | # Function used to read in a 1D complex vector fields. 214 | get_1D_fields = lambda a: [(f[a+'_'+u+'r'][:] + 1j * f[a+'_'+u+'i'][:]).\ 215 | astype(np.complex128) for u in 'xyz'] 216 | 217 | # Read in s and t vectors. 218 | s = get_1D_fields('sp') 219 | t = get_1D_fields('sd') 220 | 221 | # Read in max_iters and err_thresh. 222 | max_iters = int(f['max_iters'][0]) 223 | err_thresh = float(f['err_thresh'][0]) 224 | 225 | # Function used to read in 3D complex vector fields. 226 | def get_3D_fields(a): 227 | field = [] 228 | # Check if field data all in one HDF5 file. 229 | if (a + '_xr') in f: 230 | for k in range(3): 231 | key = a + '_' + 'xyz' [k] 232 | field.append( 233 | (f[key + 'r'][:] + 1j * f[key + 'i'][:]).astype( 234 | np.complex128)) 235 | return field 236 | 237 | for k in range(3): 238 | key = name + '.' + a + '_' + 'xyz' [k] 239 | field.append((h5py.File(key + 'r')['data'][:] + \ 240 | 1j * h5py.File(key + 'i')['data'][:]).astype(np.complex128)) 241 | files_to_delete.append(key + 'r') 242 | files_to_delete.append(key + 'i') 243 | return field 244 | 245 | e = get_3D_fields('e') # Permittivity (eps). 246 | j = get_3D_fields('J') # Current source. 247 | m = get_3D_fields('m') # Permeability (mu). 248 | x = get_3D_fields('A') # Initial fields (E0). 249 | 250 | f.close() # Close file. 251 | 252 | # Delete input files. 253 | for filename in files_to_delete: 254 | os.remove(filename) 255 | 256 | # Do some simple pre-computation. 
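        # (Assuming the usual FDFD convention, this scaling casts the problem
        #  as the E-field wave equation
        #      (curl mu^-1 curl - omega^2 eps) E = -i omega J,
        #  i.e. mu is inverted, eps is multiplied by omega^2, and the
        #  -i*omega source factor is folded into J.)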
257 | for k in range(3): 258 | m[k] = m[k]**-1 259 | e[k] = omega**2 * e[k] 260 | j[k] = -1j * omega * j[k] 261 | 262 | params = {'omega': omega, 'shape': shape, 'n_eig': n_eig,\ 263 | 'max_iters': max_iters, 'err_thresh': err_thresh, \ 264 | 's': s, 't': t, 'bloch_phase': bloch_phase, \ 265 | 'pemc': pemc, 'solver': solver} 266 | else: 267 | params = None 268 | 269 | params = comm.bcast(params) 270 | 271 | if comm.rank == 0: 272 | params['e'] = e 273 | params['m'] = m 274 | params['j'] = j 275 | params['x'] = x 276 | else: 277 | for field_name in 'emjx': 278 | params[field_name] = [None] * 3 279 | 280 | return params 281 | 282 | 283 | def write_results(name, result): 284 | """ Write out the results to an hdf5 file. """ 285 | 286 | my_write = lambda fieldname, data: h5py.File(name + '.' + fieldname, 'w').\ 287 | create_dataset('data', data=data) 288 | 289 | if 'q' in list(result.keys()): 290 | my_write('iter_info', np.array([result['iters']]).astype(np.float32)) 291 | my_write('time_info', np.array([result['time']]).astype(np.float32)) 292 | my_write('qr', np.real(np.array([result['q']])).astype(np.float32)) 293 | my_write('qi', np.imag(np.array([result['q']])).astype(np.float32)) 294 | 295 | # Write out the datasets. 296 | for i in range(len(result['q'])): 297 | for k in range(3): 298 | my_write('Q' + str(i) + '_' + 'xyz'[k] + 'r', \ 299 | np.real(result['Q'][i][k]).astype(np.float32)) 300 | my_write('Q' + str(i)+ '_' + 'xyz'[k] + 'i', \ 301 | np.imag(result['Q'][i][k]).astype(np.float32)) 302 | my_write = lambda fieldname, data: h5py.File(name + '.' + fieldname, 'w').\ 303 | create_dataset('data', data=data) 304 | else: 305 | my_write('iter_info', np.array([result['iters']]).astype(np.float32)) 306 | my_write('time_info', np.array([result['time']]).astype(np.float32)) 307 | 308 | # Write out the datasets. 309 | for k in range(3): 310 | my_write('E_' + 'xyz'[k] + 'r', \ 311 | np.real(result['E'][k]).astype(np.float32)) 312 | my_write('E_' + 'xyz'[k] + 'i', \ 313 | np.imag(result['E'][k]).astype(np.float32)) 314 | 315 | 316 | def print_comm0(txt: str): 317 | if comm.Get_rank() == 0: 318 | print(txt) 319 | 320 | 321 | if __name__ == '__main__': # Allows calls from command line. 322 | if comm.rank == 0: 323 | print('start in main') 324 | simulate(sys.argv[1]) # Specify name of the job. 325 | -------------------------------------------------------------------------------- /maxwell-solver/gce/kernel.py: -------------------------------------------------------------------------------- 1 | """ Defines the Kernel class for GCE. """ 2 | from pycuda import compiler 3 | from pycuda import driver as drv 4 | from jinja2 import Environment, PackageLoader 5 | from gce.space import get_space_info 6 | from gce.out import batch_reduce 7 | import numpy as np 8 | from mpi4py.MPI import COMM_WORLD as comm 9 | 10 | # Load the jinja environment when the module is loaded. 11 | _template_file = 'kernel.cu' 12 | _jinja_env = Environment(loader=PackageLoader(__name__, '.')) 13 | 14 | 15 | class Kernel: 16 | """ Create an executable kernel for GCE. 17 | 18 | A Kernel executable allows for the modification of Grid objects and the 19 | computation of Outs. Kernels accept Grid, Const, Out, and certain numpy 20 | scalar objects as their input. 21 | 22 | Kernels work by traversing the 3D space in the x-direction and executing 23 | user-specified cuda code at every grid point. For more information on the 24 | conventions and available tools for defining Kernels, please see the 25 | KERNEL_DOC file. 
26 | 
27 |     Additionally, Kernels will self-optimize runtime parameters. Such
28 |     parameters currently include only the block size.
29 | 
30 |     Methods:
31 |     __init__ -- Define the executable kernel.
32 |     __call__ -- Execute the kernel.
33 | 
34 |     Example usage:
35 |         fun = Kernel(code, ('x', 'grid', np.float32), ('y', 'grid', np.float32))
36 |         fun(x, y)
37 |     """
38 | 
39 |     def __init__(self, code, *vars, **kwargs):
40 |         """ Prepare a cuda function that will execute on the GCE space.
41 | 
42 |         Input variables:
43 |         code -- The looped cuda code to be executed.
44 |         vars -- (name, gce_type, numpy_type) of the input arguments.
45 | 
46 |         Keyword variables:
47 |         pre_loop -- Cuda code that is executed before the loop code.
48 |         shape_filter -- Can be either 'all', 'skinny', or 'square'.
49 |         padding -- (yn, yp, zn, zp), describes the number of "extra" threads
50 |             to be run on the border of each thread block.
51 |         smem_per_thread -- Number of bytes of shared memory needed by a thread.
52 |         """
53 | 
54 |         # Make sure there are no extraneous keyword arguments.
55 |         if any([key not in \
56 |                 ('pre_loop', 'shape_filter', 'padding', 'smem_per_thread')
57 |                 for key in kwargs.keys()]):
58 |             raise TypeError('Invalid key used.')
59 | 
60 |         # Process keyword arguments.
61 |         pre_code = kwargs.get('pre_loop', '')
62 |         shape_filter = kwargs.get('shape_filter', 'skinny')
63 |         padding = kwargs.get('padding', (0, 0, 0, 0))
64 |         smem_per_thread = kwargs.get('smem_per_thread', 0)
65 | 
66 |         # Dictionary for conversion from numpy to cuda types.
67 |         cuda_types = {np.float32: 'float', np.float64: 'double', \
68 |                       np.int32: 'int', \
69 |                       np.complex64: 'pycuda::complex<float>', \
70 |                       np.complex128: 'pycuda::complex<double>'}
71 |         # Dictionary for conversion from numpy to alternate type for Consts.
72 |         alt_types = {np.float32: 'float', np.float64: 'double', \
73 |                      np.complex64: 'float2', np.complex128: 'double2'}
74 | 
75 |         # Process vars.
76 |         params = [{'name': v[0], \
77 |                    'gce_type': v[1], \
78 |                    'dtype': v[2], \
79 |                    'cuda_type': cuda_types[v[2]]} for v in vars]
80 | 
81 |         # Get the template and render it using jinja2.
82 |         shape = get_space_info()['shape'] # Shape of the space.
83 |         template = _jinja_env.get_template(_template_file)
84 |         cuda_source = template.render( params=params, \
85 |                                        padding=padding, \
86 |                                        dims=get_space_info()['shape'], \
87 |                                        x_range=get_space_info()['x_range'], \
88 |                                        preloop_code=pre_code, \
89 |                                        loop_code=code, \
90 |                                        flat_tag='_f')
91 | 
92 |         # Compile the code into a callable cuda function.
93 |         mod = compiler.SourceModule(cuda_source)
94 |         # mod = compiler.SourceModule(cuda_source, options=['-Xptxas', '-dlcm=cg']) # Global skips L1 cache.
95 |         self.fun = mod.get_function('_gce_kernel')
96 | 
97 |         # Prefer 48KB of L1 cache when possible.
98 |         self.fun.set_cache_config(drv.func_cache.PREFER_L1)
99 | 
100 |         # Get address of global variable in module.
101 |         # Note: contains a work-around for problems with complex types.
102 |         my_get_global = lambda name: mod.get_global('_' + name + '_temp')
103 | 
104 |         # Useful information about the kernel.
105 |         self._kernel_info = {'max_threads': self.fun.max_threads_per_block, \
106 |                              'const_bytes': self.fun.const_size_bytes, \
107 |                              'local_bytes': self.fun.local_size_bytes, \
108 |                              'num_regs': self.fun.num_regs}
109 | 
110 |         # Get some valid execution configurations.
111 |         self.exec_configs = self._get_exec_configs( \
112 |                                 self.fun.max_threads_per_block, \
113 |                                 padding, smem_per_thread, shape_filter)
114 | 
115 |         # Prepare the function by telling pycuda the types of the inputs.
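        # (The two leading int32 arguments in the prepared signature are the
        #  x_start and x_end of the loop range; execute_range() below supplies
        #  them on every launch.)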
116 |         arg_types = []
117 |         for p in params:
118 |             if p['gce_type'] == 'number':
119 |                 arg_types.append(p['dtype'])
120 |             # elif p['gce_type'] == 'const':
121 |             #     arg_types.append(p['dtype'])
122 |             #     # pass # Consts don't actually get passed in.
123 |             else:
124 |                 arg_types.append(np.intp)
125 |         self.fun.prepare([np.int32, np.int32] + arg_types)
126 | 
127 |         # Define the function which we will use to execute the kernel.
128 |         # TODO: Make a shortcut version with lower overhead.
129 |         # Used for asynchronous execution and timing.
130 |         stream = drv.Stream()
131 |         start, start2, pad_done, sync_done, comp_done, all_done = \
132 |             [drv.Event() for k in range(6)]
133 | 
134 |         # Kernel execution over a range of x-values.
135 |         def execute_range(x_start, x_end, gpu_params, cfg, stream):
136 |             """ Defines asynchronous kernel execution for a range of x. """
137 |             self.fun.prepared_async_call( \
138 |                 cfg['grid_shape'][::-1], \
139 |                 cfg['block_shape'][::-1] + (1,), \
140 |                 stream, \
141 |                 *([np.int32(x_start), np.int32(x_end)] + gpu_params), \
142 |                 shared_size=cfg['smem_size'])
143 | 
144 |         x_start, x_end = get_space_info()['x_range'] # This node's range.
145 | 
146 |         def execute(cfg, *args, **kwargs):
147 | 
148 |             # Parse keyword arguments.
149 |             post_sync_grids = kwargs.get('post_sync', None)
150 | 
151 |             # Parse the inputs.
152 |             gpu_params = []
153 |             for k in range(len(params)):
154 |                 if params[k]['gce_type'] == 'number':
155 |                     gpu_params.append(params[k]['dtype'](args[k]))
156 |                 elif params[k]['gce_type'] == 'const': # Load Const.
157 |                     gpu_params.append(args[k].data.ptr)
158 |                     # Const no longer actually "const" in cuda code.
159 | 
160 | 
161 |                     # d_ptr, size_in_bytes = my_get_global(params[k]['name'])
162 |                     # drv.memcpy_dtod(d_ptr, args[k].data.gpudata, size_in_bytes)
163 |                 elif params[k]['gce_type'] == 'grid':
164 |                     if args[k]._xlap == 0:
165 |                         gpu_params.append(args[k].data.ptr)
166 |                     else:
167 |                         gpu_params.append(args[k].data.ptr + \
168 |                                           args[k]._xlap_offset)
169 |                 elif params[k]['gce_type'] == 'out':
170 |                     args[k].data.fill(args[k].dtype(0)) # Initialize the Out.
171 |                     gpu_params.append(args[k].data.ptr)
172 |                 else:
173 |                     raise TypeError('Invalid input type.')
174 | 
175 |             # See if we need to synchronize grids after kernel execution.
176 |             if post_sync_grids is None:
177 |                 sync_pad = 0
178 |             else:
179 |                 sync_pad = max([g._xlap for g in post_sync_grids])
180 | 
181 |             start2.record(stream)
182 |             comm.Barrier()
183 |             start.record(stream)
184 | 
185 |             # Execute kernel in padded regions first.
186 |             execute_range(x_start, x_start + sync_pad, gpu_params, cfg, stream)
187 |             execute_range(x_end - sync_pad, x_end, gpu_params, cfg, stream)
188 |             pad_done.record(stream) # Just for timing purposes.
189 |             stream.synchronize() # Wait for execution to finish.
190 | 
191 |             # Begin kernel execution in remaining "core" region.
192 |             execute_range(x_start + sync_pad, x_end - sync_pad, gpu_params,
193 |                           cfg, stream)
194 |             comp_done.record(stream) # Timing only.
195 | 
196 |             # While core kernel is executing, perform synchronization.
197 |             if post_sync_grids is not None: # Synchronization needed.
198 |                 for grid in post_sync_grids:
199 |                     grid.synchronize_start() # Start synchronization.
200 | 
201 |                 # Keep on checking until everything is done.
202 |                 while not (all([grid.synchronize_isdone() \
203 |                                 for grid in post_sync_grids]) and \
204 |                            stream.is_done()):
205 |                     pass
206 | 
207 |             else: # Nothing to synchronize.
208 |                 stream.synchronize() # Just wait for execution to finish.
209 | 
210 |             sync_done.record() # Timing.
211 | 
212 |             # Obtain the result for all Outs.
213 |             batch_reduce(*[args[k] for k in range(len(params)) \
214 |                            if params[k]['gce_type'] == 'out'])
215 |             all_done.record() # Timing.
216 |             all_done.synchronize()
217 | 
218 |             return comp_done.time_since(
219 |                 start) # Return time needed to execute the function.
220 | 
221 |         self.execute = execute # Save execution function in Kernel instance.
222 |         self.min_exec_time = float('inf') # Stores the fastest execution time.
223 | 
224 |     def __call__(self, *args, **kwargs):
225 |         """ Execute the kernel.
226 | 
227 |         Each valid execution configuration will be tried once, and then the
228 |         fastest configuration will be used for all remaining calls.
229 |         """
230 |         if self.exec_configs: # As long as list is not empty, choose from list.
231 |             cfg = self.exec_configs.pop() # Choose execution configuration.
232 | 
233 |             # Execute.
234 |             exec_time = self.execute(cfg, *args, **kwargs)
235 | 
236 |             # Check if this was the fastest execution to-date.
237 |             if exec_time < self.min_exec_time: # Found a new fastest config.
238 |                 self.min_exec_time = exec_time
239 |                 self.fastest_cfg = cfg
240 | 
241 |         else: # If config list empty, go with the fastest configuration found.
242 |             cfg = self.fastest_cfg
243 |             exec_time = self.execute(cfg, *args, **kwargs)
244 | 
245 |         # Return results.
246 |         return exec_time, cfg
247 | 
248 |     def _get_exec_configs(self, threads_max, padding, smem_per_thread, \
249 |                           shape_filter):
250 |         """ Find all valid execution configurations. """
251 | 
252 |         # Padding of the kernel.
253 |         y_pad = sum(padding[0:2])
254 |         z_pad = sum(padding[2:4])
255 | 
256 |         # Shared memory requirements.
257 |         smem_size = lambda b_shape: smem_per_thread * \
258 |                                     (b_shape[0] * b_shape[1])
259 | 
260 |         # The kind of shapes that we are interested in.
261 |         if shape_filter == 'skinny': # Only z-dominant shapes.
262 |             my_filter = lambda b_shape: (b_shape[0] < b_shape[1]) and \
263 |                         (b_shape[1] > 8) and ((b_shape[1] % 16) == 0)
264 |         elif shape_filter == 'square': # Only square-ish shapes.
265 |             my_filter = lambda b_shape: (b_shape[0] < 2 * b_shape[1]) and \
266 |                                         (b_shape[1] < 2 * b_shape[0]) and \
267 |                                         (b_shape[0] > 8) and \
268 |                                         (b_shape[1] > 8)
269 |         elif shape_filter == 'all': # All shapes okay.
270 |             my_filter = lambda b_shape: b_shape[1] > 1 # Must be greater than 1.
271 |         else:
272 |             raise TypeError('Unrecognized shape filter.')
273 | 
274 |         # Function defining valid block shapes.
275 |         smem_max = get_space_info()['max_shared_mem']
276 |         is_valid_shape = lambda b_shape: (smem_size(b_shape) < smem_max) and \
277 |                                          my_filter(b_shape) and \
278 |                                          (b_shape[0] * b_shape[1]) <= \
279 |                                          threads_max
280 | 
281 |         # Create a list of all valid block shapes.
282 |         valid_block_shapes = []
283 |         z_max = get_space_info()['max_block_z']
284 |         y_max = get_space_info()['max_block_y']
285 |         for j in range(y_pad + 1, y_max + 1):
286 |             for k in range(z_pad + 1, z_max + 1):
287 |                 if is_valid_shape((j, k)):
288 |                     valid_block_shapes.append((j,
289 |                                                k)) # Block shape is (yy,zz).
290 | 
291 |         # A hack for profiling
292 |         # valid_block_shapes = ((31,16),)
293 |         # valid_block_shapes = ((17,22),)
294 | 
295 |         if not valid_block_shapes: # Make sure the list is not empty.
296 |             raise TypeError('No valid shapes found.')
297 | 
298 |         # Create a list of all possible execution configurations.
299 |         # Note that the convention for both block_shape and grid_shape is
300 |         # (yy,zz). Among other things, this leads to the (slightly)
301 |         # tricky computation of grid_shape.
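        # (Worked example: with sp_shape[1] = 100, block shape vbs = (16, 32),
        #  and y_pad = 2, each block covers 16 - 2 = 14 interior rows, so
        #  grid_shape[0] = (100 - 1) // 14 + 1 = 8 blocks tile the y-axis.)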
302 |         sp_shape = get_space_info()['shape'] # Shape of the space.
303 |         return [{ 'block_shape': vbs, \
304 |                   'grid_shape': (int((sp_shape[1]-1)/(vbs[0]-y_pad)) + 1, \
305 |                                  int((sp_shape[2]-1)/(vbs[1]-z_pad)) + 1), \
306 |                   'smem_size': smem_size(vbs)}
307 |                 for vbs in valid_block_shapes]
--------------------------------------------------------------------------------
/maxwell-solver/maxwell_ops_lumped.py:
--------------------------------------------------------------------------------
1 | """ Implements the operations needed to solve Maxwell's equations in 3D. """
2 | 
3 | import numpy as np
4 | import copy
5 | from jinja2 import Environment, PackageLoader, Template
6 | from gce.space import initialize_space, get_space_info
7 | from gce.grid import Grid
8 | from gce.const import Const
9 | from gce.out import Out
10 | from gce.kernel import Kernel
11 | from typing import List
12 | from mpi4py.MPI import COMM_WORLD as comm
13 | 
14 | # Execute when module is loaded.
15 | # Load the jinja environment.
16 | jinja_env = Environment(loader=PackageLoader(__name__, 'kernels'))
17 | 
18 | 
19 | def conditioners(params, dtype):
20 |     """ Form the functions for both the preconditioner and postconditioner. """
21 | 
22 |     #
23 |     # # Code for the post step function.
24 |     # code = """
25 |     # if (_in_global) {
26 |     #     Ex(0,0,0) *= tx1(_X) * ty0(_Y) * tz0(_Z);
27 |     #     Ey(0,0,0) *= tx0(_X) * ty1(_Y) * tz0(_Z);
28 |     #     Ez(0,0,0) *= tx0(_X) * ty0(_Y) * tz1(_Z);
29 |     # } """
30 |     def reshaper(f):
31 |         for k in range(3):
32 |             new_shape = [1, 1, 1]
33 |             new_shape[k] = f[k].size
34 |             f[k] = f[k].reshape(new_shape)
35 |         return f
36 | 
37 |     # Consts that are used.
38 |     sqrt_sc_pml_0 = reshaper([dtype(np.sqrt(s)**1) for s in params['s']])
39 |     sqrt_sc_pml_1 = reshaper([dtype(np.sqrt(t)**1) for t in params['t']])
40 |     inv_sqrt_sc_pml_0 = reshaper([dtype(np.sqrt(s)**-1) for s in params['s']])
41 |     inv_sqrt_sc_pml_1 = reshaper([dtype(np.sqrt(t)**-1) for t in params['t']])
42 | 
43 |     # Define the actual functions.
44 | 
45 |     def apply_cond(x, t0, t1):
46 |         x[0] *= t1[0] * t0[1] * t0[2]
47 |         x[1] *= t0[0] * t1[1] * t0[2]
48 |         x[2] *= t0[0] * t0[1] * t1[2]
49 |         return x
50 | 
51 |     def pre_step(x):
52 |         return apply_cond(x, sqrt_sc_pml_0, sqrt_sc_pml_1)
53 | 
54 |     def post_step(x):
55 |         return apply_cond(x, inv_sqrt_sc_pml_0, inv_sqrt_sc_pml_1)
56 | 
57 |     return pre_step, post_step
58 | 
59 | 
60 | def _get_cuda_type(dtype):
61 |     """ Convert numpy type into cuda type. """
62 |     if dtype is np.complex64:
63 |         return 'pycuda::complex<float>'
64 |     elif dtype is np.complex128:
65 |         return 'pycuda::complex<double>'
66 |     else:
67 |         raise TypeError('Invalid dtype.')
68 | 
69 | 
70 | # GPU operations
71 | #---------
72 | def make_gpu_copy(dtype):
73 |     """ Returns a function that does B=A """
74 |     # Kernel code for the copy operation.
75 |     code = Template("""
76 |         if (_in_global) {
77 |             Bx(0,0,0) = Ax(0,0,0);
78 |             By(0,0,0) = Ay(0,0,0);
79 |             Bz(0,0,0) = Az(0,0,0);
80 |         } """).render(type=_get_cuda_type(dtype))
81 | 
82 |     # Compile the code using gce.Kernel
83 |     grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
84 |     copy_fun = Kernel(code, \
85 |                       *[(name, 'grid', dtype) for name in grid_names], \
86 |                       shape_filter='skinny')
87 | 
88 |     # Define the actual function.
89 |     def gpu_copy(A, B):
90 |         copy_fun( \
91 |             *( A + B), \
92 |             post_sync=B) # B must be post-synced before later kernels read it.
93 | 
94 |     return gpu_copy
95 | 
96 | 
97 | def make_gpu_norm(dtype):
98 |     """ Returns a function c=vec_norm(A) that does c=sqrt(A'A) """
99 |     # GPU Code in gce.kernel.
100 |     code = Template("""
101 |         if (_in_global) {
102 |             norm_a += conj(Ax(0,0,0))*Ax(0,0,0);
103 |             norm_a += conj(Ay(0,0,0))*Ay(0,0,0);
104 |             norm_a += conj(Az(0,0,0))*Az(0,0,0);
105 |         } """).render(type=_get_cuda_type(dtype))
106 | 
107 |     # Compile the code using gce.Kernel
108 |     grid_names = [A + i for A in ['A'] for i in ['x', 'y', 'z']]
109 |     prod_fun = Kernel(code, \
110 |                       ('norm_a', 'out', dtype), \
111 |                       *[(name, 'grid', dtype) for name in grid_names], \
112 |                       shape_filter='skinny')
113 |     norm_a = Out(dtype)
114 | 
115 |     # Define the actual function.
116 |     def gpu_norm(A):
117 |         prod_fun( norm_a,\
118 |                   *( A )) # No post_sync needed: A is only read.
119 |         return np.sqrt(norm_a.get())
120 | 
121 |     return gpu_norm
122 | 
123 | 
124 | def make_gpu_scale(dtype):
125 |     """ Returns a function scale(A, a) that does A=aA """
126 |     # Kernel code for the scaling operation.
127 |     code = Template("""
128 |         if (_in_global) {
129 |             Ax(0,0,0) = a*Ax(0,0,0);
130 |             Ay(0,0,0) = a*Ay(0,0,0);
131 |             Az(0,0,0) = a*Az(0,0,0);
132 |         } """).render(type=_get_cuda_type(dtype))
133 | 
134 |     # Compile the code using gce.Kernel
135 |     grid_names = [A + i for A in ['A'] for i in ['x', 'y', 'z']]
136 |     Sum_fun = Kernel(code, \
137 |                      ('a', 'number', dtype), \
138 |                      *[(name, 'grid', dtype) for name in grid_names], \
139 |                      shape_filter='skinny')
140 | 
141 |     # Define the actual function.
142 |     def gpu_scale(A, a):
143 |         Sum_fun(dtype(a), \
144 |                 *( A ), \
145 |                 post_sync=A) # A must be post-synced before later kernels read it.
146 | 
147 |     return gpu_scale
148 | 
149 | 
150 | def make_gpu_conj(dtype):
151 |     """ Returns a function that does B=conj(A) """
152 |     # Kernel code for the conjugation operation.
153 |     code = Template("""
154 |         if (_in_global) {
155 |             Bx(0,0,0) = conj(Ax(0,0,0));
156 |             By(0,0,0) = conj(Ay(0,0,0));
157 |             Bz(0,0,0) = conj(Az(0,0,0));
158 |         } """).render(type=_get_cuda_type(dtype))
159 | 
160 |     # Compile the code using gce.Kernel
161 |     grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
162 |     conj_fun = Kernel(code, \
163 |                       *[(name, 'grid', dtype) for name in grid_names], \
164 |                       shape_filter='skinny')
165 | 
166 |     # Define the actual function.
167 |     def gpu_conj(A, B):
168 |         conj_fun( \
169 |             *( A + B), \
170 |             post_sync=B) # B must be post-synced before later kernels read it.
171 | 
172 |     return gpu_conj
173 | 
174 | 
175 | def make_gpu_dot(dtype):
176 |     """ Returns a function c=vec_dot(A, B) that does c=A'B """
177 |     # GPU Code in gce.kernel.
178 |     code = Template("""
179 |         if (_in_global) {
180 |             dot_ab += conj(Ax(0,0,0))*Bx(0,0,0);
181 |             dot_ab += conj(Ay(0,0,0))*By(0,0,0);
182 |             dot_ab += conj(Az(0,0,0))*Bz(0,0,0);
183 |         } """).render(type=_get_cuda_type(dtype))
184 | 
185 |     # Compile the code using gce.Kernel
186 |     grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
187 |     prod_fun = Kernel(code, \
188 |                       ('dot_ab', 'out', dtype), \
189 |                       *[(name, 'grid', dtype) for name in grid_names], \
190 |                       shape_filter='skinny')
191 | 
192 |     dot_ab = Out(dtype)
193 | 
194 |     # Define the actual function.
195 |     def gpu_dot(A, B):
196 |         prod_fun( dot_ab,\
197 |                   *( A + B))
198 |         return dot_ab.get()
199 | 
200 |     return gpu_dot
201 | 
202 | 
203 | def make_gpu_addvec(dtype):
204 |     """ Returns a function vec_addvec(A, b, B) that does A=A+bB """
205 |     # GPU Code in gce.Kernel
206 |     code = Template("""
207 |         if (_in_global) {
208 |             Ax(0,0,0) = Ax(0,0,0) + b*Bx(0,0,0);
209 |             Ay(0,0,0) = Ay(0,0,0) + b*By(0,0,0);
210 |             Az(0,0,0) = Az(0,0,0) + b*Bz(0,0,0);
211 |         } """).render(type=_get_cuda_type(dtype))
212 | 
213 |     # Compile the code using gce.Kernel
214 |     grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
215 |     Sum_fun = Kernel(code, \
216 |                      ('b', 'number', dtype), \
217 |                      *[(name, 'grid', dtype) for name in grid_names], \
218 |                      shape_filter='skinny')
219 | 
220 |     # Define the actual function.
221 |     def gpu_addvec(A, b, B):
222 |         Sum_fun( dtype(b), \
223 |                  *( A + B ), \
224 |                  post_sync=A) # A must be post-synced before later kernels read it.
225 | 
226 |     return gpu_addvec
227 | 
228 | 
229 | def make_gpu_scaled_copy(dtype):
230 |     """ Returns a function vec_scaled_copy(A, a, B) that does B=aA """
231 |     # GPU code for the Kernel
232 |     code = Template("""
233 |         if (_in_global) {
234 |             Bx(0,0,0) = a*Ax(0,0,0);
235 |             By(0,0,0) = a*Ay(0,0,0);
236 |             Bz(0,0,0) = a*Az(0,0,0);
237 |         } """).render(type=_get_cuda_type(dtype))
238 | 
239 |     # Compile the code using gce.Kernel
240 |     grid_names = [A + i for A in ['A', 'B'] for i in ['x', 'y', 'z']]
241 |     Sum_fun = Kernel( code, \
242 |                       ('a', 'number', dtype), \
243 |                       *[(name, 'grid', dtype) for name in grid_names], \
244 |                       shape_filter='skinny')
245 | 
246 |     # Define the actual function.
247 |     def gpu_scaled_copy(A, a, B):
248 |         Sum_fun(dtype(a), \
249 |                 *( A + B ), \
250 |                 post_sync=B)
251 | 
252 |     return gpu_scaled_copy
253 | 
254 | 
255 | def make_gpu_sum(dtype):
256 |     """ Returns a function that does aA+bB=C """
257 |     # Kernel code for the weighted-sum operation.
258 |     code = Template("""
259 |         if (_in_global) {
260 |             Cx(0,0,0) = a*Ax(0,0,0) + b*Bx(0,0,0);
261 |             Cy(0,0,0) = a*Ay(0,0,0) + b*By(0,0,0);
262 |             Cz(0,0,0) = a*Az(0,0,0) + b*Bz(0,0,0);
263 |         } """).render(type=_get_cuda_type(dtype))
264 | 
265 |     # Compile the code using gce.Kernel
266 |     grid_names = [A + i for A in ['A', 'B', 'C'] for i in ['x', 'y', 'z']]
267 |     Sum_fun = Kernel(code, \
268 |                      ('a', 'number', dtype), \
269 |                      ('b', 'number', dtype), \
270 |                      *[(name, 'grid', dtype) for name in grid_names], \
271 |                      shape_filter='skinny')
272 | 
273 |     # Define the actual function.
274 |     def gpu_sum(a, b, A, B, C):
275 |         Sum_fun(dtype(a), dtype(b), \
276 |                 *( A + B + C), \
277 |                 post_sync=C) # C must be post-synced before later kernels read it.
278 | 
279 |     return gpu_sum
280 | 
281 | 
282 | def make_gpu_weighted_sum(dtype):
283 |     """ Return weighted sum function """
284 |     # Returns function vec_weighted_sum(V,y,U) that will do:
285 |     #     U = y1*V1 + y2*V2 + ... + yn*Vn
286 |     # Note: U must not be one of the vectors in V!
287 |     gpu_scaled_copy = make_gpu_scaled_copy(dtype)
288 |     gpu_addvec = make_gpu_addvec(dtype)
289 | 
290 |     def gpu_weighted_sum(L: List[Grid], y: np.ndarray, A):
291 |         gpu_scaled_copy(L[0], y[0], A)
292 |         for i in range(1, len(y)):
293 |             gpu_addvec(A, y[i], L[i])
294 | 
295 |     return gpu_weighted_sum
296 | 
297 | 
298 | def make_gpu_fdfd_residual(params, dtype):
299 |     """ Return function get_residual(X, B, R) that will do R = B - AX """
300 | 
301 |     ### WARNING: this code has not yet been adapted to the changes needed
302 |     ### to make biCGSTAB work.
303 | 
304 |     num_shared_banks = 6 # TODO Dries: does this need to be increased?
305 | 
306 |     # Render the pre-loop and in-loop code.
307 |     cuda_type = _get_cuda_type(dtype)
308 |     code_allpre = jinja_env.get_template('fdfd_residual_pec_pmc.cu').\
309 |                     render(dims=params['shape'], \
310 |                            type=cuda_type, \
311 |                            mu_equals_1=False, \
312 |                            full_operator=True)
313 | 
314 |     # Grid input parameters.
315 |     grid_params = [(A + i, 'grid', dtype) for A in ['X', 'B', 'R', 'e', 'm'] \
316 |                    for i in ['x', 'y', 'z']]
317 | 
318 |     # Const input parameters.
319 |     const_names = ('sx0', 'sy0', 'sz0', 'sx1', 'sy1', 'sz1') + \
320 |                   ('sqrt_sx0', 'sqrt_sy0', 'sqrt_sz0', \
321 |                    'sqrt_sx1', 'sqrt_sy1', 'sqrt_sz1') + \
322 |                   ('bloch_x', 'bloch_y', 'bloch_z')
323 |     const_sizes = params['shape'] * 4 + tuple([3]) * 3
324 |     const_params = [(const_names[k], 'const', dtype, const_sizes[k]) \
325 |                     for k in range(len(const_sizes))]
326 |     const_params.append(('pemc', 'const', params['pemc'].dtype.type, 6))
327 | 
328 |     # Compile. (note shape_filter = 'square')
329 |     residual_fun = Kernel('', \
330 |                           *(grid_params + const_params), \
331 |                           pre_loop=code_allpre, \
332 |                           padding=(1,1,1,1), \
333 |                           smem_per_thread=num_shared_banks*16, \
334 |                           shape_filter='square')
335 | 
336 |     # Temporary variables.
337 | 
338 |     # Grid variables.
339 |     # !!!!! eps is scattered over the GPUs when e is initialised here
340 |     e = [Grid(dtype(f), x_overlap=1) for f in params['e']]
341 |     m = [Grid(dtype(f), x_overlap=1) for f in params['m']] # Optional.
342 | 
343 |     # Constant variables.
344 |     sc_pml_0 = [Const(dtype(s**-1)) for s in params['s']]
345 |     sc_pml_1 = [Const(dtype(t**-1)) for t in params['t']]
346 |     sqrt_sc_pml_0 = [Const(dtype(np.sqrt(s**-1))) for s in params['s']]
347 |     sqrt_sc_pml_1 = [Const(dtype(np.sqrt(t**-1))) for t in params['t']]
348 |     bloch_x = [Const(dtype(params['bloch_phase'][0]))]
349 |     bloch_y = [Const(dtype(params['bloch_phase'][1]))]
350 |     bloch_z = [Const(dtype(params['bloch_phase'][2]))]
351 |     pemc = [Const(params['pemc'])]
352 | 
353 |     # Define the function
354 |     def gpu_fdfd_residual(X, B, R):
355 |         # Execute cuda code.
356 |         residual_fun( \
357 |             *(X + B + R + e + m + \
358 |               sc_pml_0 + sc_pml_1 + sqrt_sc_pml_0 + sqrt_sc_pml_1 + \
359 |               bloch_x + bloch_y + bloch_z + pemc), \
360 |             post_sync = R)
361 | 
362 |     return gpu_fdfd_residual
363 | 
364 | 
365 | def make_gpu_fdfd_matrix_multiplication(params, dtype):
366 |     """ Return function vec_matrix_multiplication(X, B) that will do AX=B """
367 | 
368 |     num_shared_banks = 6
369 | 
370 |     # Render the pre-loop and in-loop code.
371 |     cuda_type = _get_cuda_type(dtype)
372 |     code_allpre = jinja_env.get_template('fdfd_matrix_multiplication_pec_pmc.cu').\
373 |                     render(dims=params['shape'], \
374 |                            type=cuda_type, \
375 |                            mu_equals_1=False, \
376 |                            full_operator=True)
377 | 
378 |     # Grid input parameters.
379 |     grid_params = [(A + i, 'grid', dtype) for A in ['X', 'B', 'e', 'm'] \
380 |                    for i in ['x', 'y', 'z']]
381 | 
382 |     # Const input parameters.
383 |     const_names = ('sx0', 'sy0', 'sz0', 'sx1', 'sy1', 'sz1') + \
384 |                   ('sqrt_sx0', 'sqrt_sy0', 'sqrt_sz0', \
385 |                    'sqrt_sx1', 'sqrt_sy1', 'sqrt_sz1') + \
386 |                   ('bloch_x', 'bloch_y', 'bloch_z')
387 |     const_sizes = params['shape'] * 4 + tuple([3]) * 3
388 |     const_params = [(const_names[k], 'const', dtype, const_sizes[k]) \
389 |                     for k in range(len(const_sizes))]
390 |     const_params.append(('pemc', 'const', params['pemc'].dtype.type, 6))
391 | 
392 |     # Compile. (note shape_filter = 'square')
393 |     A_multiplication_fun = Kernel('', \
394 |                                   *(grid_params + const_params), \
395 |                                   pre_loop=code_allpre, \
396 |                                   padding=(1,1,1,1), \
397 |                                   smem_per_thread=num_shared_banks*16, \
398 |                                   shape_filter='square')
399 | 
400 |     # Temporary variables.
401 | 
402 |     # Grid variables.
403 |     # !!!!! eps is scattered over the GPUs when e is initialised here
404 |     e = [Grid(dtype(f), x_overlap=1) for f in params['e']]
405 |     m = [Grid(dtype(f), x_overlap=1) for f in params['m']] # Optional.
406 | 
407 |     # Constant variables.
408 |     sc_pml_0 = [Const(dtype(s**-1)) for s in params['s']]
409 |     sc_pml_1 = [Const(dtype(t**-1)) for t in params['t']]
410 |     sqrt_sc_pml_0 = [Const(dtype(np.sqrt(s**-1))) for s in params['s']]
411 |     sqrt_sc_pml_1 = [Const(dtype(np.sqrt(t**-1))) for t in params['t']]
412 |     bloch_x = [Const(dtype(params['bloch_phase'][0]))]
413 |     bloch_y = [Const(dtype(params['bloch_phase'][1]))]
414 |     bloch_z = [Const(dtype(params['bloch_phase'][2]))]
415 |     pemc = [Const(params['pemc'])]
416 | 
417 |     # Define the function
418 |     def gpu_fdfd_matrix_multiplication(X, B):
419 |         # Execute cuda code.
420 |         A_multiplication_fun( \
421 |             *(X + B + e + m + \
422 |               sc_pml_0 + sc_pml_1 + sqrt_sc_pml_0 + sqrt_sc_pml_1 + \
423 |               bloch_x + bloch_y + bloch_z + pemc), \
424 |             post_sync = B)
425 | 
426 |     return gpu_fdfd_matrix_multiplication
427 | 
428 | 
429 | def make_gpu_cond(dtype, cond):
430 |     """ Returns a function gpu_cond(A) that does A=A*C """
431 |     # GPU Code in gce.Kernel
432 |     code = Template("""
433 |         if (_in_global) {
434 |             Ax(0,0,0) = Ax(0,0,0)*Cx(0,0,0);
435 |             Ay(0,0,0) = Ay(0,0,0)*Cy(0,0,0);
436 |             Az(0,0,0) = Az(0,0,0)*Cz(0,0,0);
437 |         } """).render(type=_get_cuda_type(dtype))
438 | 
439 |     # Compile the code using gce.Kernel
440 |     grid_names = [A + i for A in ['A', 'C'] for i in ['x', 'y', 'z']]
441 |     Sum_fun = Kernel(code, \
442 |                      *[(name, 'grid', dtype) for name in grid_names], \
443 |                      shape_filter='skinny')
444 | 
445 |     C = cond
446 | 
447 |     # Define the actual function.
448 |     def gpu_cond(A):
449 |         Sum_fun(*( A + C ), \
450 |                 post_sync=A) # A must be post-synced before later kernels read it.
451 | 
452 |     return gpu_cond
453 | 
454 | 
455 | def make_DB_get_vec(dtype):
456 |     """ Returns a function that copies a Grid vector to host memory. """
457 |     # Temporary grid used to stage the copy.
458 |     temp = [Grid(dtype, x_overlap=1) for k in range(3)]
459 |     gpu_scaled_copy = make_gpu_scaled_copy(dtype)
460 | 
461 |     # Define the actual function.
462 |     def DB_get_vec(A):
463 |         gpu_scaled_copy(A, 1, temp)
464 |         out = [E.get() for E in temp]
465 |         return out
466 | 
467 |     return DB_get_vec
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 | 
4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 | 
8 | Preamble
9 | 
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 | 
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users.
We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". 
"Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. 
For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 
204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 
268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. 
But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 
387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. 
You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. 
"Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 
564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 
628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail. 651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/philosophy/why-not-lgpl.html>. 675 | --------------------------------------------------------------------------------
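
A brief illustration of the conditioning step defined in make_gpu_cond above: the CUDA template multiplies each field component by the matching conditioner component at every grid cell (A <- A*C). The same arithmetic can be sketched on the host with plain numpy; the shapes and values below are hypothetical, and the real kernel operates on gce.Grid arrays distributed across the GPUs, so this is an explanatory sketch rather than the actual implementation.

import numpy as np

# Hypothetical stand-ins for the (Ax, Ay, Az) field grids and the
# (Cx, Cy, Cz) conditioner grids; the real code uses gce.Grid objects.
shape = (4, 4, 4)
A = [np.full(shape, 1 + 1j, dtype=np.complex64) for _ in range(3)]
C = [np.full(shape, 2 + 0j, dtype=np.complex64) for _ in range(3)]

# Host equivalent of gpu_cond(A): component-wise A <- A*C, matching the
# per-cell statements Ax(0,0,0) = Ax(0,0,0)*Cx(0,0,0), etc.
for a, c in zip(A, C):
    a *= c

assert A[0][0, 0, 0] == np.complex64(2 + 2j)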