├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── AUTHORS ├── LICENSE ├── README.rst ├── gimmik ├── __init__.py ├── _version.py ├── base.py ├── c.py ├── copenmp.py ├── cuda.py ├── hip.py ├── ispc.py ├── kernels │ ├── c-openmp │ │ └── cstream.mako │ ├── c │ │ └── cstream.mako │ ├── cuda │ │ ├── base.mako │ │ ├── bstream-msplit.mako │ │ ├── bstream.mako │ │ ├── cstream-ksplit.mako │ │ └── cstream.mako │ ├── hip │ │ ├── base.mako │ │ ├── bstream-msplit.mako │ │ ├── bstream.mako │ │ ├── cstream-ksplit.mako │ │ └── cstream.mako │ ├── ispc │ │ └── cstream.mako │ ├── metal │ │ ├── base.mako │ │ ├── bstream-msplit.mako │ │ ├── bstream.mako │ │ ├── cstream-ksplit.mako │ │ └── cstream.mako │ └── opencl │ │ ├── bstream-msplit.mako │ │ ├── bstream.mako │ │ ├── cstream-ksplit.mako │ │ └── cstream.mako ├── metal.py └── opencl.py └── setup.py /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | workflow_dispatch: 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | deploy: 21 | 22 | runs-on: ubuntu-latest 23 | 24 | steps: 25 | - uses: actions/checkout@v3 26 | - name: Set up Python 27 | uses: actions/setup-python@v3 28 | with: 29 | python-version: '3.x' 30 | - name: Install dependencies 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install build 34 | - name: Build package 35 | run: python -m build 36 | - name: Publish package 37 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 38 | with: 39 | user: __token__ 40 | password: ${{ secrets.PYPI_API_TOKEN }} 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Freddie Witherden 2 | Bartosz Wozniak 3 | -------------------------------------------------------------------------------- 
/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, 2015, 2016 Freddie Witherden and Bartosz Wozniak 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of GiMMiK nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | GiMMiK 2 | ====== 3 | Generator of Matrix Multiplication Kernels - GiMMiK - is a tool for generation of high performance matrix multiplication kernel code for various accelerator platforms. Currently C, CUDA, HIP, ISPC, Metal, and OpenCL are supported. 4 | 5 | What does GiMMiK do? 6 | -------------------- 7 | Consider matrix multiplication of the form 8 | 9 | C = α∙A×B + β∙C 10 | 11 | GiMMiK generates fully unrolled kernels, highly specialised to a given operator matrix. The generated code is fully unrolled - each kernel computes a single column of the output matrix. GiMMiK was designed to perform well in a Block by Panel type of matrix multiplication where the operator matrix is small. GiMMiK also removes any sparsity from the operator matrix as well as attempts to reduce common sub-expressions. 12 | 13 | How do I install GiMMiK? 14 | ------------------------ 15 | Clone the git repository and use `setup.py` to install the GiMMiK package. You will need the following dependencies: 16 | 17 | * `mako <https://www.makotemplates.org/>`_ 18 | * `numpy >= 1.7 <https://numpy.org/>`_ 19 | 20 | Once obtained, you can install GiMMiK by running 21 | 22 | :: 23 | 24 | python setup.py install 25 | 26 | to perform a system-wide install. Alternatively, run 27 | :: 28 | 29 | python setup.py install --user 30 | 31 | to install the package locally. 32 | 33 | How do I use GiMMiK? 34 | -------------------- 35 | Once installed, you are ready to use GiMMiK. 36 | 37 | .. code:: python 38 | 39 | from gimmik import generate_mm 40 | 41 | ... 42 | 43 | # Generate a CUDA kernel for C = 2*mat*B 44 | src = generate_mm(mat, np.float32, platform='cuda', alpha=2.0, beta=0.0) 45 | 46 | ... 47 | 48 | Who uses GiMMiK? 49 | ---------------- 50 | GiMMiK was developed to improve performance of the `PyFR <https://www.pyfr.org/>`_ framework. 
51 | -------------------------------------------------------------------------------- /gimmik/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from gimmik._version import __version__ 4 | from gimmik.c import CMatMul 5 | from gimmik.copenmp import COpenMPMatMul 6 | from gimmik.cuda import CUDAMatMul 7 | from gimmik.ispc import ISPCMatMul 8 | from gimmik.hip import HIPMatMul 9 | from gimmik.metal import MetalMatMul 10 | from gimmik.opencl import OpenCLMatMul 11 | 12 | 13 | def generate_mm(mat, dtype, platform, alpha=1.0, beta=0.0, funcn='gimmik_mm', 14 | n=None, ldb=None, ldc=None): 15 | import warnings 16 | 17 | warnings.warn('generate_mm is deprecated, use MatMul', DeprecationWarning) 18 | 19 | platmap = { 20 | 'c': CMatMul, 21 | 'c-omp': COpenMPMatMul, 22 | 'cuda': CUDAMatMul, 23 | 'ispc': ISPCMatMul, 24 | 'hip': HIPMatMul, 25 | 'opencl': OpenCLMatMul 26 | } 27 | 28 | mm = platmap[platform](alpha*mat, beta, None, n, ldb, ldc) 29 | return next(mm.kernels(dtype, kname=funcn))[0] 30 | -------------------------------------------------------------------------------- /gimmik/_version.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | __version__ = '3.2.1' 4 | -------------------------------------------------------------------------------- /gimmik/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it 4 | import pkgutil 5 | import re 6 | 7 | from mako.lookup import TemplateLookup 8 | from mako.template import Template 9 | import numpy as np 10 | 11 | 12 | class _PlatformTemplateLookup(TemplateLookup): 13 | def __init__(self, platform): 14 | self.platform = platform 15 | 16 | def adjust_uri(self, uri, relto): 17 | return uri 18 | 19 | def get_template(self, name): 20 | platform = self.platform 21 | src = pkgutil.get_data(__name__, 
f'kernels/{platform}/{name}.mako') 22 | 23 | return Template(src, lookup=self) 24 | 25 | 26 | def _dot(bfn, row, maxsplit=1): 27 | nzixs, = np.nonzero(row) 28 | 29 | if not nzixs.size: 30 | return '0.0' 31 | 32 | nsplit = max(min(maxsplit, nzixs.size // 3), 1) 33 | snzixs = np.array_split(nzixs, nsplit) 34 | 35 | frags = [' + '.join(f'{row[i]}*{bfn(i)}' for i in ix) for ix in snzixs] 36 | return ' + '.join(f'({f})' for f in frags) 37 | 38 | 39 | def _partition(mat, into, by): 40 | if by == 'rows': 41 | return [list(range(i, len(mat), into)) for i in range(into)] 42 | elif by == 'cols': 43 | return [list(range(i, len(mat.T), into)) for i in range(into)] 44 | else: 45 | raise ValueError('Invalid partition by') 46 | 47 | 48 | def _chunk(l, chunksz): 49 | l, n = iter(l), len(l) 50 | nchunks = -(-n // chunksz) 51 | 52 | return [list(it.islice(l, chunksz)) for i in range(nchunks)] 53 | 54 | 55 | class MatMul: 56 | platform = None 57 | 58 | def __init__(self, A, beta=0.0, aligne=None, n=None, ldb=None, ldc=None): 59 | self.A = A 60 | self.beta = beta 61 | self.aligne = aligne 62 | 63 | if n is None and ldb is None and ldc is None: 64 | self.n = self.ldb = self.ldc = None 65 | elif n is not None and ldb is not None and ldc is not None: 66 | if aligne is not None and (ldb % aligne or ldc % aligne): 67 | raise ValueError('ldb/ldc not compatible with aligne') 68 | 69 | self.n, self.ldb, self.ldc = n, ldb, ldc 70 | else: 71 | raise ValueError('Must provide all of (n, ldb, ldc) or none') 72 | 73 | # Check the matrix has a non-zero 74 | if not A.any(): 75 | raise ValueError('A can not be empty') 76 | 77 | # Extract the shape of A 78 | self.m, self.k = m, k = A.shape 79 | 80 | # Determine the index of the first and last non-zero in each row of A 81 | self.afix = (A != 0).argmax(axis=1) 82 | self.alix = k - 1 - (A != 0)[:, ::-1].argmax(axis=1) 83 | 84 | # Mark rows of A which are all zero 85 | self.afix = np.where(np.any(A != 0, axis=1), self.afix, -1) 86 | self.alix = 
np.where(np.any(A != 0, axis=1), self.alix, -1) 87 | self.has_zero_rows = np.any(self.afix == -1) 88 | 89 | # Determine which entries of B partake in the multiplication 90 | self.bix = np.nonzero(np.any(A != 0, axis=0))[0] 91 | self.bix = {kx: k for k, kx in enumerate(self.bix)} 92 | 93 | def kernels(self, dtype, kname='gimmik_mm', **kwargs): 94 | basemeta = self.basemeta 95 | 96 | # Process the data type 97 | dtype = np.dtype(dtype).type 98 | if dtype == np.float32: 99 | dtype, dsize = 'float', 4 100 | elif dtype == np.float64: 101 | dtype, dsize = 'double', 8 102 | else: 103 | raise ValueError('Invalid floating point data type') 104 | 105 | # Common template arguments 106 | baseargs = { 107 | 'dtype': dtype, 'kname': kname, 108 | 'A': self.A, 'beta': self.beta, 'width': 1, 109 | 'm': self.m, 'n': self.n, 'k': self.k, 110 | 'ldb': self.ldb, 'ldc': self.ldc, 111 | 'afix': self.afix, 'alix': self.alix, 'bix': self.bix, 112 | 'dot': _dot, 'partition': _partition, 'chunk': _chunk 113 | } 114 | 115 | # Incrementally generate and render the kernels 116 | gen = self._kernel_generators(dtype, dsize, **kwargs) 117 | try: 118 | resp = None 119 | while True: 120 | # Generate the next kernel in the sequence 121 | name, exargs, exmeta = gen.send(resp) 122 | 123 | # Merge in the base arguments and metadata 124 | args = baseargs | exargs 125 | meta = basemeta | exmeta 126 | 127 | # Render the kernel template 128 | src = self._render_kernel(dtype, name, args) 129 | 130 | # Post-process the metadata 131 | meta['tplname'] = name 132 | self._process_meta(meta) 133 | 134 | # Yield the source and metadata and await a response 135 | resp = yield (src, meta) 136 | except StopIteration: 137 | pass 138 | 139 | def _process_meta(self, meta): 140 | pass 141 | 142 | def _render_kernel(self, dtype, tplname, tplargs): 143 | tpl = _PlatformTemplateLookup(self.platform).get_template(tplname) 144 | src = tpl.render(**tplargs) 145 | 146 | # At single precision suffix all floating point constants 
by 'f' 147 | if dtype == 'float': 148 | src = re.sub(r'(?=\d*[.eE])(?=\.?\d)\d*\.?\d*(?:[eE][+-]?\d+)?', 149 | r'\g<0>f', src) 150 | 151 | # Cleanup 152 | src = re.sub(r'^\w+\n$', '', src.strip()) 153 | src = re.sub(r'\n\n+', r'\n\n', src) + '\n' 154 | src = re.sub(r'\w+$', '', src) 155 | return src 156 | -------------------------------------------------------------------------------- /gimmik/c.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from gimmik.base import MatMul 4 | 5 | 6 | class CMatMul(MatMul): 7 | platform = 'c' 8 | basemeta = {} 9 | 10 | def _kernel_generators(self, dtype, dsize): 11 | yield ('cstream', {}, {}) 12 | -------------------------------------------------------------------------------- /gimmik/copenmp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from gimmik.base import MatMul 4 | 5 | 6 | class COpenMPMatMul(MatMul): 7 | platform = 'c-openmp' 8 | basemeta = {} 9 | 10 | def _kernel_generators(self, dtype, dsize): 11 | yield ('cstream', {}, {}) 12 | 13 | -------------------------------------------------------------------------------- /gimmik/cuda.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from gimmik.base import MatMul 4 | 5 | 6 | class CUDAMatMul(MatMul): 7 | platform = 'cuda' 8 | basemeta = {'block': (128, 1, 1), 'width': 1, 'shared': 0, 9 | 'dynamic_shared': 0} 10 | 11 | def _kernel_generators(self, dtype, dsize, *, compute_capability=None): 12 | # B loading, C streaming kernel 13 | yield ('cstream', {}, {}) 14 | 15 | # B streaming, C accumulation kernel 16 | yield ('bstream', {}, {}) 17 | 18 | # Four-way m-split B streaming, C accumulation kernel 19 | ms, bsz, blkx = 4, 24, 32 20 | args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} 21 | meta = {'block': (blkx, ms, 1), 'shared': 2*bsz*blkx*dsize} 22 | yield 
('bstream-msplit', args, meta) 23 | 24 | # Two-way k-split B loading, C streaming kernel 25 | ks, csz, blkx = 2, 24, 32 26 | args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} 27 | meta = {'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize} 28 | yield ('cstream-ksplit', args, meta) 29 | 30 | # At single precision also consider vectorized kernels 31 | if (dtype == 'float' and 32 | self.aligne is not None and self.aligne % 2 == 0): 33 | # Vector B loading, C streaming kernel 34 | args = {'dtype': 'float2', 'width': 2} 35 | meta = {'width': 2} 36 | yield ('cstream', args, meta) 37 | 38 | # Vector four-way m-split B streaming, C accumulation kernel 39 | ms, bsz, blkx = 4, 16, 32 40 | args = {'dtype': 'float2', 'width': 2, 'msplit': ms, 41 | 'bsz': bsz, 'blockx': blkx} 42 | meta = {'block': (blkx, ms, 1), 'width': 2, 43 | 'shared': 2*blkx*bsz*2*dsize} 44 | yield ('bstream-msplit', args, meta) 45 | 46 | # Vector two-way k-split B loading, C streaming kernel 47 | ks, csz, blkx = 2, 24, 32 48 | args = {'dtype': 'float2', 'width': 2, 'ksplit': ks, 49 | 'csz': csz, 'blockx': blkx} 50 | meta = {'block': (blkx, ks, 1), 'width': 2, 51 | 'shared': 2*(ks - 1)*csz*blkx*dsize} 52 | yield ('cstream-ksplit', args, meta) 53 | 54 | def _process_meta(self, meta): 55 | if self.n is not None: 56 | div = meta['block'][0]*meta['width'] 57 | meta['grid'] = (-(-self.n // div), 1, 1) 58 | -------------------------------------------------------------------------------- /gimmik/hip.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from gimmik.base import MatMul 4 | 5 | 6 | class HIPMatMul(MatMul): 7 | platform = 'hip' 8 | basemeta = {'block': (128, 1, 1), 'width': 1, 'shared': 0} 9 | 10 | def _kernel_generators(self, dtype, dsize, *, gcn_arch=None, warp_size=64): 11 | # B loading, C streaming kernel 12 | yield ('cstream', {}, {}) 13 | 14 | # B streaming, C accumulation kernel 15 | yield ('bstream', {}, {}) 16 | 17 | # 
Four-way m-split B streaming, C accumulation kernel 18 | ms, bsz, blkx = 4, 24, 64 19 | args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx} 20 | meta = {'block': (blkx, ms, 1), 'shared': 2*bsz*blkx*dsize} 21 | yield ('bstream-msplit', args, meta) 22 | 23 | # Two-way k-split B loading, C streaming kernel 24 | ks, csz, blkx = 2, 24, 64 25 | args = {'ksplit': ks, 'csz': csz, 'blockx': blkx} 26 | meta = {'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize} 27 | yield ('cstream-ksplit', args, meta) 28 | 29 | def _process_meta(self, meta): 30 | if self.n is not None: 31 | div = meta['block'][0]*meta['width'] 32 | meta['grid'] = (-(-self.n // div), 1, 1) 33 | -------------------------------------------------------------------------------- /gimmik/ispc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from gimmik.base import MatMul 4 | 5 | 6 | class ISPCMatMul(MatMul): 7 | platform = 'ispc' 8 | basemeta = {} 9 | 10 | def _kernel_generators(self, dtype, dsize): 11 | yield ('cstream', {}, {}) 12 | -------------------------------------------------------------------------------- /gimmik/kernels/c-openmp/cstream.mako: -------------------------------------------------------------------------------- 1 | void 2 | % if n is None: 3 | ${kname}(int n, 4 | const ${dtype}* restrict b, int ldb, 5 | ${dtype}* restrict c, int ldc) 6 | { 7 | % else: 8 | ${kname}(const ${dtype}* restrict b, ${dtype}* restrict c) 9 | { 10 | const int n = ${n}; 11 | const ${'long long' if k*ldb >= 2**31 else 'int'} ldb = ${ldb}; 12 | const ${'long long' if m*ldc >= 2**31 else 'int'} ldc = ${ldc}; 13 | % endif 14 | 15 | ## Every iteration writes only column i of c, so nothing needs privatising 16 | #pragma omp parallel for simd 17 | for (int i = 0; i < n; i++) 18 | { 19 | % for j, jx in enumerate(A): 20 | % if beta == 0: 21 | c[i + ${j}*ldc] = ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)}; 22 | % elif beta == 1: 23 | c[i + ${j}*ldc] += ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)}; 24 | % else: 25 | c[i + 
${j}*ldc] = ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)} 26 | + ${beta}*c[i + ${j}*ldc]; 27 | % endif 28 | % endfor 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /gimmik/kernels/c/cstream.mako: -------------------------------------------------------------------------------- 1 | void 2 | % if n is None: 3 | ${kname}(int n, 4 | const ${dtype}* restrict b, int ldb, 5 | ${dtype}* restrict c, int ldc) 6 | { 7 | % else: 8 | ${kname}(const ${dtype}* restrict b, ${dtype}* restrict c) 9 | { 10 | const int n = ${n}; 11 | const ${'long long' if k*ldb >= 2**31 else 'int'} ldb = ${ldb}; 12 | const ${'long long' if m*ldc >= 2**31 else 'int'} ldc = ${ldc}; 13 | % endif 14 | 15 | #pragma omp simd 16 | for (int i = 0; i < n; i++) 17 | { 18 | % for j, jx in enumerate(A): 19 | % if beta == 0: 20 | c[i + ${j}*ldc] = ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)}; 21 | % elif beta == 1: 22 | c[i + ${j}*ldc] += ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)}; 23 | % else: 24 | c[i + ${j}*ldc] = ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)} 25 | + ${beta}*c[i + ${j}*ldc]; 26 | % endif 27 | % endfor 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /gimmik/kernels/cuda/base.mako: -------------------------------------------------------------------------------- 1 | % if dtype.endswith('4'): 2 | inline __device__ ${dtype} operator+(${dtype} a, ${dtype} b) 3 | { return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } 4 | 5 | inline __device__ ${dtype} operator*(${dtype[:-1]} a, ${dtype} b) 6 | { return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); } 7 | 8 | inline __device__ void operator+=(${dtype} &a, ${dtype} b) 9 | { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; } 10 | 11 | inline __device__ ${dtype} make_zero() 12 | { return make_${dtype}(0, 0, 0, 0); } 13 | % elif dtype.endswith('2'): 14 | inline __device__ ${dtype} operator+(${dtype} a, ${dtype} b) 15 | { return make_${dtype}(a.x + b.x, a.y 
+ b.y); } 16 | 17 | inline __device__ ${dtype} operator*(${dtype[:-1]} a, ${dtype} b) 18 | { return make_${dtype}(a*b.x, a*b.y); } 19 | 20 | inline __device__ void operator+=(${dtype} &a, ${dtype} b) 21 | { a.x += b.x; a.y += b.y; } 22 | 23 | inline __device__ ${dtype} make_zero() 24 | { return make_${dtype}(0, 0); } 25 | % else: 26 | inline __device__ ${dtype} make_zero() 27 | { return 0; } 28 | % endif 29 | 30 | ${next.body()} 31 | -------------------------------------------------------------------------------- /gimmik/kernels/cuda/bstream-msplit.mako: -------------------------------------------------------------------------------- 1 | <%inherit file='base'/> 2 | 3 | <% 4 | mx = partition(A, into=msplit, by='rows') 5 | bchunks = chunk(bix, bsz) 6 | %> 7 | 8 | __global__ void 9 | % if n is None: 10 | ${kname}(int n, 11 | const ${dtype}* __restrict__ b, int ldb, 12 | ${dtype}* __restrict__ c, int ldc) 13 | { 14 | % if width > 1: 15 | n = ((n + ${width} - 1) / ${width}) * ${width}; 16 | ldb /= ${width}; 17 | ldc /= ${width}; 18 | % endif 19 | % else: 20 | ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) 21 | { 22 | const int n = ${-(-n // width)}; 23 | const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; 24 | const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; 25 | % endif 26 | int i = blockDim.x*blockIdx.x + threadIdx.x; 27 | 28 | ${dtype} bv, csub[${-(-m // msplit)}]; 29 | __shared__ ${dtype} bsub[2][${bsz}][${blockx}]; 30 | 31 | if (i >= n) 32 | return; 33 | 34 | ## Iterate over each row-chunk of C 35 | % for cid, mcx in enumerate(mx): 36 | if (threadIdx.y == ${cid}) 37 | { 38 | ## Iterate over each row-chunk of B 39 | % for bb in range(len(bchunks)): 40 | ## Fill the initial shared memory block 41 | % if loop.first: 42 | % for kx in bchunks[0]: 43 | % if loop.index % msplit == cid: 44 | bsub[0][${loop.index}][threadIdx.x] = __ldcg(b + i + ${kx}*ldb); 45 | % endif 46 | % endfor 47 | 
__barrier_sync(0); 48 | % endif 49 | ## Start filling the next shared memory block 50 | % if not loop.last: 51 | % for kx in bchunks[bb + 1]: 52 | % if loop.index % msplit == cid: 53 | bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = __ldcg(b + i + ${kx}*ldb); 54 | % endif 55 | % endfor 56 | % endif 57 | ## Accumulate our dot products 58 | % for kx in bchunks[bb]: 59 | bv = bsub[${bb % 2}][${loop.index}][threadIdx.x]; 60 | % for j, jx in enumerate(A[mcx, kx]): 61 | % if jx != 0 and kx == afix[mcx[j]]: 62 | csub[${j}] = ${jx}*bv; 63 | % elif jx != 0: 64 | csub[${j}] += ${jx}*bv; 65 | % endif 66 | ## If we're done with this dot product then store to global 67 | % if kx == alix[mcx[j]] and beta == 0: 68 | __stcg(c + i + ${mcx[j]}*ldc, csub[${j}]); 69 | % elif kx == alix[mcx[j]] and beta == 1: 70 | c[i + ${mcx[j]}*ldc] += csub[${j}]; 71 | % elif kx == alix[mcx[j]]: 72 | c[i + ${mcx[j]}*ldc] = csub[${j}] + ${beta}*c[i + ${mcx[j]}*ldc]; 73 | % endif 74 | % endfor 75 | % endfor 76 | __barrier_sync(0); 77 | % endfor 78 | ## Handle rows of A which are all zero 79 | % for j, jx in enumerate(afix): 80 | % if jx == -1 and j % msplit == cid and beta == 0: 81 | __stcg(c + i + ${j}*ldc, make_zero()); 82 | % elif jx == -1 and j % msplit == cid and beta != 1: 83 | c[i + ${j}*ldc] *= ${beta}; 84 | % endif 85 | % endfor 86 | } 87 | % endfor 88 | } 89 | -------------------------------------------------------------------------------- /gimmik/kernels/cuda/bstream.mako: -------------------------------------------------------------------------------- 1 | <%inherit file='base'/> 2 | 3 | __global__ void 4 | % if n is None: 5 | ${kname}(int n, 6 | const ${dtype}* __restrict__ b, int ldb, 7 | ${dtype}* __restrict__ c, int ldc) 8 | { 9 | % if width > 1: 10 | n = ((n + ${width} - 1) / ${width}) * ${width}; 11 | ldb /= ${width}; 12 | ldc /= ${width}; 13 | % endif 14 | % else: 15 | ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) 16 | { 17 | const int n = ${-(-n // width)}; 
18 | const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; 19 | const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; 20 | % endif 21 | const int i = blockDim.x*blockIdx.x + threadIdx.x; 22 | 23 | if (i < n) 24 | { 25 | ${dtype} bv, csub[${m}]; 26 | 27 | ## Iterare through the used rows of B 28 | % for kx in bix: 29 | bv = __ldcg(b + i + ${kx}*ldb); 30 | % for j, jx in enumerate(A[:, kx]): 31 | % if jx != 0 and kx == afix[j]: 32 | csub[${j}] = ${jx}*bv; 33 | % elif jx != 0: 34 | csub[${j}] += ${jx}*bv; 35 | % endif 36 | ## 37 | % if kx == alix[j] and beta == 0: 38 | __stcg(c + i + ${j}*ldc, csub[${j}]); 39 | % elif kx == alix[j] and beta == 1: 40 | c[i + ${j}*ldc] += csub[${j}]; 41 | % elif kx == alix[j]: 42 | c[i + ${j}*ldc] = csub[${j}] + ${beta}*c[i + ${j}*ldc]; 43 | % endif 44 | % endfor 45 | % endfor 46 | 47 | ## Handle rows of A which are all zero 48 | % for j, jx in enumerate(afix): 49 | % if jx == -1 and beta == 0: 50 | c[i + ${j}*ldc] = make_zero(); 51 | % elif jx == -1 and beta != 1: 52 | c[i + ${j}*ldc] *= ${beta}; 53 | % endif 54 | % endfor 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /gimmik/kernels/cuda/cstream-ksplit.mako: -------------------------------------------------------------------------------- 1 | <%inherit file='base'/> 2 | 3 | <% 4 | kparts = partition(A, ksplit, by='cols') 5 | cchunks = chunk(range(m), csz) 6 | loaded = set() 7 | %> 8 | 9 | __global__ void 10 | % if n is None: 11 | ${kname}(int n, 12 | const ${dtype}* __restrict__ b, int ldb, 13 | ${dtype}* __restrict__ c, int ldc) 14 | { 15 | % if width > 1: 16 | n = ((n + ${width} - 1) / ${width}) * ${width}; 17 | ldb /= ${width}; 18 | ldc /= ${width}; 19 | % endif 20 | % else: 21 | ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) 22 | { 23 | const int n = ${-(-n // width)}; 24 | const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; 25 | 
const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; 26 | % endif 27 | const int i = blockDim.x*blockIdx.x + threadIdx.x; 28 | 29 | ${dtype} cv[${-(-csz // ksplit)}], bv[${-(-k // ksplit)}], dotp; 30 | __shared__ ${dtype} csub[${ksplit - 1}][${csz}][${blockx}]; 31 | 32 | if (i >= n) 33 | return; 34 | 35 | ## Iterate over the column-partitions of B 36 | % for bid, kbx in enumerate(kparts): 37 | if (threadIdx.y == ${bid}) 38 | { 39 | ## Iterate over the row-partitions of C 40 | % for cchunk in cchunks: 41 | ## Evaluate our partial dot products 42 | % for j in cchunk: 43 | ## Load in any missing parts of B 44 | % for kx in kbx: 45 | % if A[j, kx] != 0 and kx not in loaded: 46 | bv[${loop.index}] = __ldcg(b + i + ${kx}*ldb); <% loaded.add(kx) %> 47 | % endif 48 | % endfor 49 | % if (dotex := dot(lambda kx: f'bv[{kx}]', A[j, kbx])) != '0.0': 50 | dotp = ${dotex}; 51 | % else: 52 | dotp = make_zero(); 53 | % endif 54 | ## Save to a register 55 | % if loop.index % ksplit == bid: 56 | cv[${loop.index // ksplit}] = dotp; 57 | ## Save to shared memory 58 | % else: 59 | csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][threadIdx.x] = dotp; 60 | % endif 61 | % endfor 62 | __barrier_sync(0); 63 | ## Sum and output the final set of dot products 64 | % for j in cchunk: 65 | % if loop.index % ksplit == bid: 66 | dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]' 67 | for i in range(ksplit - 1))}; 68 | % if beta == 0: 69 | __stcg(c + i + ${j}*ldc, dotp); 70 | % elif beta == 1: 71 | c[i + ${j}*ldc] += dotp; 72 | % else: 73 | c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc]; 74 | % endif 75 | % endif 76 | % endfor 77 | __barrier_sync(0); 78 | % endfor 79 | } 80 | % endfor 81 | } 82 | -------------------------------------------------------------------------------- /gimmik/kernels/cuda/cstream.mako: -------------------------------------------------------------------------------- 1 | <%inherit file='base'/> 2 | 3 | 
<% ksplit = 2 if m < 36 else 1 %> 4 | 5 | __global__ void 6 | % if n is None: 7 | ${kname}(int n, 8 | const ${dtype}* __restrict__ b, int ldb, 9 | ${dtype}* __restrict__ c, int ldc) 10 | { 11 | % if width > 1: 12 | n = ((n + ${width} - 1) / ${width}) * ${width}; 13 | ldb /= ${width}; 14 | ldc /= ${width}; 15 | % endif 16 | % else: 17 | ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) 18 | { 19 | const int n = ${-(-n // width)}; 20 | const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; 21 | const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; 22 | % endif 23 | const int i = blockDim.x*blockIdx.x + threadIdx.x; 24 | ${dtype} dotp; 25 | 26 | if (i < n) 27 | { 28 | % for j, jx in enumerate(A): 29 | % if (dotex := dot(lambda kx: f'b[i + {kx}*ldb]', jx, maxsplit=ksplit)) != '0.0': 30 | dotp = ${dotex}; 31 | % else: 32 | dotp = make_zero(); 33 | % endif 34 | % if beta == 0: 35 | c[i + ${j}*ldc] = dotp; 36 | % elif beta == 1 and dotex != '0.0': 37 | c[i + ${j}*ldc] += dotp; 38 | % else: 39 | c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc]; 40 | % endif 41 | % endfor 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /gimmik/kernels/hip/base.mako: -------------------------------------------------------------------------------- 1 | % if dtype.endswith('4'): 2 | static inline __device__ ${dtype} make_zero() 3 | { return make_${dtype}(0, 0, 0, 0); } 4 | % elif dtype.endswith('2'): 5 | static inline __device__ ${dtype} make_zero() 6 | { return make_${dtype}(0, 0); } 7 | % else: 8 | static inline __device__ ${dtype} make_zero() 9 | { return 0; } 10 | % endif 11 | 12 | ${next.body()} 13 | -------------------------------------------------------------------------------- /gimmik/kernels/hip/bstream-msplit.mako: -------------------------------------------------------------------------------- 1 | <%inherit file='base'/> 2 | 3 | <% 4 | mx = partition(A, into=msplit, 
by='rows') 5 | bchunks = chunk(bix, bsz) 6 | %> 7 | 8 | __global__ __launch_bounds__(${blockx*msplit}) void 9 | % if n is None: 10 | ${kname}(int n, 11 | const ${dtype}* __restrict__ b, int ldb, 12 | ${dtype}* __restrict__ c, int ldc) 13 | { 14 | % if width > 1: 15 | n = ((n + ${width} - 1) / ${width}) * ${width}; 16 | ldb /= ${width}; 17 | ldc /= ${width}; 18 | % endif 19 | % else: 20 | ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) 21 | { 22 | const int n = ${-(-n // width)}; 23 | const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; 24 | const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; 25 | % endif 26 | int i = blockDim.x*blockIdx.x + threadIdx.x; 27 | 28 | ${dtype} bv, csub[${-(-m // msplit)}]; 29 | __shared__ ${dtype} bsub[2][${bsz}][${blockx}]; 30 | 31 | ## Fill the initial shared memory block 32 | % for cid in range(msplit): 33 | if (i < n && threadIdx.y == ${cid}) 34 | { 35 | % for kx in bchunks[0]: 36 | % if loop.index % msplit == cid: 37 | bsub[0][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb]; 38 | % endif 39 | % endfor 40 | } 41 | % endfor 42 | __syncthreads(); 43 | 44 | ## Iterate over each row-chunk of B 45 | % for bb in range(len(bchunks)): 46 | ## Iterate over each row-chunk of C 47 | % for cid, mcx in enumerate(mx): 48 | if (i < n && threadIdx.y == ${cid}) 49 | { 50 | ## Start filling the next shared memory block 51 | % if not loop.parent.last: 52 | % for kx in bchunks[bb + 1]: 53 | % if loop.index % msplit == cid: 54 | bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb]; 55 | % endif 56 | % endfor 57 | % endif 58 | ## Accumulate our dot products 59 | % for kx in bchunks[bb]: 60 | bv = bsub[${bb % 2}][${loop.index}][threadIdx.x]; 61 | % for j, jx in enumerate(A[mcx, kx]): 62 | % if jx != 0 and kx == afix[mcx[j]]: 63 | csub[${j}] = ${jx}*bv; 64 | % elif jx != 0: 65 | csub[${j}] += ${jx}*bv; 66 | % endif 67 | ## If we're done with this dot product then store 
to global 68 | % if kx == alix[mcx[j]] and beta == 0: 69 | c[i + ${mcx[j]}*ldc] = csub[${j}]; 70 | % elif kx == alix[mcx[j]] and beta == 1: 71 | c[i + ${mcx[j]}*ldc] += csub[${j}]; 72 | % elif kx == alix[mcx[j]]: 73 | c[i + ${mcx[j]}*ldc] = csub[${j}] + ${beta}*c[i + ${mcx[j]}*ldc]; 74 | % endif 75 | % endfor 76 | % endfor 77 | ## Handle rows of A which are all zero 78 | % if loop.parent.last: 79 | % for j, jx in enumerate(afix): 80 | % if jx == -1 and j % msplit == cid and beta == 0: 81 | c[i + ${j}*ldc] = make_zero(); 82 | % elif jx == -1 and j % msplit == cid and beta != 1: 83 | c[i + ${j}*ldc] *= ${beta}; 84 | % endif 85 | % endfor 86 | % endif 87 | } 88 | % endfor 89 | __syncthreads(); 90 | % endfor 91 | } 92 | -------------------------------------------------------------------------------- /gimmik/kernels/hip/bstream.mako: -------------------------------------------------------------------------------- 1 | <%inherit file='base'/> 2 | 3 | __global__ __launch_bounds__(128) void 4 | % if n is None: 5 | ${kname}(int n, 6 | const ${dtype}* __restrict__ b, int ldb, 7 | ${dtype}* __restrict__ c, int ldc) 8 | { 9 | % if width > 1: 10 | n = ((n + ${width} - 1) / ${width}) * ${width}; 11 | ldb /= ${width}; 12 | ldc /= ${width}; 13 | % endif 14 | % else: 15 | ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) 16 | { 17 | const int n = ${-(-n // width)}; 18 | const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; 19 | const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; 20 | % endif 21 | const int i = blockDim.x*blockIdx.x + threadIdx.x; 22 | 23 | if (i < n) 24 | { 25 | ${dtype} bv, csub[${m}]; 26 | 27 | ## Iterate through the used rows of B 28 | % for kx in bix: 29 | bv = b[i + ${kx}*ldb]; 30 | % for j, jx in enumerate(A[:, kx]): 31 | % if jx != 0 and kx == afix[j]: 32 | csub[${j}] = ${jx}*bv; 33 | % elif jx != 0: 34 | csub[${j}] += ${jx}*bv; 35 | % endif 36 | ## If we're done with this dot product then store to global 37 | % if kx == alix[j] and beta == 0: 38
| c[i + ${j}*ldc] = csub[${j}]; 39 | % elif kx == alix[j] and beta == 1: 40 | c[i + ${j}*ldc] += csub[${j}]; 41 | % elif kx == alix[j]: 42 | c[i + ${j}*ldc] = csub[${j}] + ${beta}*c[i + ${j}*ldc]; 43 | % endif 44 | % endfor 45 | % endfor 46 | 47 | ## Handle rows of A which are all zero 48 | % for j, jx in enumerate(afix): 49 | % if jx == -1 and beta == 0: 50 | c[i + ${j}*ldc] = make_zero(); 51 | % elif jx == -1 and beta != 1: 52 | c[i + ${j}*ldc] *= ${beta}; 53 | % endif 54 | % endfor 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /gimmik/kernels/hip/cstream-ksplit.mako: -------------------------------------------------------------------------------- 1 | <%inherit file='base'/> 2 | 3 | <% 4 | kparts = partition(A, ksplit, by='cols') 5 | cchunks = chunk(range(m), csz) 6 | loaded = set() 7 | %> 8 | 9 | __global__ __launch_bounds__(${blockx*ksplit}) void 10 | % if n is None: 11 | ${kname}(int n, 12 | const ${dtype}* __restrict__ b, int ldb, 13 | ${dtype}* __restrict__ c, int ldc) 14 | { 15 | % if width > 1: 16 | n = ((n + ${width} - 1) / ${width}) * ${width}; 17 | ldb /= ${width}; 18 | ldc /= ${width}; 19 | % endif 20 | % else: 21 | ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) 22 | { 23 | const int n = ${-(-n // width)}; 24 | const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; 25 | const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; 26 | % endif 27 | int i = blockDim.x*blockIdx.x + threadIdx.x; 28 | 29 | ${dtype} cv[${-(-csz // ksplit)}], bv[${-(-k // ksplit)}], dotp; 30 | __shared__ ${dtype} csub[${ksplit - 1}][${csz}][${blockx}]; 31 | 32 | ## Iterate over the row-partitions of C 33 | % for cchunk in cchunks: 34 | ## Iterate over the row-partitions of B 35 | % for bid, kbx in enumerate(kparts): 36 | if (i < n && threadIdx.y == ${bid}) 37 | { 38 | ## Evaluate our partial dot products 39 | % for j in cchunk: 40 | ## Load in any missing parts 
of B 41 | % for kx in kbx: 42 | % if A[j, kx] != 0 and kx not in loaded: 43 | bv[${loop.index}] = b[i + ${kx}*ldb]; <% loaded.add(kx) %> 44 | % endif 45 | % endfor 46 | % if (dotex := dot(lambda kx: f'bv[{kx}]', A[j, kbx])) != '0.0': 47 | dotp = ${dotex}; 48 | % else: 49 | dotp = make_zero(); 50 | % endif 51 | ## Save to a register 52 | % if loop.index % ksplit == bid: 53 | cv[${loop.index // ksplit}] = dotp; 54 | ## Save to shared memory 55 | % else: 56 | csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][threadIdx.x] = dotp; 57 | % endif 58 | % endfor 59 | } 60 | % endfor 61 | __syncthreads(); 62 | ## Iterate over the column-partitions of B 63 | % for bid, kbx in enumerate(kparts): 64 | if (i < n && threadIdx.y == ${bid}) 65 | { 66 | ## Sum and output the final set of dot products 67 | % for j in cchunk: 68 | % if loop.index % ksplit == bid: 69 | dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]' 70 | for i in range(ksplit - 1))}; 71 | % if beta == 0: 72 | c[i + ${j}*ldc] = dotp; 73 | % elif beta == 1: 74 | c[i + ${j}*ldc] += dotp; 75 | % else: 76 | c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc]; 77 | % endif 78 | % endif 79 | % endfor 80 | } 81 | % endfor 82 | __syncthreads(); 83 | % endfor 84 | } 85 | -------------------------------------------------------------------------------- /gimmik/kernels/hip/cstream.mako: -------------------------------------------------------------------------------- 1 | <%inherit file='base'/> 2 | 3 | <% ksplit = 2 if m < 36 else 1 %> 4 | 5 | __global__ __launch_bounds__(128) void 6 | % if n is None: 7 | ${kname}(int n, 8 | const ${dtype}* __restrict__ b, int ldb, 9 | ${dtype}* __restrict__ c, int ldc) 10 | { 11 | % if width > 1: 12 | n = ((n + ${width} - 1) / ${width}) * ${width}; 13 | ldb /= ${width}; 14 | ldc /= ${width}; 15 | % endif 16 | % else: 17 | ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c) 18 | { 19 | const int n = ${-(-n // width)}; 20 | const ${'long 
long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; 21 | const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; 22 | % endif 23 | const int i = blockDim.x*blockIdx.x + threadIdx.x; 24 | ${dtype} dotp; 25 | 26 | if (i < n) 27 | { 28 | % for j, jx in enumerate(A): 29 | % if (dotex := dot(lambda kx: f'b[i + {kx}*ldb]', jx, maxsplit=ksplit)) != '0.0': 30 | dotp = ${dotex}; 31 | % else: 32 | dotp = make_zero(); 33 | % endif 34 | % if beta == 0: 35 | c[i + ${j}*ldc] = dotp; 36 | % elif beta == 1 and dotex != '0.0': 37 | c[i + ${j}*ldc] += dotp; 38 | % else: 39 | c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc]; 40 | % endif 41 | % endfor 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /gimmik/kernels/ispc/cstream.mako: -------------------------------------------------------------------------------- 1 | export void 2 | % if n is None: 3 | ${kname}(uniform int n, 4 | const uniform ${dtype} b[], uniform int ldb, 5 | ${dtype} uniform c[], uniform int ldc) 6 | { 7 | % else: 8 | ${kname}(const uniform ${dtype} b[], ${dtype} uniform c[]) 9 | { 10 | const uniform int n = ${n}; 11 | const uniform ${'long long' if k*ldb >= 2**31 else 'int'} ldb = ${ldb}; 12 | const uniform ${'long long' if m*ldc >= 2**31 else 'int'} ldc = ${ldc}; 13 | % endif 14 | 15 | foreach (i = 0 ... 
n) 16 | { 17 | % for j, jx in enumerate(A): 18 | % if beta == 0: 19 | c[i + ${j}*ldc] = ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)}; 20 | % elif beta == 1: 21 | c[i + ${j}*ldc] += ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)}; 22 | % else: 23 | c[i + ${j}*ldc] = ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)} 24 | + ${beta}*c[i + ${j}*ldc]; 25 | % endif 26 | % endfor 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /gimmik/kernels/metal/base.mako: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | using namespace metal; 4 | 5 | % if dtype.endswith('4'): 6 | static inline ${dtype} make_zero() 7 | { return ${dtype}(0, 0, 0, 0); } 8 | % elif dtype.endswith('2'): 9 | static inline ${dtype} make_zero() 10 | { return ${dtype}(0, 0); } 11 | % else: 12 | static inline ${dtype} make_zero() 13 | { return 0; } 14 | % endif 15 | 16 | ${next.body()} 17 | -------------------------------------------------------------------------------- /gimmik/kernels/metal/bstream-msplit.mako: -------------------------------------------------------------------------------- 1 | <%inherit file='base'/> 2 | 3 | <% 4 | mx = partition(A, into=msplit, by='rows') 5 | bchunks = chunk(bix, bsz) 6 | %> 7 | 8 | kernel void 9 | % if n is None: 10 | ${kname}(constant int& n_, 11 | device ${dtype}* b, constant int& ldb_, 12 | device ${dtype}* c, constant int& ldc_, 13 | uint2 tpig [[thread_position_in_grid]], 14 | uint2 tpitg [[thread_position_in_threadgroup]]) 15 | { 16 | const int n = ((n_ + ${width} - 1) / ${width}) * ${width}; 17 | const int ldb = ldb_ / ${width}; 18 | const int ldc = ldc_ / ${width}; 19 | % else: 20 | ${kname}(device const ${dtype}* b, device ${dtype}* c, 21 | uint2 tpig [[thread_position_in_grid]], 22 | uint2 tpitg [[thread_position_in_threadgroup]]) 23 | { 24 | const int n = ${-(-n // width)}; 25 | const ${'long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; 26 | const ${'long' if 
m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; 27 | % endif 28 | const int i = tpig.x; 29 | 30 | ${dtype} bv, csub[${-(-m // msplit)}]; 31 | threadgroup ${dtype} bsub[2][${bsz}][${blockx}]; 32 | 33 | ## Fill the initial shared memory block 34 | % for cid in range(msplit): 35 | if (i < n && tpitg.y == ${cid}) 36 | { 37 | % for kx in bchunks[0]: 38 | % if loop.index % msplit == cid: 39 | bsub[0][${loop.index}][tpitg.x] = b[i + ${kx}*ldb]; 40 | % endif 41 | % endfor 42 | } 43 | % endfor 44 | threadgroup_barrier(mem_flags::mem_threadgroup); 45 | 46 | ## Iterate over each row-chunk of B 47 | % for bb in range(len(bchunks)): 48 | ## Iterate over each row-chunk of C 49 | % for cid, mcx in enumerate(mx): 50 | if (i < n && tpitg.y == ${cid}) 51 | { 52 | ## Start filling the next shared memory block 53 | % if not loop.parent.last: 54 | % for kx in bchunks[bb + 1]: 55 | % if loop.index % msplit == cid: 56 | bsub[${(bb + 1) % 2}][${loop.index}][tpitg.x] = b[i + ${kx}*ldb]; 57 | % endif 58 | % endfor 59 | % endif 60 | ## Accumulate our dot products 61 | % for kx in bchunks[bb]: 62 | bv = bsub[${bb % 2}][${loop.index}][tpitg.x]; 63 | % for j, jx in enumerate(A[mcx, kx]): 64 | % if jx != 0 and kx == afix[mcx[j]]: 65 | csub[${j}] = ${jx}*bv; 66 | % elif jx != 0: 67 | csub[${j}] += ${jx}*bv; 68 | % endif 69 | ## If we're done with this dot product then store to global 70 | % if kx == alix[mcx[j]] and beta == 0: 71 | c[i + ${mcx[j]}*ldc] = csub[${j}]; 72 | % elif kx == alix[mcx[j]] and beta == 1: 73 | c[i + ${mcx[j]}*ldc] += csub[${j}]; 74 | % elif kx == alix[mcx[j]]: 75 | c[i + ${mcx[j]}*ldc] = csub[${j}] + ${beta}*c[i + ${mcx[j]}*ldc]; 76 | % endif 77 | % endfor 78 | % endfor 79 | ## Handle rows of A which are all zero 80 | % if loop.parent.last: 81 | % for j, jx in enumerate(afix): 82 | % if jx == -1 and j % msplit == cid and beta == 0: 83 | c[i + ${j}*ldc] = make_zero(); 84 | % elif jx == -1 and j % msplit == cid and beta != 1: 85 | c[i + ${j}*ldc] *= ${beta}; 86 | % 
endif 87 | % endfor 88 | % endif 89 | } 90 | % endfor 91 | threadgroup_barrier(mem_flags::mem_threadgroup); 92 | % endfor 93 | } 94 | -------------------------------------------------------------------------------- /gimmik/kernels/metal/bstream.mako: -------------------------------------------------------------------------------- 1 | <%inherit file='base'/> 2 | 3 | kernel void 4 | % if n is None: 5 | ${kname}(constant int& n_, 6 | device ${dtype}* b, constant int& ldb_, 7 | device ${dtype}* c, constant int& ldc_, 8 | uint i [[thread_position_in_grid]]) 9 | { 10 | const int n = ((n_ + ${width} - 1) / ${width}) * ${width}; 11 | const int ldb = ldb_ / ${width}; 12 | const int ldc = ldc_ / ${width}; 13 | % else: 14 | ${kname}(device const ${dtype}* b, device ${dtype}* c, 15 | uint i [[thread_position_in_grid]]) 16 | { 17 | const int n = ${-(-n // width)}; 18 | const ${'long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; 19 | const ${'long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; 20 | % endif 21 | 22 | if (i < n) 23 | { 24 | ${dtype} bv, csub[${m}]; 25 | 26 | ## Iterate through the used rows of B 27 | % for kx in bix: 28 | bv = b[i + ${kx}*ldb]; 29 | % for j, jx in enumerate(A[:, kx]): 30 | % if jx != 0 and kx == afix[j]: 31 | csub[${j}] = ${jx}*bv; 32 | % elif jx != 0: 33 | csub[${j}] += ${jx}*bv; 34 | % endif 35 | ## If we're done with this dot product then store to global 36 | % if kx == alix[j] and beta == 0: 37 | c[i + ${j}*ldc] = csub[${j}]; 38 | % elif kx == alix[j] and beta == 1: 39 | c[i + ${j}*ldc] += csub[${j}]; 40 | % elif kx == alix[j]: 41 | c[i + ${j}*ldc] = csub[${j}] + ${beta}*c[i + ${j}*ldc]; 42 | % endif 43 | % endfor 44 | % endfor 45 | 46 | ## Handle rows of A which are all zero 47 | % for j, jx in enumerate(afix): 48 | % if jx == -1 and beta == 0: 49 | c[i + ${j}*ldc] = make_zero(); 50 | % elif jx == -1 and beta != 1: 51 | c[i + ${j}*ldc] *= ${beta}; 52 | % endif 53 | % endfor 54 | } 55 | } 56 | -------------------------------------------------------------------------------- 
/gimmik/kernels/metal/cstream-ksplit.mako: -------------------------------------------------------------------------------- 1 | <%inherit file='base'/> 2 | 3 | <% 4 | kparts = partition(A, ksplit, by='cols') 5 | cchunks = chunk(range(m), csz) 6 | loaded = set() 7 | %> 8 | 9 | kernel void 10 | % if n is None: 11 | ${kname}(constant int& n_, 12 | device ${dtype}* b, constant int& ldb_, 13 | device ${dtype}* c, constant int& ldc_, 14 | uint2 tpig [[thread_position_in_grid]], 15 | uint2 tpitg [[thread_position_in_threadgroup]]) 16 | { 17 | const int n = ((n_ + ${width} - 1) / ${width}) * ${width}; 18 | const int ldb = ldb_ / ${width}; 19 | const int ldc = ldc_ / ${width}; 20 | % else: 21 | ${kname}(device const ${dtype}* b, device ${dtype}* c, 22 | uint2 tpig [[thread_position_in_grid]], 23 | uint2 tpitg [[thread_position_in_threadgroup]]) 24 | { 25 | const int n = ${-(-n // width)}; 26 | const ${'long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; 27 | const ${'long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; 28 | % endif 29 | const int i = tpig.x; 30 | 31 | ${dtype} cv[${-(-csz // ksplit)}], bv[${-(-k // ksplit)}], dotp; 32 | threadgroup ${dtype} csub[${ksplit - 1}][${csz}][${blockx}]; 33 | 34 | ## Iterate over the row-partitions of C 35 | % for cchunk in cchunks: 36 | ## Iterate over the row-partitions of B 37 | % for bid, kbx in enumerate(kparts): 38 | if (i < n && tpitg.y == ${bid}) 39 | { 40 | ## Evaluate our partial dot products 41 | % for j in cchunk: 42 | ## Load in any missing parts of B 43 | % for kx in kbx: 44 | % if A[j, kx] != 0 and kx not in loaded: 45 | bv[${loop.index}] = b[i + ${kx}*ldb]; <% loaded.add(kx) %> 46 | % endif 47 | % endfor 48 | % if (dotex := dot(lambda kx: f'bv[{kx}]', A[j, kbx])) != '0.0': 49 | dotp = ${dotex}; 50 | % else: 51 | dotp = make_zero(); 52 | % endif 53 | ## Save to a register 54 | % if loop.index % ksplit == bid: 55 | cv[${loop.index // ksplit}] = dotp; 56 | ## Save to shared memory 57 | % else: 
58 | csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][tpitg.x] = dotp; 59 | % endif 60 | % endfor 61 | } 62 | % endfor 63 | threadgroup_barrier(mem_flags::mem_threadgroup); 64 | ## Iterate over the column-partitions of B 65 | % for bid, kbx in enumerate(kparts): 66 | if (i < n && tpitg.y == ${bid}) 67 | { 68 | ## Sum and output the final set of dot products 69 | % for j in cchunk: 70 | % if loop.index % ksplit == bid: 71 | dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][tpitg.x]' 72 | for i in range(ksplit - 1))}; 73 | % if beta == 0: 74 | c[i + ${j}*ldc] = dotp; 75 | % elif beta == 1: 76 | c[i + ${j}*ldc] += dotp; 77 | % else: 78 | c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc]; 79 | % endif 80 | % endif 81 | % endfor 82 | } 83 | % endfor 84 | threadgroup_barrier(mem_flags::mem_threadgroup); 85 | % endfor 86 | } 87 | -------------------------------------------------------------------------------- /gimmik/kernels/metal/cstream.mako: -------------------------------------------------------------------------------- 1 | <%inherit file='base'/> 2 | 3 | <% ksplit = 2 if m < 36 else 1 %> 4 | 5 | kernel void 6 | % if n is None: 7 | ${kname}(constant int& n_, 8 | device ${dtype}* b, constant int& ldb_, 9 | device ${dtype}* c, constant int& ldc_, 10 | uint i [[thread_position_in_grid]]) 11 | { 12 | const int n = ((n_ + ${width} - 1) / ${width}) * ${width}; 13 | const int ldb = ldb_ / ${width}; 14 | const int ldc = ldc_ / ${width}; 15 | % else: 16 | ${kname}(device const ${dtype}* b, device ${dtype}* c, 17 | uint i [[thread_position_in_grid]]) 18 | { 19 | const int n = ${-(-n // width)}; 20 | const ${'long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; 21 | const ${'long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; 22 | % endif 23 | ${dtype} dotp; 24 | 25 | if (i < n) 26 | { 27 | % for j, jx in enumerate(A): 28 | % if (dotex := dot(lambda kx: f'b[i + {kx}*ldb]', jx, maxsplit=ksplit)) != '0.0': 29 | dotp = 
${dotex}; 30 | % else: 31 | dotp = make_zero(); 32 | % endif 33 | % if beta == 0: 34 | c[i + ${j}*ldc] = dotp; 35 | % elif beta == 1 and dotex != '0.0': 36 | c[i + ${j}*ldc] += dotp; 37 | % else: 38 | c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc]; 39 | % endif 40 | % endfor 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /gimmik/kernels/opencl/bstream-msplit.mako: -------------------------------------------------------------------------------- 1 | <% 2 | mx = partition(A, into=msplit, by='rows') 3 | bchunks = chunk(bix, bsz) 4 | %> 5 | 6 | __kernel __attribute__((reqd_work_group_size(${blockx}, ${msplit}, 1))) void 7 | % if n is None: 8 | ${kname}(int n, 9 | __global const ${dtype}* restrict b, int ldb, 10 | __global ${dtype}* restrict c, int ldc) 11 | { 12 | % if width > 1: 13 | n = ((n + ${width} - 1) / ${width}) * ${width}; 14 | ldb /= ${width}; 15 | ldc /= ${width}; 16 | % endif 17 | % else: 18 | ${kname}(__global const ${dtype}* restrict b, __global ${dtype}* restrict c) 19 | { 20 | const int n = ${-(-n // width)}; 21 | const ${'long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; 22 | const ${'long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; 23 | % endif 24 | int i = get_global_id(0); 25 | int lx = get_local_id(0), ly = get_local_id(1); 26 | 27 | ${dtype} bv, csub[${-(-m // msplit)}]; 28 | __local ${dtype} bsub[2][${bsz}][${blockx}]; 29 | 30 | ## Fill the initial shared memory block 31 | % for cid in range(msplit): 32 | if (i < n && ly == ${cid}) 33 | { 34 | % for kx in bchunks[0]: 35 | % if loop.index % msplit == cid: 36 | bsub[0][${loop.index}][lx] = b[i + ${kx}*ldb]; 37 | % endif 38 | % endfor 39 | } 40 | % endfor 41 | work_group_barrier(CLK_LOCAL_MEM_FENCE); 42 | 43 | ## Iterate over each row-chunk of B 44 | % for bb in range(len(bchunks)): 45 | ## Iterate over each row-chunk of C 46 | % for cid, mcx in enumerate(mx): 47 | if (i < n && ly == ${cid}) 48 | { 49 | ## Start 
filling the next shared memory block 50 | % if not loop.parent.last: 51 | % for kx in bchunks[bb + 1]: 52 | % if loop.index % msplit == cid: 53 | bsub[${(bb + 1) % 2}][${loop.index}][lx] = b[i + ${kx}*ldb]; 54 | % endif 55 | % endfor 56 | % endif 57 | ## Accumulate our dot products 58 | % for kx in bchunks[bb]: 59 | bv = bsub[${bb % 2}][${loop.index}][lx]; 60 | % for j, jx in enumerate(A[mcx, kx]): 61 | % if jx != 0 and kx == afix[mcx[j]]: 62 | csub[${j}] = ${jx}*bv; 63 | % elif jx != 0: 64 | csub[${j}] += ${jx}*bv; 65 | % endif 66 | ## If we're done with this dot product then store to global 67 | % if kx == alix[mcx[j]] and beta == 0: 68 | c[i + ${mcx[j]}*ldc] = csub[${j}]; 69 | % elif kx == alix[mcx[j]] and beta == 1: 70 | c[i + ${mcx[j]}*ldc] += csub[${j}]; 71 | % elif kx == alix[mcx[j]]: 72 | c[i + ${mcx[j]}*ldc] = csub[${j}] + ${beta}*c[i + ${mcx[j]}*ldc]; 73 | % endif 74 | % endfor 75 | % endfor 76 | ## Handle rows of A which are all zero 77 | % if loop.parent.last: 78 | % for j, jx in enumerate(afix): 79 | % if jx == -1 and j % msplit == cid and beta == 0: 80 | c[i + ${j}*ldc] = 0; 81 | % elif jx == -1 and j % msplit == cid and beta != 1: 82 | c[i + ${j}*ldc] *= ${beta}; 83 | % endif 84 | % endfor 85 | % endif 86 | } 87 | % endfor 88 | work_group_barrier(CLK_LOCAL_MEM_FENCE); 89 | % endfor 90 | } 91 | -------------------------------------------------------------------------------- /gimmik/kernels/opencl/bstream.mako: -------------------------------------------------------------------------------- 1 | __kernel void 2 | % if n is None: 3 | ${kname}(int n, 4 | __global const ${dtype}* restrict b, int ldb, 5 | __global ${dtype}* restrict c, int ldc) 6 | { 7 | % else: 8 | ${kname}(__global const ${dtype}* restrict b, __global ${dtype}* restrict c) 9 | { 10 | const int n = ${n}; 11 | const ${'long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; 12 | const ${'long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; 13 | % endif 14 | int i = 
get_global_id(0); 15 | 16 | if (i < n) 17 | { 18 | ${dtype} bv, csub[${m}]; 19 | 20 | ## Iterate through the used rows of B 21 | % for kx in bix: 22 | bv = b[i + ${kx}*ldb]; 23 | % for j, jx in enumerate(A[:, kx]): 24 | % if jx != 0 and kx == afix[j]: 25 | csub[${j}] = ${jx}*bv; 26 | % elif jx != 0: 27 | csub[${j}] += ${jx}*bv; 28 | % endif 29 | ## If we're done with this dot product then store to global 30 | % if kx == alix[j] and beta == 0: 31 | c[i + ${j}*ldc] = csub[${j}]; 32 | % elif kx == alix[j] and beta == 1: 33 | c[i + ${j}*ldc] += csub[${j}]; 34 | % elif kx == alix[j]: 35 | c[i + ${j}*ldc] = csub[${j}] + ${beta}*c[i + ${j}*ldc]; 36 | % endif 37 | % endfor 38 | % endfor 39 | 40 | ## Handle rows of A which are all zero 41 | % for j, jx in enumerate(afix): 42 | % if jx == -1 and beta == 0: 43 | c[i + ${j}*ldc] = 0; 44 | % elif jx == -1 and beta != 1: 45 | c[i + ${j}*ldc] *= ${beta}; 46 | % endif 47 | % endfor 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /gimmik/kernels/opencl/cstream-ksplit.mako: -------------------------------------------------------------------------------- 1 | <% 2 | kparts = partition(A, ksplit, by='cols') 3 | cchunks = chunk(range(m), csz) 4 | loaded = set() 5 | %> 6 | 7 | __kernel __attribute__((reqd_work_group_size(${blockx}, ${ksplit}, 1))) void 8 | % if n is None: 9 | ${kname}(int n, 10 | __global const ${dtype}* restrict b, int ldb, 11 | __global ${dtype}* restrict c, int ldc) 12 | { 13 | % if width > 1: 14 | n = ((n + ${width} - 1) / ${width}) * ${width}; 15 | ldb /= ${width}; 16 | ldc /= ${width}; 17 | % endif 18 | % else: 19 | ${kname}(__global const ${dtype}* restrict b, __global ${dtype}* restrict c) 20 | { 21 | const int n = ${-(-n // width)}; 22 | const ${'long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width}; 23 | const ${'long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width}; 24 | % endif 25 | int i = get_global_id(0); 26 | int lx = get_local_id(0), ly = get_local_id(1); 27 | 28 | ${dtype} cv[${-(-csz // 
ksplit)}], bv[${-(-k // ksplit)}], dotp; 29 | __local ${dtype} csub[${ksplit - 1}][${csz}][${blockx}]; 30 | 31 | ## Iterate over the row-partitions of C 32 | % for cchunk in cchunks: 33 | ## Iterate over the row-partitions of B 34 | % for bid, kbx in enumerate(kparts): 35 | if (i < n && ly == ${bid}) 36 | { 37 | ## Evaluate our partial dot products 38 | % for j in cchunk: 39 | ## Load in any missing parts of B 40 | % for kx in kbx: 41 | % if A[j, kx] != 0 and kx not in loaded: 42 | bv[${loop.index}] = b[i + ${kx}*ldb]; <% loaded.add(kx) %> 43 | % endif 44 | % endfor 45 | % if (dotex := dot(lambda kx: f'bv[{kx}]', A[j, kbx])) != '0.0': 46 | dotp = ${dotex}; 47 | % else: 48 | dotp = 0; 49 | % endif 50 | ## Save to a register 51 | % if loop.index % ksplit == bid: 52 | cv[${loop.index // ksplit}] = dotp; 53 | ## Save to shared memory 54 | % else: 55 | csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][lx] = dotp; 56 | % endif 57 | % endfor 58 | } 59 | % endfor 60 | work_group_barrier(CLK_LOCAL_MEM_FENCE); 61 | ## Iterate over the column-partitions of B 62 | % for bid, kbx in enumerate(kparts): 63 | if (i < n && ly == ${bid}) 64 | { 65 | ## Sum and output the final set of dot products 66 | % for j in cchunk: 67 | % if loop.index % ksplit == bid: 68 | dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][lx]' 69 | for i in range(ksplit - 1))}; 70 | % if beta == 0: 71 | c[i + ${j}*ldc] = dotp; 72 | % elif beta == 1: 73 | c[i + ${j}*ldc] += dotp; 74 | % else: 75 | c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc]; 76 | % endif 77 | % endif 78 | % endfor 79 | } 80 | % endfor 81 | work_group_barrier(CLK_LOCAL_MEM_FENCE); 82 | % endfor 83 | } 84 | -------------------------------------------------------------------------------- /gimmik/kernels/opencl/cstream.mako: -------------------------------------------------------------------------------- 1 | __kernel void 2 | % if n is None: 3 | ${kname}(int n, 4 | __global const ${dtype}* restrict b, int ldb, 
# -*- coding: utf-8 -*-

from gimmik.base import MatMul


class MetalMatMul(MatMul):
    """Kernel candidate generator for the ``metal`` platform.

    Enumerates the kernel templates to try, along with the extra template
    arguments and metadata of each, and derives a launch grid from a
    candidate's metadata.
    """

    platform = 'metal'

    # Metadata defaults; presumably merged with the per-kernel metadata
    # yielded below by the base class -- confirm against MatMul
    basemeta = {
        'threadgroup': (128, 1, 1),
        'threadgroup_mem_size': 0,
        'width': 1,
    }

    def _kernel_generators(self, dtype, dsize):
        """Yield ``(name, args, meta)`` tuples for each candidate kernel.

        *name* is the kernel template name, *args* holds the extra
        template arguments and *meta* the extra metadata entries. *dsize*
        scales the element counts used for ``threadgroup_mem_size``
        (presumably the size in bytes of one scalar -- confirm against
        the caller). *dtype* is not consulted here.
        """
        # B loading, C streaming kernel followed by the B streaming,
        # C accumulation kernel; neither needs extra arguments or metadata
        for tplname in ('cstream', 'bstream'):
            yield (tplname, {}, {})

        # Four-way m-split B streaming, C accumulation kernels, tried with
        # two different values of bsz
        for msplit, bsz, blockx in ((4, 16, 32), (4, 20, 32)):
            yield (
                'bstream-msplit',
                {'msplit': msplit, 'blockx': blockx, 'bsz': bsz},
                {'threadgroup': (blockx, msplit, 1),
                 'threadgroup_mem_size': 2*blockx*bsz*dsize}
            )

        # Two-way k-split B loading, C streaming kernel
        ksplit, csz, blockx = 2, 20, 32
        yield (
            'cstream-ksplit',
            {'ksplit': ksplit, 'csz': csz, 'blockx': blockx},
            {'threadgroup': (blockx, ksplit, 1),
             'threadgroup_mem_size': (ksplit - 1)*csz*blockx*dsize}
        )

        # The remaining kernels operate on float2 and are only offered
        # when an even alignment is known
        if self.aligne is None or self.aligne % 2 != 0:
            return

        # The same two dictionaries are deliberately handed out for both
        # the vector cstream and vector bstream kernels
        vargs = {'dtype': 'float2', 'width': 2}
        vmeta = {'width': 2}

        # Vector B loading, C streaming kernel
        yield ('cstream', vargs, vmeta)

        # Vector B streaming, C accumulation kernel
        yield ('bstream', vargs, vmeta)

        # Vector four-way m-split B streaming, C accumulation kernel
        # NOTE(review): threadgroup_mem_size uses the same formula as the
        # scalar variant although the element type is float2 -- confirm
        # whether this is intended
        msplit, bsz, blockx = 4, 16, 32
        yield (
            'bstream-msplit',
            {'dtype': 'float2', 'width': 2, 'msplit': msplit,
             'blockx': blockx, 'bsz': bsz},
            {'threadgroup': (blockx, msplit, 1),
             'threadgroup_mem_size': 2*blockx*bsz*dsize, 'width': 2}
        )

    def _process_meta(self, meta):
        """Store a ``grid`` entry in *meta* when ``self.n`` is known.

        The first grid dimension is ``self.n`` divided by the kernel
        width, rounded up; the second is taken from the threadgroup.
        Nothing is done when ``self.n`` is ``None``.
        """
        if self.n is None:
            return

        threadgroup = meta['threadgroup']

        # -(-a // b) is ceiling division
        ncols = -(-self.n // meta['width'])
        meta['grid'] = (ncols, threadgroup[1], 1)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Packaging script for GiMMiK.  Checks the interpreter version, extracts
# the package version from gimmik/_version.py and hands the metadata to
# setuptools.

import re
import sys

from setuptools import setup


# Python version
if sys.version_info[:2] < (3, 9):
    print('GiMMiK requires Python 3.9 or newer')
    sys.exit(-1)

# GiMMiK version; the context manager ensures the file handle is closed
with open('gimmik/_version.py', encoding='utf-8') as f:
    vfile = f.read()

vsrch = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", vfile, re.M)

# Abort here rather than falling through to setup() with ``version``
# unbound, which would otherwise surface as an unrelated NameError
if not vsrch:
    print('Unable to find a version string in gimmik/_version.py')
    sys.exit(-1)

version = vsrch.group(1)

# Data
package_data = {
    'gimmik': ['kernels/*/*.mako'],
}

# Hard dependencies
install_requires = [
    'mako',
    'numpy >= 1.7'
]

# Info
classifiers = [
    'License :: OSI Approved :: BSD License',
    'Programming Language :: Python :: 3.9',
    'Programming Language :: Python :: 3.10',
    'Programming Language :: Python :: 3.11',
    'Topic :: Scientific/Engineering'
]

# Long Description
long_description = '''GiMMiK is a Python based kernel generator for
matrix multiplication kernels for various accelerator platforms.  For
small operator matrices the generated kernels are capable of
outperforming the state-of-the-art general matrix multiplication
routines such as cuBLAS GEMM or clBLAS GEMM.  GiMMiK was originally
developed as part of Bartosz Wozniak's master's thesis in the
Department of Computing at Imperial College London and is currently
maintained by Freddie Witherden.'''

# Keywords
keywords = ['Matrix Multiplication', 'ISPC', 'GPU', 'CUDA', 'HIP', 'Metal',
            'OpenCL']

setup(name='gimmik',
      version=version,

      # Packages
      packages=['gimmik'],
      package_data=package_data,
      install_requires=install_requires,

      # Metadata
      description='Generator of Matrix Multiplication Kernels',
      long_description=long_description,
      maintainer='Freddie Witherden',
      maintainer_email='freddie@witherden.org',
      url='https://github.com/vincentlab/GiMMiK',
      license='BSD',
      keywords=keywords,
      classifiers=classifiers)