├── .github
    └── workflows
    │   └── python-publish.yml
├── .gitignore
├── AUTHORS
├── LICENSE
├── README.rst
├── gimmik
    ├── __init__.py
    ├── _version.py
    ├── base.py
    ├── c.py
    ├── copenmp.py
    ├── cuda.py
    ├── hip.py
    ├── ispc.py
    ├── kernels
    │   ├── c-openmp
    │   │   └── cstream.mako
    │   ├── c
    │   │   └── cstream.mako
    │   ├── cuda
    │   │   ├── base.mako
    │   │   ├── bstream-msplit.mako
    │   │   ├── bstream.mako
    │   │   ├── cstream-ksplit.mako
    │   │   └── cstream.mako
    │   ├── hip
    │   │   ├── base.mako
    │   │   ├── bstream-msplit.mako
    │   │   ├── bstream.mako
    │   │   ├── cstream-ksplit.mako
    │   │   └── cstream.mako
    │   ├── ispc
    │   │   └── cstream.mako
    │   ├── metal
    │   │   ├── base.mako
    │   │   ├── bstream-msplit.mako
    │   │   ├── bstream.mako
    │   │   ├── cstream-ksplit.mako
    │   │   └── cstream.mako
    │   └── opencl
    │   │   ├── bstream-msplit.mako
    │   │   ├── bstream.mako
    │   │   ├── cstream-ksplit.mako
    │   │   └── cstream.mako
    ├── metal.py
    └── opencl.py
└── setup.py


/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
 3 | 
 4 | # This workflow uses actions that are not certified by GitHub.
 5 | # They are provided by a third-party and are governed by
 6 | # separate terms of service, privacy policy, and support
 7 | # documentation.
 8 | 
 9 | name: Upload Python Package
10 | 
11 | on:
12 |   release:
13 |     types: [published]
14 |   workflow_dispatch:  # also allow the workflow to be started manually
15 | 
16 | permissions:
17 |   contents: read
18 | 
19 | jobs:
20 |   deploy:
21 |     # Builds the distributions and hands them to the PyPI publish action
22 |     runs-on: ubuntu-latest
23 | 
24 |     steps:
25 |     - uses: actions/checkout@v4
26 |     - name: Set up Python
27 |       uses: actions/setup-python@v5
28 |       with:
29 |         python-version: '3.x'
30 |     - name: Install dependencies
31 |       run: |
32 |         python -m pip install --upgrade pip
33 |         python -m pip install build
34 |     - name: Build package
35 |       run: python -m build
36 |     - name: Publish package
37 |       uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
38 |       with:
39 |         user: __token__
40 |         password: ${{ secrets.PYPI_API_TOKEN }}
41 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | bin/
12 | build/
13 | develop-eggs/
14 | dist/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | 
25 | # Installer logs
26 | pip-log.txt
27 | pip-delete-this-directory.txt
28 | 
29 | # Unit test / coverage reports
30 | htmlcov/
31 | .tox/
32 | .coverage
33 | .cache
34 | nosetests.xml
35 | coverage.xml
36 | 
37 | # Translations
38 | *.mo
39 | 
40 | # Mr Developer
41 | .mr.developer.cfg
42 | .project
43 | .pydevproject
44 | 
45 | # Rope
46 | .ropeproject
47 | 
48 | # Django stuff:
49 | *.log
50 | *.pot
51 | 
52 | # Sphinx documentation
53 | docs/_build/
54 | 
55 | 


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | Freddie Witherden <freddie@witherden.org>
2 | Bartosz Wozniak <bartosz.wozniak10@imperial.ac.uk>
3 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2014, 2015, 2016 Freddie Witherden and Bartosz Wozniak
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | * Neither the name of GiMMiK nor the names of its
15 |   contributors may be used to endorse or promote products derived from
16 |   this software without specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | GiMMiK
 2 | ======
 3 | Generator of Matrix Multiplication Kernels - GiMMiK - is a tool for generation of high performance matrix multiplication kernel code for various accelerator platforms. Currently C, CUDA, HIP, ISPC, Metal, and OpenCL are supported.
 4 | 
 5 | What does GiMMiK do?
 6 | --------------------
 7 | Consider matrix multiplication of the form
 8 | 
 9 | C = α∙A×B + β∙C
10 | 
11 | GiMMiK generates fully unrolled kernels, highly specialised to a given operator matrix. The generated code is fully unrolled - each kernel computes a single column of the output matrix. GiMMiK was designed to perform well in a Block by Panel type of matrix multiplication where the operator matrix is small. GiMMiK also removes any sparsity from the operator matrix and attempts to reduce common sub-expressions.
12 | 
13 | How do I install GiMMiK?
14 | ------------------------
15 | Clone the git repository and use `setup.py` to install the GiMMiK package. You will need the following dependencies:
16 | 
17 | * `mako <http://www.makotemplates.org/>`_
18 | * `numpy >= 1.7 <http://www.numpy.org/>`_
19 | 
20 | Once obtained, you can install GiMMiK by running
21 | 
22 | ::
23 | 
24 |     python setup.py install
25 | 
26 | to perform a system-wide install. Alternatively, run
27 | ::
28 | 
29 |     python setup.py install --user
30 | 
31 | to install the package locally.
32 | 
33 | How do I use GiMMiK?
34 | --------------------
35 | Once installed, you are ready to use GiMMiK.
36 | 
37 | .. code:: python
38 | 
39 |     from gimmik import generate_mm
40 | 
41 |     ...
42 | 
43 |     # Generate a CUDA kernel for C = 2*mat*B
44 |     src = generate_mm(mat, np.float32, platform='cuda', alpha=2.0, beta=0.0)
45 | 
46 |     ...
47 | 
48 | Who uses GiMMiK?
49 | ----------------
50 | GiMMiK was developed to improve performance of the `PyFR <http://www.pyfr.org>`_ framework.
51 | 


--------------------------------------------------------------------------------
/gimmik/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from gimmik._version import __version__
 4 | from gimmik.c import CMatMul
 5 | from gimmik.copenmp import COpenMPMatMul
 6 | from gimmik.cuda import CUDAMatMul
 7 | from gimmik.ispc import ISPCMatMul
 8 | from gimmik.hip import HIPMatMul
 9 | from gimmik.metal import MetalMatMul
10 | from gimmik.opencl import OpenCLMatMul
11 | 
12 | 
13 | def generate_mm(mat, dtype, platform, alpha=1.0, beta=0.0, funcn='gimmik_mm',
14 |                 n=None, ldb=None, ldc=None):
15 |     import warnings
16 | 
17 |     warnings.warn('generate_mm is deprecated, use MatMul', DeprecationWarning)
18 | 
19 |     platmap = {
20 |         'c': CMatMul,
21 |         'c-omp': COpenMPMatMul,
22 |         'cuda': CUDAMatMul,
23 |         'ispc': ISPCMatMul,
24 |         'hip': HIPMatMul,
25 |         'opencl': OpenCLMatMul
26 |     }
27 | 
28 |     mm = platmap[platform](alpha*mat, beta, None, n, ldb, ldc)
29 |     return next(mm.kernels(dtype, kname=funcn))[0]
30 | 


--------------------------------------------------------------------------------
/gimmik/_version.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | __version__ = '3.2.1'  # re-exported by the package as gimmik.__version__
4 | 


--------------------------------------------------------------------------------
/gimmik/base.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import itertools as it
  4 | import pkgutil
  5 | import re
  6 | 
  7 | from mako.lookup import TemplateLookup
  8 | from mako.template import Template
  9 | import numpy as np
 10 | 
 11 | 
 12 | class _PlatformTemplateLookup(TemplateLookup):
 13 |     def __init__(self, platform):
 14 |         self.platform = platform
 15 | 
 16 |     def adjust_uri(self, uri, relto):
 17 |         return uri
 18 | 
 19 |     def get_template(self, name):
 20 |         platform = self.platform
 21 |         src = pkgutil.get_data(__name__, f'kernels/{platform}/{name}.mako')
 22 | 
 23 |         return Template(src, lookup=self)
 24 | 
 25 | 
 26 | def _dot(bfn, row, maxsplit=1):
 27 |     nzixs, = np.nonzero(row)
 28 | 
 29 |     if not nzixs.size:
 30 |         return '0.0'
 31 | 
 32 |     nsplit = max(min(maxsplit, nzixs.size // 3), 1)
 33 |     snzixs = np.array_split(nzixs, nsplit)
 34 | 
 35 |     frags = [' + '.join(f'{row[i]}*{bfn(i)}' for i in ix) for ix in snzixs]
 36 |     return ' + '.join(f'({f})' for f in frags)
 37 | 
 38 | 
 39 | def _partition(mat, into, by):
 40 |     if by == 'rows':
 41 |         return [list(range(i, len(mat), into)) for i in range(into)]
 42 |     elif by == 'cols':
 43 |         return [list(range(i, len(mat.T), into)) for i in range(into)]
 44 |     else:
 45 |         raise ValueError('Invalid partition by')
 46 | 
 47 | 
 48 | def _chunk(l, chunksz):
 49 |     l, n = iter(l), len(l)
 50 |     nchunks = -(-n // chunksz)
 51 | 
 52 |     return [list(it.islice(l, chunksz)) for i in range(nchunks)]
 53 | 
 54 | 
 55 | class MatMul:
 56 |     platform = None
 57 | 
 58 |     def __init__(self, A, beta=0.0, aligne=None, n=None, ldb=None, ldc=None):
 59 |         self.A = A
 60 |         self.beta = beta
 61 |         self.aligne = aligne
 62 | 
 63 |         if n is None and ldb is None and ldc is None:
 64 |             self.n = self.ldb = self.ldc = None
 65 |         elif n is not None and ldb is not None and ldc is not None:
 66 |             if aligne is not None and (ldb % aligne or ldc % aligne):
 67 |                 raise ValueError('ldb/ldc not compatible with aligne')
 68 | 
 69 |             self.n, self.ldb, self.ldc = n, ldb, ldc
 70 |         else:
 71 |             raise ValueError('Must provide all of (n, ldb, ldc) or none')
 72 | 
 73 |         # Check the matrix has a non-zero
 74 |         if not A.any():
 75 |             raise ValueError('A can not be empty')
 76 | 
 77 |         # Extract the shape of A
 78 |         self.m, self.k = m, k = A.shape
 79 | 
 80 |         # Determine the index of the first and last non-zero in each row of A
 81 |         self.afix = (A != 0).argmax(axis=1)
 82 |         self.alix = k - 1 - (A != 0)[:, ::-1].argmax(axis=1)
 83 | 
 84 |         # Mark rows of A which are all zero
 85 |         self.afix = np.where(np.any(A != 0, axis=1), self.afix, -1)
 86 |         self.alix = np.where(np.any(A != 0, axis=1), self.alix, -1)
 87 |         self.has_zero_rows = np.any(self.afix == -1)
 88 | 
 89 |         # Determine which entries of B partake in the multiplication
 90 |         self.bix = np.nonzero(np.any(A != 0, axis=0))[0]
 91 |         self.bix = {kx: k for k, kx in enumerate(self.bix)}
 92 | 
 93 |     def kernels(self, dtype, kname='gimmik_mm', **kwargs):
 94 |         basemeta = self.basemeta
 95 | 
 96 |         # Process the data type
 97 |         dtype = np.dtype(dtype).type
 98 |         if dtype == np.float32:
 99 |             dtype, dsize = 'float', 4
100 |         elif dtype == np.float64:
101 |             dtype, dsize = 'double', 8
102 |         else:
103 |             raise ValueError('Invalid floating point data type')
104 | 
105 |         # Common template arguments
106 |         baseargs = {
107 |             'dtype': dtype, 'kname': kname,
108 |             'A': self.A, 'beta': self.beta, 'width': 1,
109 |             'm': self.m, 'n': self.n, 'k': self.k,
110 |             'ldb': self.ldb, 'ldc': self.ldc,
111 |             'afix': self.afix, 'alix': self.alix, 'bix': self.bix,
112 |             'dot': _dot, 'partition': _partition, 'chunk': _chunk
113 |         }
114 | 
115 |         # Incrementally generate and render the kernels
116 |         gen = self._kernel_generators(dtype, dsize, **kwargs)
117 |         try:
118 |             resp = None
119 |             while True:
120 |                 # Generate the next kernel in the sequence
121 |                 name, exargs, exmeta = gen.send(resp)
122 | 
123 |                 # Merge in the base arguments and metadata
124 |                 args = baseargs | exargs
125 |                 meta = basemeta | exmeta
126 | 
127 |                 # Render the kernel template
128 |                 src = self._render_kernel(dtype, name, args)
129 | 
130 |                 # Post-process the metadata
131 |                 meta['tplname'] = name
132 |                 self._process_meta(meta)
133 | 
134 |                 # Yield the source and metadata and await a response
135 |                 resp = yield (src, meta)
136 |         except StopIteration:
137 |             pass
138 | 
139 |     def _process_meta(self, meta):
140 |         pass
141 | 
142 |     def _render_kernel(self, dtype, tplname, tplargs):
143 |         tpl = _PlatformTemplateLookup(self.platform).get_template(tplname)
144 |         src = tpl.render(**tplargs)
145 | 
146 |         # At single precision suffix all floating point constants by 'f'
147 |         if dtype == 'float':
148 |             src = re.sub(r'(?=\d*[.eE])(?=\.?\d)\d*\.?\d*(?:[eE][+-]?\d+)?',
149 |                          r'\g<0>f', src)
150 | 
151 |         # Cleanup
152 |         src = re.sub(r'^\w+\n$', '', src.strip())
153 |         src = re.sub(r'\n\n+', r'\n\n', src) + '\n'
154 |         src = re.sub(r'\w+$', '', src)
155 |         return src
156 | 


--------------------------------------------------------------------------------
/gimmik/c.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from gimmik.base import MatMul
 4 | 
 5 | 
 6 | class CMatMul(MatMul):
 7 |     platform = 'c'
 8 |     basemeta = {}
 9 | 
10 |     def _kernel_generators(self, dtype, dsize):
11 |         yield ('cstream', {}, {})
12 | 


--------------------------------------------------------------------------------
/gimmik/copenmp.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from gimmik.base import MatMul
 4 | 
 5 | 
 6 | class COpenMPMatMul(MatMul):
 7 |     platform = 'c-openmp'
 8 |     basemeta = {}
 9 | 
10 |     def _kernel_generators(self, dtype, dsize):
11 |         yield ('cstream', {}, {})
12 | 
13 | 


--------------------------------------------------------------------------------
/gimmik/cuda.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from gimmik.base import MatMul
 4 | 
 5 | 
 6 | class CUDAMatMul(MatMul):
 7 |     platform = 'cuda'
 8 |     basemeta = {'block': (128, 1, 1), 'width': 1, 'shared': 0,
 9 |                 'dynamic_shared': 0}
10 | 
11 |     def _kernel_generators(self, dtype, dsize, *, compute_capability=None):
12 |         # B loading, C streaming kernel
13 |         yield ('cstream', {}, {})
14 | 
15 |         # B streaming, C accumulation kernel
16 |         yield ('bstream', {}, {})
17 | 
18 |         # Four-way m-split B streaming, C accumulation kernel
19 |         ms, bsz, blkx = 4, 24, 32
20 |         args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx}
21 |         meta = {'block': (blkx, ms, 1), 'shared': 2*bsz*blkx*dsize}
22 |         yield ('bstream-msplit', args, meta)
23 | 
24 |         # Two-way k-split B loading, C streaming kernel
25 |         ks, csz, blkx = 2, 24, 32
26 |         args = {'ksplit': ks, 'csz': csz, 'blockx': blkx}
27 |         meta = {'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize}
28 |         yield ('cstream-ksplit', args, meta)
29 | 
30 |         # At single precision also consider vectorized kernels
31 |         if (dtype == 'float' and
32 |             self.aligne is not None and self.aligne % 2 == 0):
33 |             # Vector B loading, C streaming kernel
34 |             args = {'dtype': 'float2', 'width': 2}
35 |             meta = {'width': 2}
36 |             yield ('cstream', args, meta)
37 | 
38 |             # Vector four-way m-split B streaming, C accumulation kernel
39 |             ms, bsz, blkx = 4, 16, 32
40 |             args = {'dtype': 'float2', 'width': 2, 'msplit': ms,
41 |                     'bsz': bsz, 'blockx': blkx}
42 |             meta = {'block': (blkx, ms, 1), 'width': 2,
43 |                     'shared': 2*blkx*bsz*2*dsize}
44 |             yield ('bstream-msplit', args, meta)
45 | 
46 |             # Vector two-way k-split B loading, C streaming kernel
47 |             ks, csz, blkx = 2, 24, 32
48 |             args = {'dtype': 'float2', 'width': 2, 'ksplit': ks,
49 |                     'csz': csz, 'blockx': blkx}
50 |             meta = {'block': (blkx, ks, 1), 'width': 2,
51 |                     'shared': 2*(ks - 1)*csz*blkx*dsize}
52 |             yield ('cstream-ksplit', args, meta)
53 | 
54 |     def _process_meta(self, meta):
55 |         if self.n is not None:
56 |             div = meta['block'][0]*meta['width']
57 |             meta['grid'] = (-(-self.n // div), 1, 1)
58 | 


--------------------------------------------------------------------------------
/gimmik/hip.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from gimmik.base import MatMul
 4 | 
 5 | 
 6 | class HIPMatMul(MatMul):
 7 |     platform = 'hip'
 8 |     basemeta = {'block': (128, 1, 1), 'width': 1, 'shared': 0}
 9 | 
10 |     def _kernel_generators(self, dtype, dsize, *, gcn_arch=None, warp_size=64):
11 |         # B loading, C streaming kernel
12 |         yield ('cstream', {}, {})
13 | 
14 |         # B streaming, C accumulation kernel
15 |         yield ('bstream', {}, {})
16 | 
17 |         # Four-way m-split B streaming, C accumulation kernel
18 |         ms, bsz, blkx = 4, 24, 64
19 |         args = {'msplit': ms, 'bsz': bsz, 'blockx': blkx}
20 |         meta = {'block': (blkx, ms, 1), 'shared': 2*bsz*blkx*dsize}
21 |         yield ('bstream-msplit', args, meta)
22 | 
23 |         # Two-way k-split B loading, C streaming kernel
24 |         ks, csz, blkx = 2, 24, 64
25 |         args = {'ksplit': ks, 'csz': csz, 'blockx': blkx}
26 |         meta = {'block': (blkx, ks, 1), 'shared': (ks - 1)*csz*blkx*dsize}
27 |         yield ('cstream-ksplit', args, meta)
28 | 
29 |     def _process_meta(self, meta):
30 |         if self.n is not None:
31 |             div = meta['block'][0]*meta['width']
32 |             meta['grid'] = (-(-self.n // div), 1, 1)
33 | 


--------------------------------------------------------------------------------
/gimmik/ispc.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from gimmik.base import MatMul
 4 | 
 5 | 
 6 | class ISPCMatMul(MatMul):
 7 |     platform = 'ispc'
 8 |     basemeta = {}
 9 | 
10 |     def _kernel_generators(self, dtype, dsize):
11 |         yield ('cstream', {}, {})
12 | 


--------------------------------------------------------------------------------
/gimmik/kernels/c-openmp/cstream.mako:
--------------------------------------------------------------------------------
 1 | ## C/OpenMP kernel computing C = A*B + beta*C with A unrolled into the
 2 | ## source; iterations of the loop over the n columns are shared between
 3 | ## threads and each iteration writes every row of C for its column.
 4 | void
 5 | % if n is None:
 6 | ${kname}(int n,
 7 |          const ${dtype}* restrict b, int ldb,
 8 |          ${dtype}* restrict c, int ldc)
 9 | {
10 | % else:
11 | ${kname}(const ${dtype}* restrict b, ${dtype}* restrict c)
12 | {
13 |     const int n = ${n};
14 |     const ${'long long' if k*ldb >= 2**31 else 'int'} ldb = ${ldb};
15 |     const ${'long long' if m*ldc >= 2**31 else 'int'} ldc = ${ldc};
16 | % endif
17 | 
18 | ## No private clause is needed: the loop body declares no temporaries
19 |     #pragma omp parallel for simd
20 |     for (int i = 0; i < n; i++)
21 |     {
22 | % for j, jx in enumerate(A):
23 |   % if beta == 0:
24 |         c[i + ${j}*ldc] = ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)};
25 |   % elif beta == 1:
26 |         c[i + ${j}*ldc] += ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)};
27 |   % else:
28 |         c[i + ${j}*ldc] = ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)}
29 |                         + ${beta}*c[i + ${j}*ldc];
30 |   % endif
31 | % endfor
32 |     }
33 | }
34 | 


--------------------------------------------------------------------------------
/gimmik/kernels/c/cstream.mako:
--------------------------------------------------------------------------------
 1 | ## Serial C kernel computing C = A*B + beta*C with A unrolled into the
 2 | ## source; the loop over the n columns carries an OpenMP simd pragma.
 3 | void
 4 | % if n is None:
 5 | ## Sizes not fixed at generation time: take them as arguments
 6 | ${kname}(int n,
 7 |          const ${dtype}* restrict b, int ldb,
 8 |          ${dtype}* restrict c, int ldc)
 9 | {
10 | % else:
11 | ## Sizes fixed at generation time: emit them as constants, widening a
12 | ## leading dimension to long long once k*ldb or m*ldc reaches 2**31
13 | ${kname}(const ${dtype}* restrict b, ${dtype}* restrict c)
14 | {
15 |     const int n = ${n};
16 |     const ${'long long' if k*ldb >= 2**31 else 'int'} ldb = ${ldb};
17 |     const ${'long long' if m*ldc >= 2**31 else 'int'} ldc = ${ldc};
18 | % endif
19 | 
20 |     #pragma omp simd
21 |     for (int i = 0; i < n; i++)
22 |     {
23 | ## One statement per row of A: overwrite C when beta is 0, accumulate
24 | ## into it when beta is 1 and otherwise add beta times the old value
25 | % for j, jx in enumerate(A):
26 |   % if beta == 0:
27 |         c[i + ${j}*ldc] = ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)};
28 |   % elif beta == 1:
29 |         c[i + ${j}*ldc] += ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)};
30 |   % else:
31 |         c[i + ${j}*ldc] = ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)}
32 |                         + ${beta}*c[i + ${j}*ldc];
33 |   % endif
34 | % endfor
35 |     }
36 | }
37 | 


--------------------------------------------------------------------------------
/gimmik/kernels/cuda/base.mako:
--------------------------------------------------------------------------------
 1 | ## Shared preamble of the CUDA kernels: emits the arithmetic helpers for
 2 | ## the data type in use, followed by the body of the inheriting template.
 3 | ## Four-component vector types (names ending in '4')
 4 | % if dtype.endswith('4'):
 5 | inline __device__ ${dtype} operator+(${dtype} a, ${dtype} b)
 6 | { return make_${dtype}(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
 7 | 
 8 | inline __device__ ${dtype} operator*(${dtype[:-1]} a, ${dtype} b)
 9 | { return make_${dtype}(a*b.x, a*b.y, a*b.z, a*b.w); }
10 | 
11 | inline __device__ void operator+=(${dtype} &a, ${dtype} b)
12 | { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; }
13 | 
14 | inline __device__ ${dtype} make_zero()
15 | { return make_${dtype}(0, 0, 0, 0); }
16 | ## Two-component vector types (names ending in '2')
17 | % elif dtype.endswith('2'):
18 | inline __device__ ${dtype} operator+(${dtype} a, ${dtype} b)
19 | { return make_${dtype}(a.x + b.x, a.y + b.y); }
20 | 
21 | inline __device__ ${dtype} operator*(${dtype[:-1]} a, ${dtype} b)
22 | { return make_${dtype}(a*b.x, a*b.y); }
23 | 
24 | inline __device__ void operator+=(${dtype} &a, ${dtype} b)
25 | { a.x += b.x; a.y += b.y; }
26 | 
27 | inline __device__ ${dtype} make_zero()
28 | { return make_${dtype}(0, 0); }
29 | ## Any other type only needs make_zero
30 | % else:
31 | inline __device__ ${dtype} make_zero()
32 | { return 0; }
33 | % endif
34 | 
35 | ${next.body()}
36 | 


--------------------------------------------------------------------------------
/gimmik/kernels/cuda/bstream-msplit.mako:
--------------------------------------------------------------------------------
 1 | <%inherit file='base'/>
 2 | ## m-split B streaming kernel: the rows of C are dealt out between msplit
 3 | ## values of threadIdx.y, which stage chunks of B through two alternating
 4 | ## shared memory buffers (bsub).
 5 | 
 6 | <%
 7 | mx = partition(A, into=msplit, by='rows')
 8 | bchunks = chunk(bix, bsz)
 9 | %>
10 | 
11 | __global__ void
12 | % if n is None:
13 | ${kname}(int n,
14 |          const ${dtype}* __restrict__ b, int ldb,
15 |          ${dtype}* __restrict__ c, int ldc)
16 | {
17 |   % if width > 1:
18 |     ## i indexes width-wide vectors, so bound it by the vector count just
19 |     ## as the compile-time branch below does
20 |     n = (n + ${width} - 1) / ${width};
21 |     ldb /= ${width};
22 |     ldc /= ${width};
23 |   % endif
24 | % else:
25 | ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
26 | {
27 |     const int n = ${-(-n // width)};
28 |     const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
29 |     const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
30 | % endif
31 |     int i = blockDim.x*blockIdx.x + threadIdx.x;
32 | 
33 |     ${dtype} bv, csub[${-(-m // msplit)}];
34 |     __shared__ ${dtype} bsub[2][${bsz}][${blockx}];
35 | 
36 |     if (i >= n)
37 |       return;
38 | 
39 | ## Iterate over each row-chunk of C
40 | % for cid, mcx in enumerate(mx):
41 |     if (threadIdx.y == ${cid})
42 |     {
43 |   ## Iterate over each row-chunk of B
44 |   % for bb in range(len(bchunks)):
45 |     ## Fill the initial shared memory block
46 |     % if loop.first:
47 |       % for kx in bchunks[0]:
48 |         % if loop.index % msplit == cid:
49 |         bsub[0][${loop.index}][threadIdx.x] = __ldcg(b + i + ${kx}*ldb);
50 |         % endif
51 |       % endfor
52 |         __barrier_sync(0);
53 |     % endif
54 |     ## Start filling the next shared memory block
55 |     % if not loop.last:
56 |       % for kx in bchunks[bb + 1]:
57 |         % if loop.index % msplit == cid:
58 |         bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = __ldcg(b + i + ${kx}*ldb);
59 |         % endif
60 |       % endfor
61 |     % endif
62 |     ## Accumulate our dot products
63 |     % for kx in bchunks[bb]:
64 |         bv = bsub[${bb % 2}][${loop.index}][threadIdx.x];
65 |       % for j, jx in enumerate(A[mcx, kx]):
66 |         % if jx != 0 and kx == afix[mcx[j]]:
67 |         csub[${j}] = ${jx}*bv;
68 |         % elif jx != 0:
69 |         csub[${j}] += ${jx}*bv;
70 |         % endif
71 |         ## If we're done with this dot product then store to global
72 |         % if kx == alix[mcx[j]] and beta == 0:
73 |         __stcg(c + i + ${mcx[j]}*ldc, csub[${j}]);
74 |         % elif kx == alix[mcx[j]] and beta == 1:
75 |         c[i + ${mcx[j]}*ldc] += csub[${j}];
76 |         % elif kx == alix[mcx[j]]:
77 |         c[i + ${mcx[j]}*ldc] = csub[${j}] + ${beta}*c[i + ${mcx[j]}*ldc];
78 |         % endif
79 |       % endfor
80 |     % endfor
81 |         __barrier_sync(0);
82 |   % endfor
83 |   ## Handle rows of A which are all zero
84 |   % for j, jx in enumerate(afix):
85 |     % if jx == -1 and j % msplit == cid and beta == 0:
86 |         __stcg(c + i + ${j}*ldc, make_zero());
87 |     % elif jx == -1 and j % msplit == cid and beta != 1:
88 |         c[i + ${j}*ldc] *= ${beta};
89 |     % endif
90 |   % endfor
91 |     }
92 | % endfor
93 | }
94 | 


--------------------------------------------------------------------------------
/gimmik/kernels/cuda/bstream.mako:
--------------------------------------------------------------------------------
 1 | <%inherit file='base'/>
 2 | ## B streaming kernel: every thread reads each used row of B once and
 3 | ## accumulates the products into per-row partial sums held in csub.
 4 | 
 5 | __global__ void
 6 | % if n is None:
 7 | ${kname}(int n,
 8 |          const ${dtype}* __restrict__ b, int ldb,
 9 |          ${dtype}* __restrict__ c, int ldc)
10 | {
11 |   % if width > 1:
12 |     ## i indexes width-wide vectors, so bound it by the vector count just
13 |     ## as the compile-time branch below does
14 |     n = (n + ${width} - 1) / ${width};
15 |     ldb /= ${width};
16 |     ldc /= ${width};
17 |   % endif
18 | % else:
19 | ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
20 | {
21 |     const int n = ${-(-n // width)};
22 |     const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
23 |     const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
24 | % endif
25 |     const int i = blockDim.x*blockIdx.x + threadIdx.x;
26 | 
27 |     if (i < n)
28 |     {
29 |         ${dtype} bv, csub[${m}];
30 | 
31 | ## Iterate through the used rows of B
32 | % for kx in bix:
33 |         bv = __ldcg(b + i + ${kx}*ldb);
34 |   % for j, jx in enumerate(A[:, kx]):
35 |     % if jx != 0 and kx == afix[j]:
36 |         csub[${j}] = ${jx}*bv;
37 |     % elif jx != 0:
38 |         csub[${j}] += ${jx}*bv;
39 |     % endif
40 |     ## Store the row once its last non-zero has been accumulated
41 |     % if kx == alix[j] and beta == 0:
42 |         __stcg(c + i + ${j}*ldc, csub[${j}]);
43 |     % elif kx == alix[j] and beta == 1:
44 |         c[i + ${j}*ldc] += csub[${j}];
45 |     % elif kx == alix[j]:
46 |         c[i + ${j}*ldc] = csub[${j}] + ${beta}*c[i + ${j}*ldc];
47 |     % endif
48 |   % endfor
49 | % endfor
50 | 
51 | ## Handle rows of A which are all zero
52 | % for j, jx in enumerate(afix):
53 |   % if jx == -1 and beta == 0:
54 |         c[i + ${j}*ldc] = make_zero();
55 |   % elif jx == -1 and beta != 1:
56 |         c[i + ${j}*ldc] *= ${beta};
57 |   % endif
58 | % endfor
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/gimmik/kernels/cuda/cstream-ksplit.mako:
--------------------------------------------------------------------------------
 1 | <%inherit file='base'/>
 2 | ## k-split C streaming kernel: the columns of A are dealt out between
 3 | ## ksplit values of threadIdx.y; each evaluates partial dot products which
 4 | ## are exchanged through shared memory (csub) and summed before the store.
 5 | 
 6 | <%
 7 | kparts = partition(A, ksplit, by='cols')
 8 | cchunks = chunk(range(m), csz)
 9 | loaded = set()
10 | %>
11 | 
12 | __global__ void
13 | % if n is None:
14 | ${kname}(int n,
15 |          const ${dtype}* __restrict__ b, int ldb,
16 |          ${dtype}* __restrict__ c, int ldc)
17 | {
18 |   % if width > 1:
19 |     ## i indexes width-wide vectors, so bound it by the vector count just
20 |     ## as the compile-time branch below does
21 |     n = (n + ${width} - 1) / ${width};
22 |     ldb /= ${width};
23 |     ldc /= ${width};
24 |   % endif
25 | % else:
26 | ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
27 | {
28 |     const int n = ${-(-n // width)};
29 |     const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
30 |     const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
31 | % endif
32 |     const int i = blockDim.x*blockIdx.x + threadIdx.x;
33 | 
34 |     ${dtype} cv[${-(-csz // ksplit)}], bv[${-(-k // ksplit)}], dotp;
35 |     __shared__ ${dtype} csub[${ksplit - 1}][${csz}][${blockx}];
36 | 
37 |     if (i >= n)
38 |         return;
39 | 
40 | ## Iterate over the column-partitions of B
41 | % for bid, kbx in enumerate(kparts):
42 |     if (threadIdx.y == ${bid})
43 |     {
44 |   ## Iterate over the row-partitions of C
45 |   % for cchunk in cchunks:
46 |     ## Evaluate our partial dot products
47 |     % for j in cchunk:
48 |       ## Load in any missing parts of B
49 |       % for kx in kbx:
50 |         % if A[j, kx] != 0 and kx not in loaded:
51 |         bv[${loop.index}] = __ldcg(b + i + ${kx}*ldb); <% loaded.add(kx) %>
52 |         % endif
53 |       % endfor
54 |       % if (dotex := dot(lambda kx: f'bv[{kx}]', A[j, kbx])) != '0.0':
55 |         dotp = ${dotex};
56 |       % else:
57 |         dotp = make_zero();
58 |       % endif
59 |       ## Save to a register
60 |       % if loop.index % ksplit == bid:
61 |         cv[${loop.index // ksplit}] = dotp;
62 |       ## Save to shared memory
63 |       % else:
64 |         csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][threadIdx.x] = dotp;
65 |       % endif
66 |     % endfor
67 |         __barrier_sync(0);
68 |     ## Sum and output the final set of dot products
69 |     % for j in cchunk:
70 |       % if loop.index % ksplit == bid:
71 |         dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]'
72 |                                                           for i in range(ksplit - 1))};
73 |         % if beta == 0:
74 |         __stcg(c + i + ${j}*ldc, dotp);
75 |         % elif beta == 1:
76 |         c[i + ${j}*ldc] += dotp;
77 |         % else:
78 |         c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc];
79 |         % endif
80 |       % endif
81 |     % endfor
82 |         __barrier_sync(0);
83 |   % endfor
84 |     }
85 | % endfor
86 | }
87 | 


--------------------------------------------------------------------------------
/gimmik/kernels/cuda/cstream.mako:
--------------------------------------------------------------------------------
 1 | ## C-streaming CUDA kernel template. Thread i handles element i of every row
 2 | ## of b and c: for each row j of A it emits the expression returned by dot()
 3 | ## and stores it to row j of c, overwriting or combining according to beta.
 4 | <%inherit file='base'/>
 5 | 
 6 | <% ksplit = 2 if m < 36 else 1 %>
 7 | 
 8 | __global__ void
 9 | % if n is None:
10 | ${kname}(int n,
11 |          const ${dtype}* __restrict__ b, int ldb,
12 |          ${dtype}* __restrict__ c, int ldc)
13 | {
14 |   % if width > 1:
15 |     ## i counts whole dtype elements and ldb/ldc are rescaled by width, so the
16 |     ## bound must be ceil(n / width), exactly as in the fixed-n branch below
17 |     n = (n + ${width} - 1) / ${width};
18 |     ldb /= ${width};
19 |     ldc /= ${width};
20 |   % endif
21 | % else:
22 | ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
23 | {
24 |     const int n = ${-(-n // width)};
25 |     const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
26 |     const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
27 | % endif
28 |     const int i = blockDim.x*blockIdx.x + threadIdx.x;
29 |     ${dtype} dotp;
30 | 
31 |     if (i < n)
32 |     {
33 | % for j, jx in enumerate(A):
34 |   ## '0.0' is treated as the marker for an empty dot product (presumably an
35 |   ## all-zero row of A); make_zero() is not defined in this template
36 |   % if (dotex := dot(lambda kx: f'b[i + {kx}*ldb]', jx, maxsplit=ksplit)) != '0.0':
37 |         dotp = ${dotex};
38 |   % else:
39 |         dotp = make_zero();
40 |   % endif
41 |   % if beta == 0:
42 |         c[i + ${j}*ldc] = dotp;
43 |   % elif beta == 1 and dotex != '0.0':
44 |         c[i + ${j}*ldc] += dotp;
45 |   % else:
46 |         c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc];
47 |   % endif
48 | % endfor
49 |     }
50 | }
51 | 


--------------------------------------------------------------------------------
/gimmik/kernels/hip/base.mako:
--------------------------------------------------------------------------------
 1 | ## Shared prologue for the HIP kernel templates: defines a make_zero() device
 2 | ## helper whose form is chosen from the dtype name, then emits the body of the
 3 | ## inheriting template.
 4 | % if dtype.endswith('4'):
 5 | static inline __device__ ${dtype} make_zero()
 6 | { return make_${dtype}(0, 0, 0, 0); }
 7 | % elif dtype.endswith('2'):
 8 | static inline __device__ ${dtype} make_zero()
 9 | { return make_${dtype}(0, 0); }
10 | % else:
11 | static inline __device__ ${dtype} make_zero()
12 | { return 0; }
13 | % endif
14 | 
15 | ## Body of the template that inherits from this one
16 | ${next.body()}
17 | 


--------------------------------------------------------------------------------
/gimmik/kernels/hip/bstream-msplit.mako:
--------------------------------------------------------------------------------
 1 | ## B-streaming HIP kernel with the rows of A split over msplit values of
 2 | ## threadIdx.y. Rows of b are staged, bsz at a time, through the two-slot
 3 | ## shared array bsub while each thread accumulates its rows of c in csub.
 4 | <%inherit file='base'/>
 5 | 
 6 | <%
 7 | mx = partition(A, into=msplit, by='rows')
 8 | bchunks = chunk(bix, bsz)
 9 | %>
10 | 
11 | __global__ __launch_bounds__(${blockx*msplit}) void
12 | % if n is None:
13 | ${kname}(int n,
14 |          const ${dtype}* __restrict__ b, int ldb,
15 |          ${dtype}* __restrict__ c, int ldc)
16 | {
17 |   % if width > 1:
18 |     ## i counts whole dtype elements and ldb/ldc are rescaled by width, so the
19 |     ## bound must be ceil(n / width), exactly as in the fixed-n branch below
20 |     n = (n + ${width} - 1) / ${width};
21 |     ldb /= ${width};
22 |     ldc /= ${width};
23 |   % endif
24 | % else:
25 | ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
26 | {
27 |     const int n = ${-(-n // width)};
28 |     const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
29 |     const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
30 | % endif
31 |     int i = blockDim.x*blockIdx.x + threadIdx.x;
32 | 
33 |     ${dtype} bv, csub[${-(-m // msplit)}];
34 |     __shared__ ${dtype} bsub[2][${bsz}][${blockx}];
35 | 
36 | ## Fill the initial shared memory block
37 | % for cid in range(msplit):
38 |     if (i < n && threadIdx.y == ${cid})
39 |     {
40 |   % for kx in bchunks[0]:
41 |     % if loop.index % msplit == cid:
42 |         bsub[0][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb];
43 |     % endif
44 |   % endfor
45 |     }
46 | % endfor
47 |     __syncthreads();
48 | 
49 | ## Iterate over each row-chunk of B
50 | % for bb in range(len(bchunks)):
51 |   ## Iterate over each row-chunk of C
52 |   % for cid, mcx in enumerate(mx):
53 |     if (i < n && threadIdx.y == ${cid})
54 |     {
55 |     ## Start filling the next shared memory block
56 |     % if not loop.parent.last:
57 |       % for kx in bchunks[bb + 1]:
58 |         % if loop.index % msplit == cid:
59 |         bsub[${(bb + 1) % 2}][${loop.index}][threadIdx.x] = b[i + ${kx}*ldb];
60 |         % endif
61 |       % endfor
62 |     % endif
63 |     ## Accumulate our dot products
64 |     % for kx in bchunks[bb]:
65 |         bv = bsub[${bb % 2}][${loop.index}][threadIdx.x];
66 |       % for j, jx in enumerate(A[mcx, kx]):
67 |         % if jx != 0 and kx == afix[mcx[j]]:
68 |         csub[${j}] = ${jx}*bv;
69 |         % elif jx != 0:
70 |         csub[${j}] += ${jx}*bv;
71 |         % endif
72 |         ## If we're done with this dot product then store to global
73 |         % if kx == alix[mcx[j]] and beta == 0:
74 |         c[i + ${mcx[j]}*ldc] = csub[${j}];
75 |         % elif kx == alix[mcx[j]] and beta == 1:
76 |         c[i + ${mcx[j]}*ldc] += csub[${j}];
77 |         % elif kx == alix[mcx[j]]:
78 |         c[i + ${mcx[j]}*ldc] = csub[${j}] + ${beta}*c[i + ${mcx[j]}*ldc];
79 |         % endif
80 |       % endfor
81 |     % endfor
82 |     ## Handle rows of A which are all zero
83 |     % if loop.parent.last:
84 |       % for j, jx in enumerate(afix):
85 |         % if jx == -1 and j % msplit == cid and beta == 0:
86 |         c[i + ${j}*ldc] = make_zero();
87 |         % elif jx == -1 and j % msplit == cid and beta != 1:
88 |         c[i + ${j}*ldc] *= ${beta};
89 |         % endif
90 |       % endfor
91 |     % endif
92 |     }
93 |   % endfor
94 |     __syncthreads();
95 | % endfor
96 | }
97 | 


--------------------------------------------------------------------------------
/gimmik/kernels/hip/bstream.mako:
--------------------------------------------------------------------------------
 1 | ## B-streaming HIP kernel: thread i walks the used rows of b (bix), keeps one
 2 | ## accumulator per row of A in csub and stores an accumulator to c when the
 3 | ## row of b matching alix[j] has been processed.
 4 | <%inherit file='base'/>
 5 | 
 6 | __global__ __launch_bounds__(128) void
 7 | % if n is None:
 8 | ${kname}(int n,
 9 |          const ${dtype}* __restrict__ b, int ldb,
10 |          ${dtype}* __restrict__ c, int ldc)
11 | {
12 |   % if width > 1:
13 |     ## i counts whole dtype elements and ldb/ldc are rescaled by width, so the
14 |     ## bound must be ceil(n / width), exactly as in the fixed-n branch below
15 |     n = (n + ${width} - 1) / ${width};
16 |     ldb /= ${width};
17 |     ldc /= ${width};
18 |   % endif
19 | % else:
20 | ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
21 | {
22 |     const int n = ${-(-n // width)};
23 |     const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
24 |     const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
25 | % endif
26 |     const int i = blockDim.x*blockIdx.x + threadIdx.x;
27 | 
28 |     if (i < n)
29 |     {
30 |         ${dtype} bv, csub[${m}];
31 | 
32 | ## Iterate through the used rows of B
33 | % for kx in bix:
34 |         bv = b[i + ${kx}*ldb];
35 |   % for j, jx in enumerate(A[:, kx]):
36 |     % if jx != 0 and kx == afix[j]:
37 |         csub[${j}] = ${jx}*bv;
38 |     % elif jx != 0:
39 |         csub[${j}] += ${jx}*bv;
40 |     % endif
41 |     ## If we're done with this dot product then store to global
42 |     % if kx == alix[j] and beta == 0:
43 |         c[i + ${j}*ldc] = csub[${j}];
44 |     % elif kx == alix[j] and beta == 1:
45 |         c[i + ${j}*ldc] += csub[${j}];
46 |     % elif kx == alix[j]:
47 |         c[i + ${j}*ldc] = csub[${j}] + ${beta}*c[i + ${j}*ldc];
48 |     % endif
49 |   % endfor
50 | % endfor
51 | 
52 | ## Handle rows of A which are all zero
53 | % for j, jx in enumerate(afix):
54 |   % if jx == -1 and beta == 0:
55 |         c[i + ${j}*ldc] = make_zero();
56 |   % elif jx == -1 and beta != 1:
57 |         c[i + ${j}*ldc] *= ${beta};
58 |   % endif
59 | % endfor
60 |     }
61 | }
62 | 


--------------------------------------------------------------------------------
/gimmik/kernels/hip/cstream-ksplit.mako:
--------------------------------------------------------------------------------
 1 | ## C-streaming HIP kernel with the columns of A split over ksplit values of
 2 | ## threadIdx.y. Each y-thread forms partial dot products from its share of b;
 3 | ## partials for a chunk of csz rows go through shared csub and are then summed.
 4 | <%inherit file='base'/>
 5 | 
 6 | <%
 7 | kparts = partition(A, ksplit, by='cols')
 8 | cchunks = chunk(range(m), csz)
 9 | loaded = set()
10 | %>
11 | 
12 | __global__ __launch_bounds__(${blockx*ksplit}) void
13 | % if n is None:
14 | ${kname}(int n,
15 |          const ${dtype}* __restrict__ b, int ldb,
16 |          ${dtype}* __restrict__ c, int ldc)
17 | {
18 |   % if width > 1:
19 |     ## i counts whole dtype elements and ldb/ldc are rescaled by width, so the
20 |     ## bound must be ceil(n / width), exactly as in the fixed-n branch below
21 |     n = (n + ${width} - 1) / ${width};
22 |     ldb /= ${width};
23 |     ldc /= ${width};
24 |   % endif
25 | % else:
26 | ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
27 | {
28 |     const int n = ${-(-n // width)};
29 |     const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
30 |     const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
31 | % endif
32 |     int i = blockDim.x*blockIdx.x + threadIdx.x;
33 | 
34 |     ${dtype} cv[${-(-csz // ksplit)}], bv[${-(-k // ksplit)}], dotp;
35 |     __shared__ ${dtype} csub[${ksplit - 1}][${csz}][${blockx}];
36 | 
37 | ## Iterate over the row-partitions of C
38 | % for cchunk in cchunks:
39 |   ## Iterate over the row-partitions of B
40 |   % for bid, kbx in enumerate(kparts):
41 |     if (i < n && threadIdx.y == ${bid})
42 |     {
43 |     ## Evaluate our partial dot products
44 |     % for j in cchunk:
45 |       ## Load in any missing parts of B
46 |       % for kx in kbx:
47 |         % if A[j, kx] != 0 and kx not in loaded:
48 |         bv[${loop.index}] = b[i + ${kx}*ldb]; <% loaded.add(kx) %>
49 |         % endif
50 |       % endfor
51 |       % if (dotex := dot(lambda kx: f'bv[{kx}]', A[j, kbx])) != '0.0':
52 |         dotp = ${dotex};
53 |       % else:
54 |         dotp = make_zero();
55 |       % endif
56 |       ## Save to a register
57 |       % if loop.index % ksplit == bid:
58 |         cv[${loop.index // ksplit}] = dotp;
59 |       ## Save to shared memory
60 |       % else:
61 |         csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][threadIdx.x] = dotp;
62 |       % endif
63 |     % endfor
64 |     }
65 |   % endfor
66 |     __syncthreads();
67 |   ## Iterate over the column-partitions of A (the row-partitions of B)
68 |   % for bid, kbx in enumerate(kparts):
69 |     if (i < n && threadIdx.y == ${bid})
70 |     {
71 |     ## Sum and output the final set of dot products
72 |     % for j in cchunk:
73 |       % if loop.index % ksplit == bid:
74 |         dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][threadIdx.x]'
75 |                                                           for i in range(ksplit - 1))};
76 |         % if beta == 0:
77 |         c[i + ${j}*ldc] = dotp;
78 |         % elif beta == 1:
79 |         c[i + ${j}*ldc] += dotp;
80 |         % else:
81 |         c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc];
82 |         % endif
83 |       % endif
84 |     % endfor
85 |     }
86 |   % endfor
87 |     __syncthreads();
88 | % endfor
89 | }
90 | 


--------------------------------------------------------------------------------
/gimmik/kernels/hip/cstream.mako:
--------------------------------------------------------------------------------
 1 | ## C-streaming HIP kernel template. Thread i handles element i of every row of
 2 | ## b and c: for each row j of A it emits the expression returned by dot() and
 3 | ## stores it to row j of c, overwriting or combining according to beta.
 4 | <%inherit file='base'/>
 5 | 
 6 | <% ksplit = 2 if m < 36 else 1 %>
 7 | 
 8 | __global__ __launch_bounds__(128) void
 9 | % if n is None:
10 | ${kname}(int n,
11 |          const ${dtype}* __restrict__ b, int ldb,
12 |          ${dtype}* __restrict__ c, int ldc)
13 | {
14 |   % if width > 1:
15 |     ## i counts whole dtype elements and ldb/ldc are rescaled by width, so the
16 |     ## bound must be ceil(n / width), exactly as in the fixed-n branch below
17 |     n = (n + ${width} - 1) / ${width};
18 |     ldb /= ${width};
19 |     ldc /= ${width};
20 |   % endif
21 | % else:
22 | ${kname}(const ${dtype}* __restrict__ b, ${dtype}* __restrict__ c)
23 | {
24 |     const int n = ${-(-n // width)};
25 |     const ${'long long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
26 |     const ${'long long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
27 | % endif
28 |     const int i = blockDim.x*blockIdx.x + threadIdx.x;
29 |     ${dtype} dotp;
30 | 
31 |     if (i < n)
32 |     {
33 | % for j, jx in enumerate(A):
34 |   ## '0.0' is treated as the marker for an empty dot product (presumably an
35 |   ## all-zero row of A); make_zero() is not defined in this template
36 |   % if (dotex := dot(lambda kx: f'b[i + {kx}*ldb]', jx, maxsplit=ksplit)) != '0.0':
37 |         dotp = ${dotex};
38 |   % else:
39 |         dotp = make_zero();
40 |   % endif
41 |   % if beta == 0:
42 |         c[i + ${j}*ldc] = dotp;
43 |   % elif beta == 1 and dotex != '0.0':
44 |         c[i + ${j}*ldc] += dotp;
45 |   % else:
46 |         c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc];
47 |   % endif
48 | % endfor
49 |     }
50 | }
51 | 


--------------------------------------------------------------------------------
/gimmik/kernels/ispc/cstream.mako:
--------------------------------------------------------------------------------
 1 | ## C-streaming ISPC kernel template: a foreach over i in [0, n) that, for each
 2 | ## row j of A, writes the expression returned by dot() to c[i + j*ldc],
 3 | ## overwriting (beta == 0), accumulating (beta == 1) or blending with beta.
 4 | export void
 5 | % if n is None:
 6 | ${kname}(uniform int n,
 7 |          const uniform ${dtype} b[], uniform int ldb,
 8 |          ${dtype} uniform c[], uniform int ldc)
 9 | {
10 | % else:
11 | ${kname}(const uniform ${dtype} b[], ${dtype} uniform c[])
12 | {
13 |     const uniform int n = ${n};
14 |     ## NOTE(review): confirm the ISPC compiler accepts 'long long' here; its
15 |     ## documented 64-bit integer type is int64
16 |     const uniform ${'long long' if k*ldb >= 2**31 else 'int'} ldb = ${ldb};
17 |     const uniform ${'long long' if m*ldc >= 2**31 else 'int'} ldc = ${ldc};
18 | % endif
19 | 
20 |     foreach (i = 0 ... n)
21 |     {
22 | ## One store per row of A; the same dot() expression is used in each branch
23 | % for j, jx in enumerate(A):
24 |   % if beta == 0:
25 |         c[i + ${j}*ldc] = ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)};
26 |   % elif beta == 1:
27 |         c[i + ${j}*ldc] += ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)};
28 |   % else:
29 |         c[i + ${j}*ldc] = ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)}
30 |                         + ${beta}*c[i + ${j}*ldc];
31 |   % endif
32 | % endfor
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------
/gimmik/kernels/metal/base.mako:
--------------------------------------------------------------------------------
 1 | ## Shared prologue for the Metal kernel templates: pulls in metal_stdlib,
 2 | ## defines a make_zero() helper whose form is chosen from the dtype name, then
 3 | ## emits the body of the inheriting template.
 4 | #include <metal_stdlib>
 5 | 
 6 | using namespace metal;
 7 | 
 8 | % if dtype.endswith('4'):
 9 | static inline ${dtype} make_zero()
10 | { return ${dtype}(0, 0, 0, 0); }
11 | % elif dtype.endswith('2'):
12 | static inline ${dtype} make_zero()
13 | { return ${dtype}(0, 0); }
14 | % else:
15 | static inline ${dtype} make_zero()
16 | { return 0; }
17 | % endif
18 | 
19 | ## Body of the template that inherits from this one
20 | ${next.body()}
21 | 


--------------------------------------------------------------------------------
/gimmik/kernels/metal/bstream-msplit.mako:
--------------------------------------------------------------------------------
 1 | ## B-streaming Metal kernel with the rows of A split over msplit values of
 2 | ## tpitg.y. Rows of b are staged, bsz at a time, through the two-slot
 3 | ## threadgroup array bsub while each thread accumulates its rows of c in csub.
 4 | <%inherit file='base'/>
 5 | 
 6 | <%
 7 | mx = partition(A, into=msplit, by='rows')
 8 | bchunks = chunk(bix, bsz)
 9 | %>
10 | 
11 | kernel void
12 | % if n is None:
13 | ${kname}(constant int& n_,
14 |          device const ${dtype}* b, constant int& ldb_,
15 |          device ${dtype}* c, constant int& ldc_,
16 |          uint2 tpig [[thread_position_in_grid]],
17 |          uint2 tpitg [[thread_position_in_threadgroup]])
18 | {
19 |     ## i counts whole dtype elements and ldb/ldc are rescaled by width, so the
20 |     ## bound must be ceil(n_ / width), exactly as in the fixed-n branch below
21 |     const int n = (n_ + ${width} - 1) / ${width};
22 |     const int ldb = ldb_ / ${width};
23 |     const int ldc = ldc_ / ${width};
24 | % else:
25 | ${kname}(device const ${dtype}* b, device ${dtype}* c,
26 |          uint2 tpig [[thread_position_in_grid]],
27 |          uint2 tpitg [[thread_position_in_threadgroup]])
28 | {
29 |     const int n = ${-(-n // width)};
30 |     const ${'long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
31 |     const ${'long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
32 | % endif
33 |     const int i = tpig.x;
34 | 
35 |     ${dtype} bv, csub[${-(-m // msplit)}];
36 |     threadgroup ${dtype} bsub[2][${bsz}][${blockx}];
37 | 
38 | ## Fill the initial shared memory block
39 | % for cid in range(msplit):
40 |     if (i < n && tpitg.y == ${cid})
41 |     {
42 |   % for kx in bchunks[0]:
43 |     % if loop.index % msplit == cid:
44 |         bsub[0][${loop.index}][tpitg.x] = b[i + ${kx}*ldb];
45 |     % endif
46 |   % endfor
47 |     }
48 | % endfor
49 |     threadgroup_barrier(mem_flags::mem_threadgroup);
50 | 
51 | ## Iterate over each row-chunk of B
52 | % for bb in range(len(bchunks)):
53 |   ## Iterate over each row-chunk of C
54 |   % for cid, mcx in enumerate(mx):
55 |     if (i < n && tpitg.y == ${cid})
56 |     {
57 |     ## Start filling the next shared memory block
58 |     % if not loop.parent.last:
59 |       % for kx in bchunks[bb + 1]:
60 |         % if loop.index % msplit == cid:
61 |         bsub[${(bb + 1) % 2}][${loop.index}][tpitg.x] = b[i + ${kx}*ldb];
62 |         % endif
63 |       % endfor
64 |     % endif
65 |     ## Accumulate our dot products
66 |     % for kx in bchunks[bb]:
67 |         bv = bsub[${bb % 2}][${loop.index}][tpitg.x];
68 |       % for j, jx in enumerate(A[mcx, kx]):
69 |         % if jx != 0 and kx == afix[mcx[j]]:
70 |         csub[${j}] = ${jx}*bv;
71 |         % elif jx != 0:
72 |         csub[${j}] += ${jx}*bv;
73 |         % endif
74 |         ## If we're done with this dot product then store to global
75 |         % if kx == alix[mcx[j]] and beta == 0:
76 |         c[i + ${mcx[j]}*ldc] = csub[${j}];
77 |         % elif kx == alix[mcx[j]] and beta == 1:
78 |         c[i + ${mcx[j]}*ldc] += csub[${j}];
79 |         % elif kx == alix[mcx[j]]:
80 |         c[i + ${mcx[j]}*ldc] = csub[${j}] + ${beta}*c[i + ${mcx[j]}*ldc];
81 |         % endif
82 |       % endfor
83 |     % endfor
84 |     ## Handle rows of A which are all zero
85 |     % if loop.parent.last:
86 |       % for j, jx in enumerate(afix):
87 |         % if jx == -1 and j % msplit == cid and beta == 0:
88 |         c[i + ${j}*ldc] = make_zero();
89 |         % elif jx == -1 and j % msplit == cid and beta != 1:
90 |         c[i + ${j}*ldc] *= ${beta};
91 |         % endif
92 |       % endfor
93 |     % endif
94 |     }
95 |   % endfor
96 |     threadgroup_barrier(mem_flags::mem_threadgroup);
97 | % endfor
98 | }
99 | 


--------------------------------------------------------------------------------
/gimmik/kernels/metal/bstream.mako:
--------------------------------------------------------------------------------
 1 | ## B-streaming Metal kernel: thread i walks the used rows of b (bix), keeps one
 2 | ## accumulator per row of A in csub and stores an accumulator to c when the
 3 | ## row of b matching alix[j] has been processed.
 4 | <%inherit file='base'/>
 5 | 
 6 | kernel void
 7 | % if n is None:
 8 | ${kname}(constant int& n_,
 9 |          device const ${dtype}* b, constant int& ldb_,
10 |          device ${dtype}* c, constant int& ldc_,
11 |          uint i [[thread_position_in_grid]])
12 | {
13 |     ## i counts whole dtype elements and ldb/ldc are rescaled by width, so the
14 |     ## bound must be ceil(n_ / width), exactly as in the fixed-n branch below
15 |     const int n = (n_ + ${width} - 1) / ${width};
16 |     const int ldb = ldb_ / ${width};
17 |     const int ldc = ldc_ / ${width};
18 | % else:
19 | ${kname}(device const ${dtype}* b, device ${dtype}* c,
20 |          uint i [[thread_position_in_grid]])
21 | {
22 |     const int n = ${-(-n // width)};
23 |     const ${'long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
24 |     const ${'long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
25 | % endif
26 | 
27 |     if (i < n)
28 |     {
29 |         ${dtype} bv, csub[${m}];
30 | 
31 | ## Iterate through the used rows of B
32 | % for kx in bix:
33 |         bv = b[i + ${kx}*ldb];
34 |   % for j, jx in enumerate(A[:, kx]):
35 |     % if jx != 0 and kx == afix[j]:
36 |         csub[${j}] = ${jx}*bv;
37 |     % elif jx != 0:
38 |         csub[${j}] += ${jx}*bv;
39 |     % endif
40 |     ## If we're done with this dot product then store to global
41 |     % if kx == alix[j] and beta == 0:
42 |         c[i + ${j}*ldc] = csub[${j}];
43 |     % elif kx == alix[j] and beta == 1:
44 |         c[i + ${j}*ldc] += csub[${j}];
45 |     % elif kx == alix[j]:
46 |         c[i + ${j}*ldc] = csub[${j}] + ${beta}*c[i + ${j}*ldc];
47 |     % endif
48 |   % endfor
49 | % endfor
50 | 
51 | ## Handle rows of A which are all zero
52 | % for j, jx in enumerate(afix):
53 |   % if jx == -1 and beta == 0:
54 |         c[i + ${j}*ldc] = make_zero();
55 |   % elif jx == -1 and beta != 1:
56 |         c[i + ${j}*ldc] *= ${beta};
57 |   % endif
58 | % endfor
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/gimmik/kernels/metal/cstream-ksplit.mako:
--------------------------------------------------------------------------------
 1 | ## C-streaming Metal kernel with the columns of A split over ksplit values of
 2 | ## tpitg.y. Each y-thread forms partial dot products from its share of b;
 3 | ## partials for a chunk of csz rows go through threadgroup csub and are summed.
 4 | <%inherit file='base'/>
 5 | 
 6 | <%
 7 | kparts = partition(A, ksplit, by='cols')
 8 | cchunks = chunk(range(m), csz)
 9 | loaded = set()
10 | %>
11 | 
12 | kernel void
13 | % if n is None:
14 | ${kname}(constant int& n_,
15 |          device const ${dtype}* b, constant int& ldb_,
16 |          device ${dtype}* c, constant int& ldc_,
17 |          uint2 tpig [[thread_position_in_grid]],
18 |          uint2 tpitg [[thread_position_in_threadgroup]])
19 | {
20 |     ## i counts whole dtype elements and ldb/ldc are rescaled by width, so the
21 |     ## bound must be ceil(n_ / width), exactly as in the fixed-n branch below
22 |     const int n = (n_ + ${width} - 1) / ${width};
23 |     const int ldb = ldb_ / ${width};
24 |     const int ldc = ldc_ / ${width};
25 | % else:
26 | ${kname}(device const ${dtype}* b, device ${dtype}* c,
27 |          uint2 tpig [[thread_position_in_grid]],
28 |          uint2 tpitg [[thread_position_in_threadgroup]])
29 | {
30 |     const int n = ${-(-n // width)};
31 |     const ${'long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
32 |     const ${'long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
33 | % endif
34 |     const int i = tpig.x;
35 | 
36 |     ${dtype} cv[${-(-csz // ksplit)}], bv[${-(-k // ksplit)}], dotp;
37 |     threadgroup ${dtype} csub[${ksplit - 1}][${csz}][${blockx}];
38 | 
39 | ## Iterate over the row-partitions of C
40 | % for cchunk in cchunks:
41 |   ## Iterate over the row-partitions of B
42 |   % for bid, kbx in enumerate(kparts):
43 |     if (i < n && tpitg.y == ${bid})
44 |     {
45 |     ## Evaluate our partial dot products
46 |     % for j in cchunk:
47 |       ## Load in any missing parts of B
48 |       % for kx in kbx:
49 |         % if A[j, kx] != 0 and kx not in loaded:
50 |         bv[${loop.index}] = b[i + ${kx}*ldb]; <% loaded.add(kx) %>
51 |         % endif
52 |       % endfor
53 |       % if (dotex := dot(lambda kx: f'bv[{kx}]', A[j, kbx])) != '0.0':
54 |         dotp = ${dotex};
55 |       % else:
56 |         dotp = make_zero();
57 |       % endif
58 |       ## Save to a register
59 |       % if loop.index % ksplit == bid:
60 |         cv[${loop.index // ksplit}] = dotp;
61 |       ## Save to shared memory
62 |       % else:
63 |         csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][tpitg.x] = dotp;
64 |       % endif
65 |     % endfor
66 |     }
67 |   % endfor
68 |     threadgroup_barrier(mem_flags::mem_threadgroup);
69 |   ## Iterate over the column-partitions of A (the row-partitions of B)
70 |   % for bid, kbx in enumerate(kparts):
71 |     if (i < n && tpitg.y == ${bid})
72 |     {
73 |     ## Sum and output the final set of dot products
74 |     % for j in cchunk:
75 |       % if loop.index % ksplit == bid:
76 |         dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][tpitg.x]'
77 |                                                           for i in range(ksplit - 1))};
78 |         % if beta == 0:
79 |         c[i + ${j}*ldc] = dotp;
80 |         % elif beta == 1:
81 |         c[i + ${j}*ldc] += dotp;
82 |         % else:
83 |         c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc];
84 |         % endif
85 |       % endif
86 |     % endfor
87 |     }
88 |   % endfor
89 |     threadgroup_barrier(mem_flags::mem_threadgroup);
90 | % endfor
91 | }
92 | 


--------------------------------------------------------------------------------
/gimmik/kernels/metal/cstream.mako:
--------------------------------------------------------------------------------
 1 | ## C-streaming Metal kernel template. Thread i handles element i of every row
 2 | ## of b and c: for each row j of A it emits the expression returned by dot()
 3 | ## and stores it to row j of c, overwriting or combining according to beta.
 4 | <%inherit file='base'/>
 5 | 
 6 | <% ksplit = 2 if m < 36 else 1 %>
 7 | 
 8 | kernel void
 9 | % if n is None:
10 | ${kname}(constant int& n_,
11 |          device const ${dtype}* b, constant int& ldb_,
12 |          device ${dtype}* c, constant int& ldc_,
13 |          uint i [[thread_position_in_grid]])
14 | {
15 |     ## i counts whole dtype elements and ldb/ldc are rescaled by width, so the
16 |     ## bound must be ceil(n_ / width), exactly as in the fixed-n branch below
17 |     const int n = (n_ + ${width} - 1) / ${width};
18 |     const int ldb = ldb_ / ${width};
19 |     const int ldc = ldc_ / ${width};
20 | % else:
21 | ${kname}(device const ${dtype}* b, device ${dtype}* c,
22 |          uint i [[thread_position_in_grid]])
23 | {
24 |     const int n = ${-(-n // width)};
25 |     const ${'long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
26 |     const ${'long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
27 | % endif
28 |     ${dtype} dotp;
29 | 
30 |     if (i < n)
31 |     {
32 | % for j, jx in enumerate(A):
33 |   ## '0.0' is treated as the marker for an empty dot product (presumably an
34 |   ## all-zero row of A); make_zero() is not defined in this template
35 |   % if (dotex := dot(lambda kx: f'b[i + {kx}*ldb]', jx, maxsplit=ksplit)) != '0.0':
36 |         dotp = ${dotex};
37 |   % else:
38 |         dotp = make_zero();
39 |   % endif
40 |   % if beta == 0:
41 |         c[i + ${j}*ldc] = dotp;
42 |   % elif beta == 1 and dotex != '0.0':
43 |         c[i + ${j}*ldc] += dotp;
44 |   % else:
45 |         c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc];
46 |   % endif
47 | % endfor
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/gimmik/kernels/opencl/bstream-msplit.mako:
--------------------------------------------------------------------------------
 1 | ## B-streaming OpenCL kernel with the rows of A split over msplit values of
 2 | ## get_local_id(1). Rows of b are staged, bsz at a time, through the two-slot
 3 | ## __local array bsub while each work-item accumulates its rows of c in csub.
 4 | <%
 5 | mx = partition(A, into=msplit, by='rows')
 6 | bchunks = chunk(bix, bsz)
 7 | %>
 8 | 
 9 | __kernel __attribute__((reqd_work_group_size(${blockx}, ${msplit}, 1))) void
10 | % if n is None:
11 | ${kname}(int n,
12 |          __global const ${dtype}* restrict b, int ldb,
13 |          __global ${dtype}* restrict c, int ldc)
14 | {
15 |   % if width > 1:
16 |     ## i counts whole dtype elements and ldb/ldc are rescaled by width, so the
17 |     ## bound must be ceil(n / width), exactly as in the fixed-n branch below
18 |     n = (n + ${width} - 1) / ${width};
19 |     ldb /= ${width};
20 |     ldc /= ${width};
21 |   % endif
22 | % else:
23 | ${kname}(__global const ${dtype}* restrict b, __global ${dtype}* restrict c)
24 | {
25 |     const int n = ${-(-n // width)};
26 |     const ${'long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
27 |     const ${'long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
28 | % endif
29 |     int i = get_global_id(0);
30 |     int lx = get_local_id(0), ly = get_local_id(1);
31 | 
32 |     ${dtype} bv, csub[${-(-m // msplit)}];
33 |     __local ${dtype} bsub[2][${bsz}][${blockx}];
34 | 
35 | ## Fill the initial shared memory block
36 | % for cid in range(msplit):
37 |     if (i < n && ly == ${cid})
38 |     {
39 |   % for kx in bchunks[0]:
40 |     % if loop.index % msplit == cid:
41 |         bsub[0][${loop.index}][lx] = b[i + ${kx}*ldb];
42 |     % endif
43 |   % endfor
44 |     }
45 | % endfor
46 |     work_group_barrier(CLK_LOCAL_MEM_FENCE);
47 | 
48 | ## Iterate over each row-chunk of B
49 | % for bb in range(len(bchunks)):
50 |   ## Iterate over each row-chunk of C
51 |   % for cid, mcx in enumerate(mx):
52 |     if (i < n && ly == ${cid})
53 |     {
54 |     ## Start filling the next shared memory block
55 |     % if not loop.parent.last:
56 |       % for kx in bchunks[bb + 1]:
57 |         % if loop.index % msplit == cid:
58 |         bsub[${(bb + 1) % 2}][${loop.index}][lx] = b[i + ${kx}*ldb];
59 |         % endif
60 |       % endfor
61 |     % endif
62 |     ## Accumulate our dot products
63 |     % for kx in bchunks[bb]:
64 |         bv = bsub[${bb % 2}][${loop.index}][lx];
65 |       % for j, jx in enumerate(A[mcx, kx]):
66 |         % if jx != 0 and kx == afix[mcx[j]]:
67 |         csub[${j}] = ${jx}*bv;
68 |         % elif jx != 0:
69 |         csub[${j}] += ${jx}*bv;
70 |         % endif
71 |         ## If we're done with this dot product then store to global
72 |         % if kx == alix[mcx[j]] and beta == 0:
73 |         c[i + ${mcx[j]}*ldc] = csub[${j}];
74 |         % elif kx == alix[mcx[j]] and beta == 1:
75 |         c[i + ${mcx[j]}*ldc] += csub[${j}];
76 |         % elif kx == alix[mcx[j]]:
77 |         c[i + ${mcx[j]}*ldc] = csub[${j}] + ${beta}*c[i + ${mcx[j]}*ldc];
78 |         % endif
79 |       % endfor
80 |     % endfor
81 |     ## Handle rows of A which are all zero
82 |     % if loop.parent.last:
83 |       % for j, jx in enumerate(afix):
84 |         % if jx == -1 and j % msplit == cid and beta == 0:
85 |         c[i + ${j}*ldc] = 0;
86 |         % elif jx == -1 and j % msplit == cid and beta != 1:
87 |         c[i + ${j}*ldc] *= ${beta};
88 |         % endif
89 |       % endfor
90 |     % endif
91 |     }
92 |   % endfor
93 |     work_group_barrier(CLK_LOCAL_MEM_FENCE);
94 | % endfor
95 | }
96 | 


--------------------------------------------------------------------------------
/gimmik/kernels/opencl/bstream.mako:
--------------------------------------------------------------------------------
 1 | ## B-streaming OpenCL kernel: work-item i walks the used rows of b (bix), keeps
 2 | ## one accumulator per row of A in csub and stores an accumulator to c when
 3 | ## the row of b matching alix[j] has been processed.
 4 | __kernel void
 5 | % if n is None:
 6 | ${kname}(int n,
 7 |          __global const ${dtype}* restrict b, int ldb,
 8 |          __global ${dtype}* restrict c, int ldc)
 9 | {
10 |   ## Same width handling as the other OpenCL kernels; emits nothing when
11 |   ## width == 1. i counts whole dtype elements, hence ceil(n / width).
12 |   % if width > 1:
13 |     n = (n + ${width} - 1) / ${width};
14 |     ldb /= ${width};
15 |     ldc /= ${width};
16 |   % endif
17 | % else:
18 | ${kname}(__global const ${dtype}* restrict b, __global ${dtype}* restrict c)
19 | {
20 |     ## ldb and ldc are divided by width below, so n is scaled the same way
21 |     const int n = ${-(-n // width)};
22 |     const ${'long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
23 |     const ${'long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
24 | % endif
25 |     int i = get_global_id(0);
26 | 
27 |     if (i < n)
28 |     {
29 |         ${dtype} bv, csub[${m}];
30 | 
31 | ## Iterate through the used rows of B
32 | % for kx in bix:
33 |         bv = b[i + ${kx}*ldb];
34 |   % for j, jx in enumerate(A[:, kx]):
35 |     % if jx != 0 and kx == afix[j]:
36 |         csub[${j}] = ${jx}*bv;
37 |     % elif jx != 0:
38 |         csub[${j}] += ${jx}*bv;
39 |     % endif
40 |     ## If we're done with this dot product then store to global
41 |     % if kx == alix[j] and beta == 0:
42 |         c[i + ${j}*ldc] = csub[${j}];
43 |     % elif kx == alix[j] and beta == 1:
44 |         c[i + ${j}*ldc] += csub[${j}];
45 |     % elif kx == alix[j]:
46 |         c[i + ${j}*ldc] = csub[${j}] + ${beta}*c[i + ${j}*ldc];
47 |     % endif
48 |   % endfor
49 | % endfor
50 | 
51 | ## Handle rows of A which are all zero
52 | % for j, jx in enumerate(afix):
53 |   % if jx == -1 and beta == 0:
54 |         c[i + ${j}*ldc] = 0;
55 |   % elif jx == -1 and beta != 1:
56 |         c[i + ${j}*ldc] *= ${beta};
57 |   % endif
58 | % endfor
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/gimmik/kernels/opencl/cstream-ksplit.mako:
--------------------------------------------------------------------------------
 1 | ## C-streaming OpenCL kernel with the columns of A split over ksplit values of
 2 | ## get_local_id(1). Each y work-item forms partial dot products from its share
 3 | ## of b; partials for a chunk of csz rows go through __local csub and are summed.
 4 | <%
 5 | kparts = partition(A, ksplit, by='cols')
 6 | cchunks = chunk(range(m), csz)
 7 | loaded = set()
 8 | %>
 9 | 
10 | __kernel __attribute__((reqd_work_group_size(${blockx}, ${ksplit}, 1))) void
11 | % if n is None:
12 | ${kname}(int n,
13 |          __global const ${dtype}* restrict b, int ldb,
14 |          __global ${dtype}* restrict c, int ldc)
15 | {
16 |   % if width > 1:
17 |     ## i counts whole dtype elements and ldb/ldc are rescaled by width, so the
18 |     ## bound must be ceil(n / width), exactly as in the fixed-n branch below
19 |     n = (n + ${width} - 1) / ${width};
20 |     ldb /= ${width};
21 |     ldc /= ${width};
22 |   % endif
23 | % else:
24 | ${kname}(__global const ${dtype}* restrict b, __global ${dtype}* restrict c)
25 | {
26 |     const int n = ${-(-n // width)};
27 |     const ${'long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
28 |     const ${'long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
29 | % endif
30 |     int i = get_global_id(0);
31 |     int lx = get_local_id(0), ly = get_local_id(1);
32 | 
33 |     ${dtype} cv[${-(-csz // ksplit)}], bv[${-(-k // ksplit)}], dotp;
34 |     __local ${dtype} csub[${ksplit - 1}][${csz}][${blockx}];
35 | 
36 | ## Iterate over the row-partitions of C
37 | % for cchunk in cchunks:
38 |   ## Iterate over the row-partitions of B
39 |   % for bid, kbx in enumerate(kparts):
40 |     if (i < n && ly == ${bid})
41 |     {
42 |     ## Evaluate our partial dot products
43 |     % for j in cchunk:
44 |       ## Load in any missing parts of B
45 |       % for kx in kbx:
46 |         % if A[j, kx] != 0 and kx not in loaded:
47 |         bv[${loop.index}] = b[i + ${kx}*ldb]; <% loaded.add(kx) %>
48 |         % endif
49 |       % endfor
50 |       % if (dotex := dot(lambda kx: f'bv[{kx}]', A[j, kbx])) != '0.0':
51 |         dotp = ${dotex};
52 |       % else:
53 |         dotp = 0;
54 |       % endif
55 |       ## Save to a register
56 |       % if loop.index % ksplit == bid:
57 |         cv[${loop.index // ksplit}] = dotp;
58 |       ## Save to shared memory
59 |       % else:
60 |         csub[${bid - (bid > loop.index % ksplit)}][${loop.index}][lx] = dotp;
61 |       % endif
62 |     % endfor
63 |     }
64 |   % endfor
65 |     work_group_barrier(CLK_LOCAL_MEM_FENCE);
66 |   ## Iterate over the column-partitions of A (the row-partitions of B)
67 |   % for bid, kbx in enumerate(kparts):
68 |     if (i < n && ly == ${bid})
69 |     {
70 |     ## Sum and output the final set of dot products
71 |     % for j in cchunk:
72 |       % if loop.index % ksplit == bid:
73 |         dotp = cv[${loop.index // ksplit}] + ${' + '.join(f'csub[{i}][{loop.index}][lx]'
74 |                                                           for i in range(ksplit - 1))};
75 |         % if beta == 0:
76 |         c[i + ${j}*ldc] = dotp;
77 |         % elif beta == 1:
78 |         c[i + ${j}*ldc] += dotp;
79 |         % else:
80 |         c[i + ${j}*ldc] = dotp + ${beta}*c[i + ${j}*ldc];
81 |         % endif
82 |       % endif
83 |     % endfor
84 |     }
85 |   % endfor
86 |     work_group_barrier(CLK_LOCAL_MEM_FENCE);
87 | % endfor
88 | }
89 | 


--------------------------------------------------------------------------------
/gimmik/kernels/opencl/cstream.mako:
--------------------------------------------------------------------------------
## OpenCL C-streaming kernel template: each work-item handles the index
## i = get_global_id(0), and every row of A is rendered as one unrolled
## update of the corresponding entry of C.
__kernel void
% if n is None:
## Runtime-sized variant: n and both leading dimensions are kernel arguments
${kname}(int n,
         __global const ${dtype}* restrict b, int ldb,
         __global ${dtype}* restrict c, int ldc)
{
  % if width > 1:
    ## The leading dimensions are divided by width, presumably because dtype
    ## is a width-wide vector type in this case
    ## NOTE(review): n is rounded up to a multiple of width here, whereas the
    ## fixed-n branch below uses ceil(n / width); confirm this is intended
    n = ((n + ${width} - 1) / ${width}) * ${width};
    ldb /= ${width};
    ldc /= ${width};
  % endif
% else:
## Fixed-size variant: n, ldb and ldc are baked in as constants at render time
${kname}(__global const ${dtype}* restrict b, __global ${dtype}* restrict c)
{
    ## n is ceil(n / width); ldb and ldc are declared long once the
    ## corresponding extent reaches width*2**31
    const int n = ${-(-n // width)};
    const ${'long' if k*ldb >= width*2**31 else 'int'} ldb = ${ldb // width};
    const ${'long' if m*ldc >= width*2**31 else 'int'} ldc = ${ldc // width};
% endif
    int i = get_global_id(0);

    if (i < n)
    {
## One statement per row j of A; dot() (defined outside this template) is
## handed a callback that renders the load of row kx of B for this i.
## beta == 0 overwrites C, beta == 1 accumulates into it, and any other
## value scales the existing entry by beta before adding.
% for j, jx in enumerate(A):
  % if beta == 0:
        c[i + ${j}*ldc] = ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)};
  % elif beta == 1:
        c[i + ${j}*ldc] += ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)};
  % else:
        c[i + ${j}*ldc] = ${dot(lambda kx: f'b[i + {kx}*ldb]', jx)}
                        + ${beta}*c[i + ${j}*ldc];
  % endif
% endfor
    }
}
35 | 


--------------------------------------------------------------------------------
/gimmik/metal.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from gimmik.base import MatMul
 4 | 
 5 | 
 6 | class MetalMatMul(MatMul):
 7 |     platform = 'metal'
 8 |     basemeta = {'threadgroup': (128, 1, 1), 'threadgroup_mem_size': 0,
 9 |                 'width': 1}
10 | 
11 |     def _kernel_generators(self, dtype, dsize):
12 |         # B loading, C streaming kernel
13 |         yield ('cstream', {}, {})
14 | 
15 |         # B streaming, C accumulation kernel
16 |         yield ('bstream', {}, {})
17 | 
18 |         # Four-way m-split B streaming, C accumulation kernel
19 |         ms, bsz, blkx = 4, 16, 32
20 |         args = {'msplit': ms, 'blockx': blkx, 'bsz': bsz}
21 |         meta = {'threadgroup': (blkx, ms, 1),
22 |                 'threadgroup_mem_size': 2*blkx*bsz*dsize}
23 |         yield ('bstream-msplit', args, meta)
24 | 
25 |         # Four-way m-split B streaming, C accumulation kernel
26 |         ms, bsz, blkx = 4, 20, 32
27 |         args = {'msplit': ms, 'blockx': blkx, 'bsz': bsz}
28 |         meta = {'threadgroup': (blkx, ms, 1),
29 |                 'threadgroup_mem_size': 2*blkx*bsz*dsize}
30 |         yield ('bstream-msplit', args, meta)
31 | 
32 |         # Two-way k-split B loading, C streaming kernel
33 |         ks, csz, blkx = 2, 20, 32
34 |         args = {'ksplit': ks, 'csz': csz, 'blockx': blkx}
35 |         meta = {'threadgroup': (blkx, ks, 1),
36 |                 'threadgroup_mem_size': (ks - 1)*csz*blkx*dsize}
37 |         yield ('cstream-ksplit', args, meta)
38 | 
39 |         if self.aligne is not None and self.aligne % 2 == 0:
40 |             # Vector B loading, C streaming kernel
41 |             args = {'dtype': 'float2', 'width': 2}
42 |             meta = {'width': 2}
43 |             yield ('cstream', args, meta)
44 | 
45 |             # Vector B streaming, C accumulation kernel
46 |             yield ('bstream', args, meta)
47 | 
48 |             # Vector four-way m-split B streaming, C accumulation kernel
49 |             ms, bsz, blkx = 4, 16, 32
50 |             args = {'dtype': 'float2', 'width': 2, 'msplit': ms,
51 |                     'blockx': blkx, 'bsz': bsz}
52 |             meta = {'threadgroup': (blkx, ms, 1),
53 |                     'threadgroup_mem_size': 2*blkx*bsz*dsize, 'width': 2}
54 |             yield ('bstream-msplit', args, meta)
55 | 
56 |     def _process_meta(self, meta):
57 |         if self.n is not None:
58 |             tg = meta['threadgroup']
59 |             meta['grid'] = (-(-self.n // meta['width']), tg[1], 1)
60 | 


--------------------------------------------------------------------------------
/gimmik/opencl.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from gimmik.base import MatMul
 4 | 
 5 | 
 6 | class OpenCLMatMul(MatMul):
 7 |     platform = 'opencl'
 8 |     basemeta = {'local_work_size': None, 'local_mem_size': 0, 'width': 1}
 9 | 
10 |     def _kernel_generators(self, dtype, dsize, *, local_mem_size=None):
11 |         max_local_mem = local_mem_size or 1024**3
12 | 
13 |         # B loading, C streaming kernel
14 |         yield ('cstream', {}, {})
15 | 
16 |         # B streaming, C accumulation kernel
17 |         yield ('bstream', {}, {})
18 | 
19 |         # Four-way m-split B streaming, C accumulation kernel
20 |         ms, bsz, blkx = 4, 16, 64
21 |         args = {'msplit': ms, 'blockx': blkx, 'bsz': bsz}
22 |         meta = {'local_work_size': (blkx, ms),
23 |                 'local_mem_size': 2*blkx*bsz*dsize}
24 |         if meta['local_mem_size'] < max_local_mem:
25 |             yield ('bstream-msplit', args, meta)
26 | 
27 |         # Two-way k-split B loading, C streaming kernel
28 |         ks, csz, blkx = 2, 32, 64
29 |         args = {'ksplit': ks, 'csz': csz, 'blockx': blkx}
30 |         meta = {'local_work_size': (blkx, ks),
31 |                 'local_mem_size': (ks - 1)*csz*blkx*dsize}
32 |         if meta['local_mem_size'] < max_local_mem:
33 |             yield ('cstream-ksplit', args, meta)
34 | 
35 |         # At single precision also consider vectorized kernels
36 |         if (dtype == 'float' and
37 |             self.aligne is not None and self.aligne % 2 == 0):
38 |             # Vector B loading, C streaming kernel
39 |             args = {'dtype': 'float2', 'width': 2}
40 |             meta = {'width': 2}
41 |             yield ('cstream', args, meta)
42 | 
43 |             # Vector four-way m-split B streaming, C accumulation kernel
44 |             ms, bsz, blkx = 4, 16, 64
45 |             args = {'dtype': 'float2', 'width': 2, 'msplit': ms,
46 |                     'blockx': blkx, 'bsz': bsz}
47 |             meta = {'local_work_size': (blkx, ms),
48 |                     'local_mem_size': 2*blkx*bsz*dsize, 'width': 2}
49 |             if meta['local_mem_size'] < max_local_mem:
50 |                 yield ('bstream-msplit', args, meta)
51 | 
52 |     def _process_meta(self, meta):
53 |         if self.n is not None:
54 |             lws, width = meta['local_work_size'], meta['width']
55 |             if lws is not None:
56 |                 meta['global_work_size'] = (-(-self.n // width), lws[1])
57 |             else:
58 |                 meta['global_work_size'] = (-(-self.n // width),)
59 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import re
 5 | from setuptools import setup
 6 | import sys
 7 | 
 8 | 
 9 | # Python version
10 | if sys.version_info[:2] < (3, 9):
11 |     print('GiMMiK requires Python 3.9 or newer')
12 |     sys.exit(-1)
13 | 
14 | # GiMMiK version
15 | vfile = open('gimmik/_version.py').read()
16 | vsrch = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", vfile, re.M)
17 | 
18 | if vsrch:
19 |     version = vsrch.group(1)
20 | else:
21 |     print('Unable to find a version string in gimmik/_version.py')
22 | 
23 | # Data
24 | package_data = {
25 |     'gimmik': ['kernels/*/*.mako'],
26 | }
27 | 
28 | # Hard dependencies
29 | install_requires = [
30 |     'mako',
31 |     'numpy >= 1.7'
32 | ]
33 | 
34 | # Info
35 | classifiers = [
36 |     'License :: OSI Approved :: BSD License',
37 |     'Programming Language :: Python :: 3.9',
38 |     'Programming Language :: Python :: 3.10',
39 |     'Programming Language :: Python :: 3.11',
40 |     'Topic :: Scientific/Engineering'
41 | ]
42 | 
43 | # Long Description
44 | long_description = '''GiMMiK is a Python based kernel generator for
45 | matrix multiplication kernels for various accelerator platforms.  For
46 | small operator matrices the generated kernels are capable of
47 | outperfoming the state-of-the-art general matrix multiplication
48 | routines such as cuBLAS GEMM or clBLAS GEMM.  GiMMiK was originally
49 | developed as part of Bartosz Wozniak's master's thesis in the
50 | Department of Computing at Imperial College London and is currently
51 | maintained by Freddie Witherden.'''
52 | 
53 | # Keywords
54 | keywords = ['Matrix Multiplication', 'ISPC', 'GPU', 'CUDA', 'HIP', 'Metal',
55 |             'OpenCL']
56 | 
57 | setup(name='gimmik',
58 |       version=version,
59 | 
60 |       # Packages
61 |       packages=['gimmik'],
62 |       package_data=package_data,
63 |       install_requires=install_requires,
64 | 
65 |       # Metadata
66 |       description='Generator of Matrix Multiplication Kernels',
67 |       long_description=long_description,
68 |       maintainer='Freddie Witherden',
69 |       maintainer_email='freddie@witherden.org',
70 |       url='https://github.com/vincentlab/GiMMiK',
71 |       license='BSD',
72 |       keywords=keywords,
73 |       classifiers=classifiers)
74 | 


--------------------------------------------------------------------------------