├── python ├── dsc │ ├── py.typed │ ├── nn │ │ ├── functional.py │ │ ├── utils.py │ │ └── __init__.py │ ├── device.py │ ├── __init__.py │ ├── gpu │ │ └── __init__.py │ ├── context.py │ ├── dtype.py │ └── profiler.py └── tests │ ├── utils_cpu.py │ ├── test_ops_cpu.py │ ├── test_indexing.py │ ├── test_ops_common.py │ └── test_ops_gpu.py ├── docs └── logo.png ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ └── bug_report.md └── workflows │ └── tests.yml ├── .gitignore ├── requirements.txt ├── pyproject.toml ├── LICENSE ├── setup.py ├── dsc ├── include │ ├── cpu │ │ ├── dsc_blas.h │ │ ├── dsc_iter.h │ │ ├── dsc_ops.h │ │ ├── dsc_cpu.h │ │ └── dsc_tracing.h │ ├── dsc_dtype.h │ ├── gpu │ │ ├── platform │ │ │ ├── dsc_cuda_platform.h │ │ │ └── dsc_hip_platform.h │ │ ├── dsc_tracing.h │ │ ├── dsc_gpu.h │ │ └── dsc_ops.h │ ├── dsc_device.h │ └── dsc.h └── src │ ├── cpu │ └── dsc_device_cpu.cpp │ └── gpu │ └── dsc_device_gpu.cpp ├── .clang-format ├── .clang-tidy ├── IDEAS.md ├── Makefile ├── README.md └── examples └── models ├── gpt2.py └── qwen2_5.py /python/dsc/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nirw4nna/dsc/HEAD/docs/logo.png -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: nirw4nna 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .vscode/ 3 | 4 | venv/ 5 | 6 | *.egg-info* 7 | __pycache__/ 8 | .pytest*/ 9 | .ruff*/ 10 | 11 | # Traces 12 | *.json 13 | 14 | *.o 15 | *.so 16 | *.dll 17 | *.a -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Required by the dsc python library 2 | numpy 3 | psutil 4 | tqdm 5 | 6 | 7 | # Only used for testing and benchmarking 8 | matplotlib 9 | PyQt5 10 | pytest 11 | tabulate 12 | ruff 13 | pyright 14 | torch 15 | transformers # Tokenizers for examples/models -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report issue 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | Include code snippet: 17 | ```python 18 | 19 | ``` 20 | 21 | **Expected behavior** 22 | A clear and concise description of what you expected to happen. 23 | 24 | **Desktop (please complete the following information):** 25 | - OS: [e.g. Ubuntu-Linux] 26 | - Version [e.g. 0.1] 27 | 28 | **Additional context** 29 | Add any other context about the problem here. 
30 | -------------------------------------------------------------------------------- /python/dsc/nn/functional.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | 7 | 8 | from ..tensor import Tensor, tanh, power, max, sum, exp 9 | from ..profiler import trace 10 | import math 11 | 12 | 13 | @trace('gelu') 14 | def gelu(x: Tensor) -> Tensor: 15 | return 0.5 * x * (1.0 + tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * power(x, 3)))) 16 | 17 | 18 | @trace('softmax') 19 | def softmax(x: Tensor, axis: int = -1) -> Tensor: 20 | e = exp((x - max(x, axis=axis, keepdims=True))) 21 | sum_e = sum(e, axis=axis, keepdims=True) 22 | return e / sum_e 23 | 24 | 25 | @trace('silu') 26 | def silu(x: Tensor) -> Tensor: 27 | return x * (1 / (1 + exp(-x))) -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pyright] 2 | include = ["python/dsc"] 3 | exclude = [ 4 | "**/node_modules", 5 | "**/__pycache__", 6 | "benchmarks", 7 | "python/tests" 8 | ] 9 | venvPath = "." 10 | venv = "venv" 11 | 12 | reportMissingImports = "error" 13 | 14 | pythonVersion = "3.10" 15 | 16 | [tool.ruff] 17 | # Exclude a variety of commonly ignored directories. 18 | exclude = [ 19 | ".eggs", 20 | ".git", 21 | ".pyenv", 22 | ".pytest_cache", 23 | ".ruff_cache", 24 | ".vscode", 25 | ".idea", 26 | "__pypackages__", 27 | "node_modules", 28 | "site-packages", 29 | "venv", 30 | "benchmarks/*" 31 | ] 32 | 33 | # Same as Black. 34 | line-length = 88 35 | indent-width = 4 36 | 37 | target-version = "py310" 38 | 39 | [tool.ruff.lint] 40 | ignore = ["F401"] 41 | 42 | # Allow fix for all enabled rules (when `--fix`) is provided. 43 | fixable = ["ALL"] 44 | 45 | [tool.ruff.format] 46 | quote-style = "single" 47 | 48 | indent-style = "space" 49 | 50 | # Like Black, respect magic trailing commas. 51 | skip-magic-trailing-comma = false 52 | 53 | # Like Black, automatically detect the appropriate line ending. 54 | line-ending = "auto" -------------------------------------------------------------------------------- /python/dsc/device.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 
6 | 7 | from enum import Enum 8 | from typing import Union 9 | 10 | 11 | DeviceType = Union['Device', str] 12 | 13 | class Device(Enum): 14 | DEFAULT = -1 15 | CPU = 0 16 | GPU = 1 17 | 18 | def __repr__(self) -> str: 19 | return DEVICE_LOOKUP[self] 20 | 21 | def __str__(self) -> str: 22 | return repr(self) 23 | 24 | 25 | def _get_device(dev: DeviceType) -> Device: 26 | if isinstance(dev, Device): 27 | return dev 28 | else: 29 | if dev in DEVICE_REVERSE_LOOKUP: 30 | return DEVICE_REVERSE_LOOKUP[dev] 31 | else: 32 | raise RuntimeError(f'string "{dev}" is not a valid Device') 33 | 34 | 35 | DEVICE_VALUE_LOOKUP = {val.value: val for val in Device.__members__.values()} 36 | 37 | DEVICE_LOOKUP = { 38 | Device.DEFAULT: 'default', 39 | Device.CPU: 'cpu', 40 | Device.GPU: 'gpu', 41 | } 42 | 43 | DEVICE_REVERSE_LOOKUP = {val: key for key, val in DEVICE_LOOKUP.items()} -------------------------------------------------------------------------------- /python/dsc/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | 7 | from dsc.context import init, print_mem_usage, set_default_device 8 | from dsc.tensor import ( 9 | Tensor, 10 | from_numpy, 11 | frombuffer, 12 | reshape, 13 | concat, 14 | split, 15 | transpose, 16 | tril, 17 | arange, 18 | repeat, 19 | randn, 20 | cos, 21 | sin, 22 | tanh, 23 | exp, 24 | sqrt, 25 | rsqrt, 26 | add, 27 | sub, 28 | mul, 29 | true_div, 30 | sum, 31 | mean, 32 | var, 33 | matmul, 34 | outer, 35 | max, 36 | min, 37 | power, 38 | equal, 39 | not_equal, 40 | less, 41 | less_equal, 42 | greater, 43 | greater_equal, 44 | tensor, 45 | ones, 46 | ones_like, 47 | zeros, 48 | zeros_like, 49 | full, 50 | full_like, 51 | empty, 52 | empty_like, 53 | kth, 54 | multinomial, 55 | where, 56 | ) 57 | from dsc.dtype import Dtype, bool_, i32, bf16, f32, f64 58 | from dsc.profiler import trace 59 | from dsc.device import Device 60 | import dsc.gpu as gpu 61 | import dsc.nn as nn -------------------------------------------------------------------------------- /python/dsc/gpu/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 
6 | 7 | from .._bindings import ( 8 | _dsc_get_gpu_platform, 9 | _dsc_gpu_available, 10 | _dsc_gpu_devices, 11 | _dsc_gpu_set_device, 12 | _dsc_gpu_dev_capability, 13 | _dsc_gpu_dev_mem, 14 | _dsc_gpu_sync, 15 | _dsc_gpu_has_bf16, 16 | _DSC_PLATFORM_CUDA, 17 | _DSC_PLATFORM_ROCM, 18 | ) 19 | 20 | 21 | def is_available() -> bool: 22 | return _dsc_gpu_available(None) 23 | 24 | def is_cuda() -> bool: 25 | return _dsc_get_gpu_platform(None) == _DSC_PLATFORM_CUDA 26 | 27 | def is_rocm() -> bool: 28 | return _dsc_get_gpu_platform(None) == _DSC_PLATFORM_ROCM 29 | 30 | def has_bf16() -> bool: 31 | return _dsc_gpu_has_bf16(None) 32 | 33 | def device_count() -> int: 34 | return _dsc_gpu_devices(None) 35 | 36 | def set_device(device: int): 37 | _dsc_gpu_set_device(None, device) 38 | 39 | def get_device_capability(device: int) -> int: 40 | return _dsc_gpu_dev_capability(None, device) 41 | 42 | def get_device_mem(device: int) -> int: 43 | return _dsc_gpu_dev_mem(None, device) 44 | 45 | def synchronize(): 46 | _dsc_gpu_sync(None) -------------------------------------------------------------------------------- /python/tests/utils_cpu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | 7 | 8 | import dsc 9 | import numpy as np 10 | from typing import List 11 | import os 12 | 13 | 14 | DEVICE = os.getenv('DEVICE', 'cpu') 15 | 16 | def all_close(actual: dsc.Tensor, target: np.ndarray, eps=1e-5): 17 | actual_np = actual.numpy() 18 | diffs = ~np.isclose(actual_np, target, atol=eps, rtol=eps, equal_nan=True) 19 | close = len(actual_np[diffs]) == 0 20 | return close 21 | 22 | 23 | def random_nd(shape: List[int], dtype: np.dtype = np.float64): 24 | if dtype == np.bool: 25 | return np.random.randint(0, 2, size=tuple(shape)).astype(dtype) 26 | elif dtype == np.int32: 27 | # Return a positive integer tensor if the dtype is int32 so that we don't have issues 28 | # with power 29 | return np.random.randint(0, 10, size=tuple(shape)).astype(dtype) 30 | else: 31 | return np.random.randn(*tuple(shape)).astype(dtype) 32 | 33 | 34 | DTYPES = [np.bool, np.int32, np.float32, np.float64] 35 | DSC_DTYPES = { 36 | np.bool: dsc.bool_, 37 | np.int32: dsc.i32, 38 | np.float32: dsc.f32, 39 | np.float64: dsc.f64, 40 | } 41 | 42 | def is_float(dtype) -> bool: 43 | return dtype == np.float32 or dtype == np.float64 44 | 45 | def is_bool(dtype) -> bool: 46 | return dtype == np.bool 47 | 48 | def is_integer(dtype) -> bool: 49 | return dtype == np.int32 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2024-2025, Christian Gilli 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. 
Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /python/dsc/context.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | 7 | from ._bindings import _dsc_ctx_init, _dsc_print_mem_usage, _dsc_set_default_device 8 | from .device import _get_device, DeviceType 9 | import psutil 10 | 11 | _ctx_instance = None 12 | 13 | 14 | class _DscContext: 15 | def __init__(self, main_mem: int): 16 | self._ctx = _dsc_ctx_init(main_mem) 17 | 18 | # TODO: (3) 19 | # def __del__(self): 20 | # _dsc_ctx_free(self._ctx) 21 | 22 | 23 | def _get_ctx(): 24 | global _ctx_instance 25 | if _ctx_instance is None: 26 | # Workaround: instead of throwing an error if the context is not initialized 27 | # we can simply initialize one with a fixed amount of memory that is a small % 28 | # of the total available memory. 29 | total_mem = psutil.virtual_memory().total 30 | mem = int(total_mem * 0.1) 31 | print( 32 | f'DSC has not been explicitly initialized. Using {round(mem / (1024. * 1024.))}MB.' 33 | f' If you require more memory please call dsc.init() once before executing your code.' 34 | ) 35 | _ctx_instance = _DscContext(mem) 36 | return _ctx_instance._ctx 37 | 38 | 39 | def init(mem_size: int): 40 | global _ctx_instance 41 | if _ctx_instance is None: 42 | _ctx_instance = _DscContext(mem_size) 43 | else: 44 | raise RuntimeWarning('Context already initialized') 45 | 46 | def print_mem_usage(): 47 | _dsc_print_mem_usage(_get_ctx()) 48 | 49 | def set_default_device(device: DeviceType): 50 | _dsc_set_default_device(_get_ctx(), _get_device(device)) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 
6 | 7 | from setuptools import setup, find_packages 8 | from setuptools.command.install import install 9 | import subprocess 10 | import os 11 | from pathlib import Path 12 | 13 | 14 | def _compile_cpp(): 15 | subprocess.check_call( 16 | ['make', 'shared', 'DSC_FAST=1'], cwd=os.path.dirname(os.path.abspath(__file__)) 17 | ) 18 | 19 | 20 | class BuildCmd(install): 21 | def run(self): 22 | _compile_cpp() 23 | install.run(self) 24 | 25 | 26 | if __name__ == '__main__': 27 | with open(Path(__file__).parent / 'README.md', 'r', encoding='utf-8') as f: 28 | long_description = f.read() 29 | 30 | packages = find_packages('python') 31 | package_dir = {'': 'python'} 32 | package_data = {'dsc': ['*.so']} 33 | setup( 34 | name='dsc', 35 | version='0.1', 36 | author='Christian Gilli', 37 | author_email='christian.gilli11@gmail.com', 38 | license='BSD-3-Clause', 39 | description='DSPCraft tensor processing library.', 40 | long_description=long_description, 41 | long_description_content_type='text/markdown', 42 | url='https://github.com/dspcraft/dsc', 43 | packages=packages, 44 | package_dir=package_dir, 45 | install_requires=[ 46 | 'numpy', 47 | 'psutil', 48 | ], 49 | extras_require={'dev': ['matplotlib', 'pytest', 'tabulate', 'pyright', 'ruff']}, 50 | cmdclass={ 51 | 'install': BuildCmd, 52 | }, 53 | include_package_data=True, 54 | package_data=package_data, 55 | python_requires='>=3.10', 56 | ) 57 | -------------------------------------------------------------------------------- /dsc/include/cpu/dsc_blas.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 
6 | 7 | #pragma once 8 | 9 | #include "dsc.h" 10 | 11 | enum dsc_blas_trans : bool { 12 | NO_TRANS, 13 | TRANS 14 | }; 15 | 16 | struct dsc_blas_ctx; 17 | 18 | // ============================================================ 19 | // Setup / Teardown 20 | // 21 | 22 | extern dsc_blas_ctx *dsc_blas_init(); 23 | 24 | extern void dsc_blas_destroy(dsc_blas_ctx *ctx); 25 | 26 | // ============================================================ 27 | // GEMM-related functions 28 | // 29 | 30 | extern void dsc_dgemm(dsc_blas_ctx *ctx, dsc_blas_trans trans_b, 31 | int m, int n, int k, 32 | const f64 *DSC_RESTRICT a, int stride_a, 33 | const f64 *DSC_RESTRICT b, int stride_b, 34 | f64 *DSC_RESTRICT c, int stride_c); 35 | 36 | extern void dsc_sgemm(dsc_blas_ctx *ctx, dsc_blas_trans trans_b, 37 | int m, int n, int k, 38 | const f32 *DSC_RESTRICT a, int stride_a, 39 | const f32 *DSC_RESTRICT b, int stride_b, 40 | f32 *DSC_RESTRICT c, int stride_c); 41 | 42 | // ============================================================ 43 | // GEVM-related functions 44 | // 45 | 46 | extern void dsc_dgevm_trans(dsc_blas_ctx *ctx, 47 | int n, int k, 48 | const f64 *DSC_RESTRICT a, 49 | const f64 *DSC_RESTRICT b, int stride_b, 50 | f64 *DSC_RESTRICT c); 51 | 52 | extern void dsc_sgevm_trans(dsc_blas_ctx *ctx, 53 | int n, int k, 54 | const f32 *DSC_RESTRICT a, 55 | const f32 *DSC_RESTRICT b, int stride_b, 56 | f32 *DSC_RESTRICT c); -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | AccessModifierOffset: -4 3 | AlignAfterOpenBracket: Align 4 | AlignConsecutiveAssignments: None 5 | AlignOperands: Align 6 | AllowAllArgumentsOnNextLine: false 7 | AllowAllConstructorInitializersOnNextLine: false 8 | AllowAllParametersOfDeclarationOnNextLine: false 9 | AllowShortBlocksOnASingleLine: Always 10 | AllowShortCaseLabelsOnASingleLine: false 11 | AllowShortFunctionsOnASingleLine: All 12 | AllowShortIfStatementsOnASingleLine: Always 13 | AllowShortLambdasOnASingleLine: All 14 | AllowShortLoopsOnASingleLine: true 15 | AlwaysBreakAfterReturnType: None 16 | AlwaysBreakTemplateDeclarations: Yes 17 | BreakBeforeBraces: Custom 18 | BraceWrapping: 19 | AfterCaseLabel: false 20 | AfterClass: false 21 | AfterControlStatement: Never 22 | AfterEnum: false 23 | AfterFunction: false 24 | AfterNamespace: false 25 | AfterUnion: false 26 | BeforeCatch: false 27 | BeforeElse: false 28 | IndentBraces: false 29 | SplitEmptyFunction: false 30 | SplitEmptyRecord: true 31 | BreakBeforeBinaryOperators: None 32 | BreakBeforeTernaryOperators: true 33 | BreakConstructorInitializers: BeforeColon 34 | BreakInheritanceList: BeforeColon 35 | ColumnLimit: 0 36 | CompactNamespaces: false 37 | ContinuationIndentWidth: 8 38 | IndentCaseLabels: true 39 | IndentPPDirectives: None 40 | IndentWidth: 4 41 | KeepEmptyLinesAtTheStartOfBlocks: true 42 | MaxEmptyLinesToKeep: 2 43 | NamespaceIndentation: None 44 | ObjCSpaceAfterProperty: false 45 | ObjCSpaceBeforeProtocolList: true 46 | PointerAlignment: Right 47 | ReflowComments: false 48 | SpaceAfterCStyleCast: true 49 | SpaceAfterLogicalNot: false 50 | SpaceAfterTemplateKeyword: false 51 | SpaceBeforeAssignmentOperators: true 52 | SpaceBeforeCpp11BracedList: false 53 | SpaceBeforeCtorInitializerColon: true 54 | SpaceBeforeInheritanceColon: true 55 | SpaceBeforeParens: ControlStatements 56 | SpaceBeforeRangeBasedForLoopColon: false 57 | SpaceInEmptyParentheses: false 58 | 
SpacesBeforeTrailingComments: 0 59 | SpacesInAngles: false 60 | SpacesInCStyleCastParentheses: false 61 | SpacesInContainerLiterals: false 62 | SpacesInParentheses: false 63 | SpacesInSquareBrackets: false 64 | TabWidth: 4 65 | UseTab: Never 66 | -------------------------------------------------------------------------------- /python/dsc/dtype.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | 7 | from enum import Enum 8 | import numpy as np 9 | from ctypes import POINTER, c_float, c_double, c_bool, c_int32, c_uint16 10 | from typing import Union 11 | 12 | 13 | ScalarType = Union[bool, int, float] 14 | 15 | 16 | class Dtype(Enum): 17 | BOOL = 0 18 | I32 = 1 19 | BF16 = 2 20 | F32 = 3 21 | F64 = 4 22 | 23 | def __repr__(self) -> str: 24 | return TYPENAME_LOOKUP[self] 25 | 26 | def __str__(self) -> str: 27 | return repr(self) 28 | 29 | @staticmethod 30 | def from_string(val: str) -> 'Dtype': 31 | return TYPENAME_REVERSE_LOOKUP[val.lower()] 32 | 33 | bool_ = Dtype.BOOL 34 | i32 = Dtype.I32 35 | bf16 = Dtype.BF16 36 | f32 = Dtype.F32 37 | f64 = Dtype.F64 38 | 39 | DTYPE_VALUE_LOOKUP = {val.value: val for val in Dtype.__members__.values()} 40 | 41 | TYPENAME_LOOKUP = { 42 | Dtype.BOOL: 'bool', 43 | Dtype.I32: 'i32', 44 | Dtype.BF16: 'bf16', 45 | Dtype.F32: 'f32', 46 | Dtype.F64: 'f64', 47 | } 48 | 49 | TYPENAME_REVERSE_LOOKUP = {v: k for k, v in TYPENAME_LOOKUP.items()} 50 | 51 | DTYPE_TO_CTYPE = { 52 | Dtype.BOOL: POINTER(c_bool), 53 | Dtype.I32: POINTER(c_int32), 54 | Dtype.BF16: POINTER(c_uint16), 55 | Dtype.F32: POINTER(c_float), 56 | Dtype.F64: POINTER(c_double), 57 | } 58 | 59 | DTYPE_SIZE = { 60 | Dtype.BOOL: 1, 61 | Dtype.I32: 4, 62 | Dtype.BF16: 2, 63 | Dtype.F32: 4, 64 | Dtype.F64: 8, 65 | } 66 | 67 | NP_TO_DTYPE = { 68 | np.dtype(np.bool): Dtype.BOOL, 69 | np.dtype(np.int32): Dtype.I32, 70 | np.dtype(np.float16): Dtype.BF16, # TODO: this is wrong! NumPy doesn't support BF16 71 | np.dtype(np.float32): Dtype.F32, 72 | np.dtype(np.float64): Dtype.F64, 73 | } 74 | 75 | DTYPE_TO_NP = {val: key for key, val in NP_TO_DTYPE.items()} 76 | 77 | DTYPE_CONVERSION_TABLES = [ 78 | [Dtype.BOOL, Dtype.I32, Dtype.F32, Dtype.F32, Dtype.F64], 79 | [Dtype.BOOL, Dtype.I32, Dtype.F32, Dtype.F32, Dtype.F64], 80 | [Dtype.F32, Dtype.F32, Dtype.F32, Dtype.F32, Dtype.F64], 81 | [Dtype.F32, Dtype.F32, Dtype.F32, Dtype.F32, Dtype.F64], 82 | [Dtype.F64, Dtype.F64, Dtype.F64, Dtype.F64, Dtype.F64], 83 | ] 84 | -------------------------------------------------------------------------------- /python/dsc/nn/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | # 7 | # This code is licensed under the terms of the 3-clause BSD license 8 | # (https://opensource.org/license/bsd-3-clause). 
9 | 10 | from typing import Dict, Optional 11 | from ..tensor import Tensor, frombuffer 12 | from ..dtype import Dtype 13 | import struct, pathlib, os, hashlib, urllib.request, json, ctypes 14 | from tqdm import tqdm 15 | 16 | 17 | def _fetch(url: str, invalidate_cache: bool = False) -> pathlib.Path: 18 | cache_dir = pathlib.Path.home() / '.cache' / 'dsc' / 'blob' 19 | if invalidate_cache and cache_dir.exists(): 20 | os.removedirs(cache_dir) 21 | 22 | cache_dir.mkdir(parents=True, exist_ok=True) 23 | fp = cache_dir / hashlib.md5(url.encode('utf-8')).hexdigest() 24 | if not fp.exists(): 25 | with urllib.request.urlopen(url, timeout=10) as r: 26 | assert r.status == 200 27 | pbar = tqdm(total=r.length, unit='B', unit_scale=True, desc=url) 28 | with open(fp, mode="w+b") as f: 29 | while chunk := r.read(8192): 30 | pbar.update(f.write(chunk)) 31 | return fp 32 | 33 | 34 | def safe_load(url: str, invalidate_cache: bool = False, 35 | trim_prefix: Optional[str] = None, 36 | use_dtype: Optional[Dtype] = None) -> Dict[str, Tensor]: 37 | fp = _fetch(url, invalidate_cache) 38 | b = fp.read_bytes() 39 | n = struct.unpack_from(' 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | 7 | from ._bindings import _dsc_tracing_enabled, _dsc_insert_trace, _dsc_dump_traces 8 | from .context import _get_ctx 9 | from time import perf_counter 10 | from http.server import SimpleHTTPRequestHandler 11 | import socketserver 12 | from functools import wraps 13 | import atexit 14 | import os 15 | 16 | 17 | def _is_tracing_enabled() -> bool: 18 | return bool(_dsc_tracing_enabled()) 19 | 20 | 21 | @atexit.register 22 | def _dump_traces(): 23 | if not _is_tracing_enabled(): 24 | return 25 | 26 | _dsc_dump_traces(_get_ctx()) 27 | trace_value = int(os.getenv('TRACE', '0')) 28 | if trace_value >= 2: 29 | _serve_traces() 30 | 31 | 32 | def trace(name: str): 33 | def _decorator(func): 34 | if not _is_tracing_enabled(): 35 | return func 36 | 37 | # Encode name and cat once 38 | name_ = name.encode('ascii') 39 | @wraps(func) 40 | def _wrapper(*args, **kwargs): 41 | start_us = int(perf_counter() * 1e6) 42 | res = func(*args, **kwargs) 43 | end_us = int(perf_counter() * 1e6) 44 | _dsc_insert_trace(_get_ctx(), name_, start_us, end_us - start_us) 45 | return res 46 | return _wrapper 47 | return _decorator 48 | 49 | 50 | class _PerfettoServer(SimpleHTTPRequestHandler): 51 | def log_message(self, format, *args): 52 | # Suppress the output of the HTTP server 53 | pass 54 | 55 | def end_headers(self): 56 | self.send_header('Access-Control-Allow-Origin', '*') 57 | return super().end_headers() 58 | 59 | def do_GET(self): 60 | self.server.last_request = self.path # pyright: ignore[reportAttributeAccessIssue] 61 | return super().do_GET() 62 | 63 | def do_POST(self): 64 | self.send_error(404, 'File not found') 65 | 66 | 67 | def _serve_traces(): 68 | # Taken from https://github.com/jax-ml/jax 69 | port = 9001 70 | socketserver.TCPServer.allow_reuse_address = True 71 | with socketserver.TCPServer(('127.0.0.1', port), _PerfettoServer) as httpd: 72 | url = f'https://ui.perfetto.dev/#!/?url=http://127.0.0.1:{port}/traces.json' 73 | print(f'Open URL in browser: {url}') 74 | 75 | while httpd.__dict__.get('last_request') != '/traces.json': 76 | httpd.handle_request() 77 | -------------------------------------------------------------------------------- /dsc/src/cpu/dsc_device_cpu.cpp: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #include "cpu/dsc_blas.h" 8 | #include "cpu/dsc_tracing.h" 9 | #include "dsc_device.h" 10 | #include 11 | 12 | #if defined(_WIN32) 13 | # include 14 | 15 | # define dsc_aligned_alloc(ALIGN, SIZE) _aligned_malloc(SIZE, ALIGN) 16 | # define dsc_aligned_free(PTR) _aligned_free(PTR) 17 | #else 18 | # include 19 | 20 | # define dsc_aligned_alloc(ALIGN, SIZE) aligned_alloc(ALIGN, SIZE) 21 | # define dsc_aligned_free(PTR) free(PTR) 22 | #endif 23 | 24 | #define DSC_DEVICE_CPU_ALIGN ((usize) 4096) 25 | 26 | static void cpu_memcpy(void *dst, const void *src, const usize nb, dsc_memcpy_dir) { 27 | memcpy(dst, src, nb); 28 | } 29 | 30 | static void cpu_memset(void *dst, const int c, const usize nb) { 31 | memset(dst, c, nb); 32 | } 33 | 34 | static void cpu_dispose(dsc_device *dev) { 35 | dsc_aligned_free(dev->device_mem); 36 | dsc_blas_ctx *blas_ctx = (dsc_blas_ctx *) dev->extra_info; 37 | dsc_blas_destroy(blas_ctx); 38 | 39 | dsc_cpu_tracing_dispose(dev->trace_ctx); 40 | 41 | DSC_LOG_INFO("%s device disposed", DSC_DEVICE_NAMES[dev->type]); 42 | } 43 | 44 | dsc_device *dsc_cpu_device(const usize mem_size) { 45 | static dsc_device dev = { 46 | .used_nodes = {}, 47 | .free_nodes = {}, 48 | .head = {}, 49 | .device_mem = {}, 50 | .alignment = 64, // I don't know about this... 51 | .extra_info = dsc_blas_init(), 52 | .trace_ctx = dsc_cpu_tracing_init(), 53 | .mem_size = DSC_ALIGN(mem_size, DSC_DEVICE_CPU_ALIGN), 54 | .used_mem = 0, 55 | .type = CPU, 56 | .memcpy = cpu_memcpy, 57 | .memset = cpu_memset, 58 | .dispose = cpu_dispose, 59 | .next_trace = dsc_cpu_next_trace, 60 | .dump_trace = dsc_cpu_tracing_dump, 61 | .dump_json_metadata = dsc_cpu_dump_json_metadata 62 | }; 63 | 64 | dev.device_mem = dsc_aligned_alloc(DSC_DEVICE_CPU_ALIGN, dev.mem_size); 65 | DSC_ASSERT(dev.device_mem != nullptr); 66 | 67 | dev.free_nodes[0].size = dev.mem_size; 68 | dev.free_nodes[0].data = dev.device_mem; 69 | dev.free_nodes[0].next = nullptr; 70 | 71 | dev.head = &dev.free_nodes[0]; 72 | 73 | DSC_LOG_INFO("%s device initialized with a buffer of %ldMB", 74 | DSC_DEVICE_NAMES[dev.type], 75 | (usize) DSC_B_TO_MB(dev.mem_size)); 76 | 77 | return &dev; 78 | } -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | on: 3 | pull_request: 4 | branches: 5 | - main 6 | workflow_dispatch: 7 | 8 | 9 | jobs: 10 | test: 11 | strategy: 12 | matrix: 13 | include: 14 | - host: ubuntu-latest 15 | device: cpu 16 | - host: self-hosted-amd-gpu 17 | device: gpu 18 | - host: self-hosted-nvidia-gpu 19 | device: gpu 20 | runs-on: ${{ matrix.host }} 21 | 22 | steps: 23 | - name: Set env 24 | run: | 25 | echo "DEVICE=${{ matrix.device }}" >> $GITHUB_ENV 26 | - name: Checkout Code 27 | uses: actions/checkout@v4 28 | - name: Set up Python 3.10 29 | # Workaround until this gets moved to a proper server 30 | if: matrix.host != 'self-hosted-nvidia-gpu' 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: '3.10' 34 | - name: Install dependencies 35 | if: matrix.host != 'self-hosted-nvidia-gpu' 36 | run: | 37 | sudo apt-get update 38 | sudo apt-get install -y build-essential 39 | - name: 
Install DSC with requirements 40 | if: matrix.host == 'ubuntu-latest' 41 | run: | 42 | pip install -e . 43 | pip install -r requirements.txt 44 | - name: Install DSC with requirements (NVIDIA) 45 | if: matrix.host == 'self-hosted-nvidia-gpu' 46 | run: | 47 | python3 -m venv venv 48 | # Make sure venv stays active across tasks 49 | echo "${{ github.workspace }}/venv/bin" >> $GITHUB_PATH 50 | source venv/bin/activate 51 | pip install -e . 52 | pip install -r requirements.txt 53 | - name: Install DSC with requirements (AMD) 54 | if: matrix.host == 'self-hosted-amd-gpu' 55 | run: | 56 | # Don't install torch stable, we need the nightly build with ROCm 6.4 57 | sed -i 's/^torch/#torch/' requirements.txt 58 | pip install -e . 59 | pip install -r requirements.txt 60 | pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.4 61 | - name: Compile DSC C++ (CPU-only) 62 | if: matrix.device == 'cpu' 63 | run: make shared DSC_FAST=1 64 | - name: Compile DSC C++ (GPU) 65 | if: matrix.device == 'gpu' 66 | run: make shared DSC_FAST=1 DSC_GPU=1 67 | - name: Run common ops tests on ${{ matrix.device }} 68 | run: | 69 | cd python/tests/ 70 | pytest -s test_ops_common.py --no-header --no-summary -q 71 | - name: Run ops tests on CPU 72 | if: matrix.device == 'cpu' 73 | run: | 74 | cd python/tests/ 75 | pytest -s test_ops_cpu.py --no-header --no-summary -q 76 | - name: Run GPU ops tests 77 | if: matrix.device == 'gpu' 78 | run: | 79 | cd python/tests/ 80 | pytest -s test_ops_gpu.py --no-header --no-summary -q 81 | - name: Run indexing tests on ${{ matrix.device }} 82 | run: | 83 | cd python/tests/ 84 | pytest -s test_indexing.py --no-header --no-summary -q -------------------------------------------------------------------------------- /dsc/include/dsc_dtype.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 
6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #if defined(__HIPCC__) && defined(DSC_BF16) 15 | # include 16 | #endif 17 | 18 | #if defined(__NVCC__) && defined(DSC_BF16) 19 | # include 20 | #endif 21 | 22 | 23 | #define DSC_DTYPES ((int) 5) 24 | #define DSC_DEFAULT_TYPE F32 25 | 26 | 27 | using i8 = int8_t; 28 | using i16 = int16_t; 29 | using i32 = int32_t; 30 | using i64 = int64_t; 31 | using u8 = uint8_t; 32 | using u16 = uint16_t; 33 | using u32 = uint32_t; 34 | using u64 = uint64_t; 35 | using size = ptrdiff_t; 36 | using usize = size_t; 37 | using byte = char; 38 | using f32 = float; 39 | using f64 = double; 40 | #if defined(__HIPCC__) && defined(DSC_BF16) 41 | using bf16 = __hip_bfloat16; 42 | #elif defined(__NVCC__) && defined(DSC_BF16) 43 | using bf16 = __nv_bfloat16; 44 | #else 45 | using bf16 = u16; 46 | #endif 47 | 48 | enum dsc_dtype : u8 { 49 | BOOL, 50 | I32, 51 | BF16, 52 | F32, 53 | F64, 54 | }; 55 | 56 | static constexpr usize DSC_DTYPE_SIZE[DSC_DTYPES] = { 57 | sizeof(bool), 58 | sizeof(i32), 59 | sizeof(bf16), 60 | sizeof(f32), 61 | sizeof(f64), 62 | }; 63 | 64 | static constexpr const char *DSC_DTYPE_NAMES[DSC_DTYPES] = { 65 | "bool", 66 | "i32", 67 | "bf16", 68 | "f32", 69 | "f64", 70 | }; 71 | 72 | 73 | // Conversion utility 74 | template 75 | struct dsc_type_mapping; 76 | 77 | template<> 78 | struct dsc_type_mapping { 79 | static constexpr dsc_dtype value = BOOL; 80 | }; 81 | 82 | template<> 83 | struct dsc_type_mapping { 84 | static constexpr dsc_dtype value = I32; 85 | }; 86 | 87 | template<> 88 | struct dsc_type_mapping { 89 | static constexpr dsc_dtype value = BF16; 90 | }; 91 | 92 | template<> 93 | struct dsc_type_mapping { 94 | static constexpr dsc_dtype value = F32; 95 | }; 96 | 97 | template<> 98 | struct dsc_type_mapping { 99 | static constexpr dsc_dtype value = F64; 100 | }; 101 | 102 | template 103 | consteval bool dsc_is_type() { 104 | return std::is_same_v; 105 | } 106 | 107 | template 108 | consteval bool dsc_is_real() { 109 | return dsc_is_type() || dsc_is_type() || dsc_is_type(); 110 | } 111 | 112 | template 113 | consteval T dsc_inf() { 114 | constexpr T sign = positive ? 1 : -1; 115 | 116 | if constexpr (dsc_is_type()) { 117 | return sign * std::numeric_limits::infinity(); 118 | } else if constexpr (dsc_is_type()) { 119 | return sign * std::numeric_limits::infinity(); 120 | } else { 121 | static_assert("T is not supported"); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /dsc/src/gpu/dsc_device_gpu.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #include "gpu/dsc_gpu.h" 8 | #include "gpu/dsc_tracing.h" 9 | #include "dsc_device.h" 10 | 11 | // As per https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses 12 | // "Any address of a variable residing in global memory or returned by one of the memory allocation routines 13 | // from the driver or runtime API is always aligned to at least 256 bytes." 
14 | #define DSC_DEVICE_GPU_ALIGN ((usize) 256) 15 | #define DSC_MEMCPY_DIRECTIONS ((int) 4) 16 | 17 | static constexpr gpu_memcpy_kind DSC_GPU_MEMCPY_DIRECTIONS[DSC_MEMCPY_DIRECTIONS] = { 18 | gpu_memcpy_default, 19 | gpu_memcpy_device_2_host, 20 | gpu_memcpy_host_2_device, 21 | gpu_memcpy_device_2_device, 22 | }; 23 | 24 | static DSC_GPU_KERNEL void k_init_random(gpu_rand_state *state) { 25 | DSC_GPU_TID(); 26 | gpu_init_rand(clock64(), tid, 0, &state[tid]); 27 | } 28 | 29 | static void gpu_memcpy_wrapper(void *dst, const void *src, const usize nb, const dsc_memcpy_dir dir) { 30 | DSC_GPU_CHECK(gpu_memcpy(dst, src, nb, DSC_GPU_MEMCPY_DIRECTIONS[dir])); 31 | } 32 | 33 | static void gpu_memset_wrapper(void *dst, const int c, const usize nb) { 34 | DSC_GPU_CHECK(gpu_memset(dst, c, nb)); 35 | } 36 | 37 | static void gpu_dispose(dsc_device *dev) { 38 | DSC_GPU_CHECK(gpu_free(dev->device_mem)); 39 | 40 | const dsc_gpu_dev_info *info = (dsc_gpu_dev_info *) dev->extra_info; 41 | DSC_GPU_BLAS_CHECK(gpu_blas_destroy(info->blas_handle)); 42 | 43 | DSC_GPU_CHECK(gpu_free(info->rand_state)); 44 | 45 | dsc_gpu_tracing_dispose(dev->trace_ctx); 46 | 47 | DSC_LOG_INFO("%s:%s:%d device %s disposed", 48 | DSC_DEVICE_NAMES[dev->type], 49 | DSC_GPU_PLATFORM_NAMES[DSC_GPU_PLATFORM], 50 | info->dev_idx, 51 | info->name); 52 | } 53 | 54 | dsc_device *dsc_gpu_device(usize mem_size, const int dev_idx) { 55 | static dsc_gpu_dev_info extra = { 56 | .name = {}, 57 | .rand_state = {}, 58 | .blas_handle = {}, 59 | .dev_idx = dev_idx, 60 | .platform = DSC_GPU_PLATFORM, 61 | }; 62 | DSC_GPU_BLAS_CHECK(gpu_blas_create(&extra.blas_handle)); 63 | 64 | // Allocate 90% of the device memory at most (is this too much?) 65 | const usize max_mem = (usize) (0.9 * (f64) dsc_gpu_dev_mem(dev_idx)); 66 | mem_size = mem_size < max_mem ? 
mem_size : DSC_ALIGN(max_mem - (DSC_DEVICE_GPU_ALIGN - 1), DSC_DEVICE_GPU_ALIGN); 67 | static dsc_device dev = { 68 | .used_nodes = {}, 69 | .free_nodes = {}, 70 | .head = {}, 71 | .device_mem = {}, 72 | .alignment = DSC_DEVICE_GPU_ALIGN, 73 | .extra_info = &extra, 74 | .trace_ctx = dsc_gpu_tracing_init(), 75 | .mem_size = DSC_ALIGN(mem_size, DSC_DEVICE_GPU_ALIGN), 76 | .used_mem = 0, 77 | .type = GPU, 78 | .memcpy = gpu_memcpy_wrapper, 79 | .memset = gpu_memset_wrapper, 80 | .dispose = gpu_dispose, 81 | .next_trace = dsc_gpu_next_trace, 82 | .dump_trace = dsc_gpu_tracing_dump, 83 | .dump_json_metadata = dsc_gpu_dump_json_metadata, 84 | }; 85 | 86 | DSC_GPU_CHECK(gpu_set_device(dev_idx)); 87 | 88 | dsc_gpu_dev_name(dev_idx, extra.name); 89 | 90 | DSC_GPU_CHECK(gpu_malloc(&extra.rand_state, DSC_GPU_DEFAULT_THREADS * sizeof(gpu_rand_state))); 91 | 92 | k_init_random<<<1, DSC_GPU_DEFAULT_THREADS>>>(extra.rand_state); 93 | 94 | dsc_gpu_sync(); 95 | 96 | DSC_GPU_CHECK(gpu_malloc(&dev.device_mem, dev.mem_size)); 97 | 98 | dev.free_nodes[0].size = dev.mem_size; 99 | dev.free_nodes[0].data = dev.device_mem; 100 | dev.free_nodes[0].next = nullptr; 101 | 102 | dev.head = &dev.free_nodes[0]; 103 | 104 | DSC_LOG_INFO("%s:%s:%d device %s initialized with a buffer of %ldMB (total: %ldMB)", 105 | DSC_DEVICE_NAMES[dev.type], 106 | DSC_GPU_PLATFORM_NAMES[DSC_GPU_PLATFORM], 107 | dev_idx, 108 | extra.name, 109 | (usize) DSC_B_TO_MB(dev.mem_size), 110 | (usize) DSC_B_TO_MB(dsc_gpu_dev_mem(dev_idx))); 111 | 112 | return &dev; 113 | } -------------------------------------------------------------------------------- /dsc/include/gpu/platform/dsc_cuda_platform.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 
6 | 7 | #pragma once 8 | 9 | #include 10 | 11 | #pragma GCC diagnostic push 12 | #pragma GCC diagnostic ignored "-Wshadow" 13 | #pragma GCC diagnostic ignored "-Wdouble-promotion" 14 | #include 15 | #pragma GCC diagnostic pop 16 | 17 | #define DSC_GPU_PLATFORM CUDA 18 | 19 | #define DSC_GPU_CHECK(err) \ 20 | do { \ 21 | if (err != cudaSuccess) { \ 22 | DSC_LOG_FATAL("CUDA error: %s", cudaGetErrorName(err)); \ 23 | } \ 24 | } while (0) 25 | 26 | #define DSC_GPU_BLAS_CHECK(err) \ 27 | do { \ 28 | if (err != CUBLAS_STATUS_SUCCESS) { \ 29 | DSC_LOG_FATAL("cuBLAS error: %s", cublasGetStatusString(err)); \ 30 | } \ 31 | } while (0) 32 | 33 | 34 | // ============================================================ 35 | // Runtime API 36 | // 37 | 38 | #define gpu_get_device_count cudaGetDeviceCount 39 | #define gpu_get_device_properties cudaGetDeviceProperties 40 | #define gpu_device_sync cudaDeviceSynchronize 41 | 42 | #define gpu_malloc cudaMalloc 43 | #define gpu_free cudaFree 44 | #define gpu_memcpy cudaMemcpy 45 | #define gpu_memset cudaMemset 46 | #define gpu_set_device cudaSetDevice 47 | 48 | #define gpu_memcpy_default cudaMemcpyDefault 49 | #define gpu_memcpy_device_2_host cudaMemcpyDeviceToHost 50 | #define gpu_memcpy_host_2_device cudaMemcpyHostToDevice 51 | #define gpu_memcpy_device_2_device cudaMemcpyDeviceToDevice 52 | 53 | using gpu_memcpy_kind = cudaMemcpyKind; 54 | using gpu_device_props = cudaDeviceProp; 55 | 56 | // ============================================================ 57 | // Rand API 58 | // 59 | 60 | #define gpu_init_rand curand_init 61 | #define gpu_sample_normalf curand_normal 62 | #define gpu_sample_normal curand_normal_double 63 | 64 | using gpu_rand_state = curandState; 65 | 66 | // ============================================================ 67 | // BLAS API 68 | // 69 | 70 | #define gpu_blas_create cublasCreate 71 | #define gpu_blas_destroy cublasDestroy 72 | #define gpu_blas_sgemm cublasSgemm 73 | #define gpu_blas_dgemm cublasDgemm 74 | #define gpu_blas_op cublasOperation_t 75 | #define gpu_blas_dtype cudaDataType 76 | #define GPU_BLAS_OP_T CUBLAS_OP_T 77 | #define GPU_BLAS_OP_N CUBLAS_OP_N 78 | #define GPU_GEMM_DTYPE_BF16 CUDA_R_16BF 79 | #define GPU_GEMM_DTYPE_F32 CUDA_R_32F 80 | 81 | using gpu_blas_handle = cublasHandle_t; 82 | 83 | 84 | static DSC_INLINE cublasStatus_t gpu_blas_bfgemm(const gpu_blas_handle handle, const gpu_blas_op a_op, const gpu_blas_op b_op, 85 | const int m, const int n, const int k, const void *DSC_RESTRICT alpha, 86 | const void *DSC_RESTRICT xa, const gpu_blas_dtype a_dtype, const int stride_a, 87 | const void *DSC_RESTRICT xb, const gpu_blas_dtype b_dtype, const int stride_b, 88 | const void *DSC_RESTRICT beta, void *out, const gpu_blas_dtype out_dtype, 89 | const int stride_out, const gpu_blas_dtype compute_dtype) { 90 | return cublasGemmEx(handle, a_op, b_op, m, n, k, 91 | alpha, xa, a_dtype, stride_a, 92 | xb, b_dtype, stride_b, beta, 93 | out, out_dtype, stride_out, 94 | compute_dtype, CUBLAS_GEMM_DEFAULT); 95 | } 96 | 97 | // ============================================================ 98 | // Event API 99 | // 100 | 101 | #define gpu_event_create cudaEventCreate 102 | #define gpu_event_destroy cudaEventDestroy 103 | #define gpu_event_record cudaEventRecord 104 | #define gpu_event_synchronize cudaEventSynchronize 105 | #define gpu_event_elapsed cudaEventElapsedTime 106 | 107 | using gpu_event = cudaEvent_t; 108 | -------------------------------------------------------------------------------- 
/dsc/include/gpu/platform/dsc_hip_platform.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | #include 10 | #pragma GCC diagnostic push 11 | #pragma GCC diagnostic ignored "-Wdouble-promotion" 12 | #include 13 | #pragma GCC diagnostic pop 14 | 15 | #include 16 | 17 | 18 | #define DSC_GPU_PLATFORM ROCM 19 | 20 | #define DSC_GPU_CHECK(err) \ 21 | do { \ 22 | if (err != hipSuccess) { \ 23 | DSC_LOG_FATAL("HIP error: %s", hipGetErrorString(err)); \ 24 | } \ 25 | } while (0) 26 | 27 | #define DSC_GPU_BLAS_CHECK(err) \ 28 | do { \ 29 | if (err != rocblas_status_success) { \ 30 | DSC_LOG_FATAL("rocBLAS error: %s", rocblas_status_to_string(err)); \ 31 | } \ 32 | } while (0) 33 | 34 | // ============================================================ 35 | // Runtime API 36 | // 37 | 38 | #define gpu_get_device_count hipGetDeviceCount 39 | #define gpu_get_device_properties hipGetDeviceProperties 40 | #define gpu_device_sync hipDeviceSynchronize 41 | 42 | #define gpu_malloc hipMalloc 43 | #define gpu_free hipFree 44 | #define gpu_memcpy hipMemcpy 45 | #define gpu_memset hipMemset 46 | #define gpu_set_device hipSetDevice 47 | 48 | #define gpu_memcpy_default hipMemcpyDefault 49 | #define gpu_memcpy_device_2_host hipMemcpyDeviceToHost 50 | #define gpu_memcpy_host_2_device hipMemcpyHostToDevice 51 | #define gpu_memcpy_device_2_device hipMemcpyDeviceToDevice 52 | 53 | using gpu_memcpy_kind = hipMemcpyKind; 54 | using gpu_device_props = hipDeviceProp_t; 55 | 56 | // ============================================================ 57 | // Rand API 58 | // 59 | 60 | #define gpu_init_rand rocrand_init 61 | #define gpu_sample_normalf rocrand_normal 62 | #define gpu_sample_normal rocrand_normal_double 63 | // Default for cuRAND 64 | using gpu_rand_state = rocrand_state_xorwow; 65 | 66 | // ============================================================ 67 | // BLAS API 68 | // 69 | 70 | #define gpu_blas_create rocblas_create_handle 71 | #define gpu_blas_destroy rocblas_destroy_handle 72 | #define gpu_blas_sgemm rocblas_sgemm 73 | #define gpu_blas_dgemm rocblas_dgemm 74 | #define gpu_blas_op rocblas_operation 75 | #define gpu_blas_dtype rocblas_datatype 76 | #define GPU_BLAS_OP_T rocblas_operation_transpose 77 | #define GPU_BLAS_OP_N rocblas_operation_none 78 | #define GPU_GEMM_DTYPE_BF16 rocblas_datatype_bf16_r 79 | #define GPU_GEMM_DTYPE_F32 rocblas_datatype_f32_r 80 | 81 | using gpu_blas_handle = rocblas_handle; 82 | 83 | static DSC_INLINE rocblas_status gpu_blas_bfgemm(const gpu_blas_handle handle, const gpu_blas_op a_op, const gpu_blas_op b_op, 84 | const int m, const int n, const int k, const void *DSC_RESTRICT alpha, 85 | const void *DSC_RESTRICT xa, const gpu_blas_dtype a_dtype, const int stride_a, 86 | const void *DSC_RESTRICT xb, const gpu_blas_dtype b_dtype, const int stride_b, 87 | const void *DSC_RESTRICT beta, void *out, const gpu_blas_dtype out_dtype, 88 | const int stride_out, const gpu_blas_dtype compute_dtype) { 89 | return rocblas_gemm_ex(handle, a_op, b_op, m, n, k, 90 | alpha, xa, a_dtype, stride_a, 91 | xb, b_dtype, stride_b, beta, 92 | out, out_dtype, stride_out, 93 | out, out_dtype, stride_out, 94 | compute_dtype, rocblas_gemm_algo_standard, 0, 0); 95 | } 96 | 97 | // ============================================================ 98 
| // Event API 99 | // 100 | 101 | #define gpu_event_create hipEventCreate 102 | #define gpu_event_destroy hipEventDestroy 103 | #define gpu_event_record hipEventRecord 104 | #define gpu_event_synchronize hipEventSynchronize 105 | #define gpu_event_elapsed hipEventElapsedTime 106 | 107 | using gpu_event = hipEvent_t; 108 | -------------------------------------------------------------------------------- /.clang-tidy: -------------------------------------------------------------------------------- 1 | --- 2 | Checks: '-*, 3 | bugprone-argument-comment, 4 | bugprone-assert-side-effect, 5 | bugprone-bad-signal-to-kill-thread, 6 | bugprone-branch-clone, 7 | bugprone-copy-constructor-init, 8 | bugprone-dangling-handle, 9 | bugprone-dynamic-static-initializers, 10 | bugprone-fold-init-type, 11 | bugprone-forward-declaration-namespace, 12 | bugprone-forwarding-reference-overload, 13 | bugprone-inaccurate-erase, 14 | bugprone-incorrect-roundings, 15 | bugprone-integer-division, 16 | bugprone-lambda-function-name, 17 | bugprone-macro-repeated-side-effects, 18 | bugprone-misplaced-operator-in-strlen-in-alloc, 19 | bugprone-misplaced-pointer-arithmetic-in-alloc, 20 | bugprone-misplaced-widening-cast, 21 | bugprone-move-forwarding-reference, 22 | bugprone-multiple-statement-macro, 23 | bugprone-no-escape, 24 | bugprone-parent-virtual-call, 25 | bugprone-posix-return, 26 | bugprone-reserved-identifier, 27 | bugprone-sizeof-container, 28 | bugprone-sizeof-expression, 29 | bugprone-spuriously-wake-up-functions, 30 | bugprone-string-constructor, 31 | bugprone-string-integer-assignment, 32 | bugprone-string-literal-with-embedded-nul, 33 | bugprone-suspicious-enum-usage, 34 | bugprone-suspicious-include, 35 | bugprone-suspicious-memset-usage, 36 | bugprone-suspicious-missing-comma, 37 | bugprone-suspicious-semicolon, 38 | bugprone-suspicious-string-compare, 39 | bugprone-suspicious-memory-comparison, 40 | bugprone-suspicious-realloc-usage, 41 | bugprone-swapped-arguments, 42 | bugprone-terminating-continue, 43 | bugprone-throw-keyword-missing, 44 | bugprone-too-small-loop-variable, 45 | bugprone-undefined-memory-manipulation, 46 | bugprone-undelegated-constructor, 47 | bugprone-unhandled-self-assignment, 48 | bugprone-unused-raii, 49 | bugprone-unused-return-value, 50 | bugprone-use-after-move, 51 | bugprone-virtual-near-miss, 52 | cert-dcl21-cpp, 53 | cert-dcl58-cpp, 54 | cert-err52-cpp, 55 | cert-err60-cpp, 56 | cert-flp30-c, 57 | cert-str34-c, 58 | cppcoreguidelines-interfaces-global-init, 59 | cppcoreguidelines-narrowing-conversions, 60 | cppcoreguidelines-pro-type-member-init, 61 | cppcoreguidelines-pro-type-static-cast-downcast, 62 | cppcoreguidelines-slicing, 63 | google-default-arguments, 64 | google-runtime-operator, 65 | hicpp-exception-baseclass, 66 | hicpp-multiway-paths-covered, 67 | misc-misplaced-const, 68 | misc-new-delete-overloads, 69 | misc-non-copyable-objects, 70 | misc-throw-by-value-catch-by-reference, 71 | misc-unconventional-assign-operator, 72 | misc-uniqueptr-reset-release, 73 | modernize-avoid-bind, 74 | modernize-concat-nested-namespaces, 75 | modernize-deprecated-headers, 76 | modernize-deprecated-ios-base-aliases, 77 | modernize-make-shared, 78 | modernize-make-unique, 79 | modernize-pass-by-value, 80 | modernize-raw-string-literal, 81 | modernize-redundant-void-arg, 82 | modernize-replace-auto-ptr, 83 | modernize-replace-disallow-copy-and-assign-macro, 84 | modernize-replace-random-shuffle, 85 | modernize-return-braced-init-list, 86 | modernize-shrink-to-fit, 87 | 
modernize-unary-static-assert, 88 | modernize-use-bool-literals, 89 | modernize-use-emplace, 90 | modernize-use-equals-default, 91 | modernize-use-equals-delete, 92 | modernize-use-noexcept, 93 | modernize-use-nullptr, 94 | modernize-use-override, 95 | modernize-use-transparent-functors, 96 | modernize-use-uncaught-exceptions, 97 | mpi-buffer-deref, 98 | mpi-type-mismatch, 99 | openmp-use-default-none, 100 | performance-faster-string-find, 101 | performance-for-range-copy, 102 | performance-implicit-conversion-in-loop, 103 | performance-inefficient-algorithm, 104 | performance-inefficient-string-concatenation, 105 | performance-inefficient-vector-operation, 106 | performance-move-const-arg, 107 | performance-move-constructor-init, 108 | performance-no-automatic-move, 109 | performance-noexcept-move-constructor, 110 | performance-trivially-destructible, 111 | performance-type-promotion-in-math-fn, 112 | performance-unnecessary-copy-initialization, 113 | performance-unnecessary-value-param, 114 | portability-simd-intrinsics, 115 | readability-avoid-const-params-in-decls, 116 | readability-const-return-type, 117 | readability-container-size-empty, 118 | readability-convert-member-functions-to-static, 119 | readability-delete-null-pointer, 120 | readability-deleted-default, 121 | readability-inconsistent-declaration-parameter-name, 122 | readability-make-member-function-const, 123 | readability-misleading-indentation, 124 | readability-misplaced-array-index, 125 | readability-non-const-parameter, 126 | readability-redundant-control-flow, 127 | readability-redundant-declaration, 128 | readability-redundant-function-ptr-dereference, 129 | readability-redundant-smartptr-get, 130 | readability-redundant-string-cstr, 131 | readability-redundant-string-init, 132 | readability-simplify-subscript-expr, 133 | readability-static-accessed-through-instance, 134 | readability-static-definition-in-anonymous-namespace, 135 | readability-string-compare, 136 | readability-uniqueptr-delete-release, 137 | readability-use-anyofallof' -------------------------------------------------------------------------------- /IDEAS.md: -------------------------------------------------------------------------------- 1 | ## Code Gen 2 | I'm thinking about adding some table-driven code generation into dsc. 3 | Right now, if you want to add a new operation you have to go through the following steps: 4 | 5 | 1. Declare the API function in `dsc.h` 6 | 2. Implement the function (parameter validation + dispatching) in `dsc.cpp` 7 | 3. For each backend: 8 | 1. Declare the function that actually implements the core logic in `dsc_xxx.h` 9 | 2. Actually implement the core logic of the function in `dsc_xxx.cpp` 10 | 4. Implement the binding in `_bindings.py` 11 | 12 | The actual logic itself for the *majority* of operations is basically the same. 13 | Consider a generic binary operation, the flow is almost always: 14 | 1. Validation + dispatch 15 | ```c++ 16 | dsc_tensor *dsc_add(dsc_ctx *ctx, 17 | dsc_tensor *xa, 18 | dsc_tensor *xb, 19 | dsc_tensor *out) { 20 | validate_binary_params(); 21 | 22 | DSC_DISPATCH(xa->device, add, xa, xb, out); 23 | 24 | cleanup_binary(); 25 | 26 | return out; 27 | } 28 | ``` 29 | 2. Implementation 30 | ```c++ 31 | void dsc_cpu_add(dsc_device *, 32 | const dsc_tensor *xa, 33 | const dsc_tensor *xb, 34 | dsc_tensor *out) { 35 | binary_op(xa, xb, out, cpu_add_op()); 36 | } 37 | ``` 38 | The same goes for unary operations and for reductions. 
The only exceptions at the moment are probably 39 | the GEMM and the operations related to indexing and slicing. 40 | 41 | Also, most of the code in `dsc_ops.h` can be generated trivially. 42 | 43 | And then there are the aspects related to testing and benchmarking: using this kind of approach could also 44 | lead to better testing and benchmarking (i.e. when I specify a new operation I can also specify 'how it should be tested / benchmarked'). 45 | 46 | Another important point: with this approach it could be even easier to implement tracing. For example, 47 | if I want to use the current approach I can just define the parameters that are important and generate 48 | the macros and stuff. 49 | 50 | 51 | **Key things to keep in mind:** 52 | - The generated code must live alongside the handwritten code; I don't have a clear idea on how to do this, and the 53 | same goes for the bindings. For tests and benchmarks it's easier because I can actually put them in multiple files 54 | right away. 55 | - (this is related to the previous point) Limit file proliferation. I don't want the number of files to explode 56 | due to this feature. 57 | - **Generated code must be versioned** (i.e. pushed to GitHub) **and must be modifiable**, and the code generator 58 | must not erase these updates if it's re-run. 59 | 60 | 61 | ## Kernel Generation 62 | This is somewhat related to the previous point. The idea is that for 'high-level' kernels (i.e. kernels that are defined 63 | directly in Python) the overhead of going back and forth between Python and C++ can be significant. 64 | Take for example the softmax kernel: 65 | ```python 66 | @trace('softmax') 67 | def softmax(x: Tensor, axis: int = -1) -> Tensor: 68 | e = exp((x - max(x, axis=axis, keepdims=True))) 69 | sum_e = sum(e, axis=axis, keepdims=True) 70 | return e / sum_e 71 | ``` 72 | This is a correct softmax that uses native operations under the hood, which means each operation will produce a new tensor 73 | that Python must wrap and track and so on. Just by re-writing these same operations in C++ naively, without anything extra, 74 | we get a **~20% speedup**. The Python side will then be replaced by: 75 | ```python 76 | @trace('softmax') 77 | def softmax(x: Tensor, axis: int = -1) -> Tensor: 78 | return Tensor(_dsc_softmax(_get_ctx(), x.c_ptr, axis)) 79 | ``` 80 | 81 | The idea is to add a mechanism to do this sort of code generation automatically. There are a few things to keep in mind: 82 | - Kernels may depend on other 'high-level' kernels (i.e. LayerNorm depends on mean and var). 83 | - It should be possible to switch between the naive Python version and the generated version, e.g. when debugging. 84 | 85 | 86 | ## Memory Management 87 | It's clear that for model inference the general purpose allocator approach is not good enough. Right now, between allocations 88 | and de-allocations more than 6% of the total execution time is wasted managing memory. 89 | I don't know if using a proper arena for everything will work with the Python garbage collector but at least on the C++ side 90 | I should add that (opt-in?). 
91 | This would be even more important if I manage to implement native kernel generation because the softmax example above 92 | otherwise will turn out to look like this: 93 | ```c++ 94 | dsc_tensor *dsc_softmax(dsc_ctx *ctx, 95 | dsc_tensor *DSC_RESTRICT x, 96 | const int axis) { 97 | dsc_tensor *m = dsc_max(ctx, x, nullptr, axis); 98 | dsc_tensor *dif = dsc_sub(ctx, x, m); 99 | dsc_tensor *e = dsc_exp(ctx, dif); 100 | dsc_tensor *sum_e = dsc_sum(ctx, e, nullptr, axis); 101 | 102 | dsc_tensor *out = dsc_div(ctx, e, sum_e); 103 | 104 | dsc_tensor_free(ctx, m); 105 | dsc_tensor_free(ctx, dif); 106 | dsc_tensor_free(ctx, e); 107 | dsc_tensor_free(ctx, sum_e); 108 | return out; 109 | } 110 | ``` 111 | Which is very ugly and cumbersome. -------------------------------------------------------------------------------- /dsc/include/cpu/dsc_iter.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | #include "dsc.h" 10 | 11 | 12 | namespace internal::iter { 13 | // TODO: (4) 14 | template 15 | constexpr int compute_index(const int *DSC_RESTRICT idx, 16 | const int *DSC_RESTRICT stride) { 17 | // Note: computing the index on the fly is way easier than keeping track of the current index 18 | // and increasing/decreasing it after each step, but it requires some benchmarking! 19 | if constexpr (Cur == DSC_MAX_DIMS) { 20 | return 0; 21 | } else { 22 | return idx[Cur] * stride[Cur] + compute_index(idx, stride); 23 | } 24 | } 25 | } 26 | 27 | struct dsc_axis_iterator { 28 | dsc_axis_iterator(const dsc_tensor *x, 29 | const int axis, 30 | const int axis_n = -1) : 31 | x_(x), axis_(axis), 32 | axis_n_((axis_n < 0 || axis_n > x->shape[axis]) ? x->shape[axis] : axis_n) { 33 | } 34 | 35 | DSC_INLINE void next() { 36 | if (++idx_[axis_] < axis_n_) [[likely]] return; 37 | 38 | idx_[axis_] = 0; 39 | bool still_left = false; 40 | for (int i = DSC_MAX_DIMS - 1; i >= 0; --i) { 41 | if (i == axis_) continue; 42 | 43 | if (++idx_[i] < x_->shape[i]) [[likely]] { 44 | still_left = true; 45 | break; 46 | } 47 | idx_[i] = 0; 48 | // If this is the last dimension and we rolled then we're done 49 | end_ = i == 0; 50 | } 51 | // If we are iterating over axis 0 and we arrive here then we're done 52 | end_ |= axis_ == 0 && !still_left; 53 | } 54 | 55 | DSC_INLINE int index() const { 56 | return internal::iter::compute_index(idx_, x_->stride); 57 | } 58 | 59 | DSC_INLINE bool has_next() const { 60 | return !end_; 61 | } 62 | 63 | DSC_INLINE int pos(const int idx) const { 64 | return idx_[idx > 0 ? idx : DSC_MAX_DIMS + idx]; 65 | } 66 | 67 | private: 68 | int idx_[DSC_MAX_DIMS]{}; 69 | const dsc_tensor *DSC_RESTRICT x_; 70 | int axis_; 71 | int axis_n_; 72 | bool end_ = false; 73 | }; 74 | 75 | struct dsc_broadcast_iterator { 76 | dsc_broadcast_iterator(const dsc_tensor *x, const int *out_shape) : x_shape_(x->shape), 77 | x_stride_(x->stride), 78 | out_shape_(out_shape) { 79 | for (int i = 0; i < DSC_MAX_DIMS; ++i) { 80 | x_broadcast_stride_[i] = x_shape_[i] < out_shape_[i] ? 
0 : x_stride_[i]; 81 | } 82 | } 83 | 84 | // Simple strided iterator 85 | dsc_broadcast_iterator(const int *x_shape, const int *x_stride) : x_shape_(x_shape), 86 | x_stride_(x_stride), 87 | out_shape_(x_shape) { 88 | for (int i = 0; i < DSC_MAX_DIMS; ++i) { 89 | x_broadcast_stride_[i] = x_stride_[i]; 90 | } 91 | } 92 | 93 | DSC_INLINE void next() { 94 | for (int i = DSC_MAX_DIMS - 1; i >= 0; --i) { 95 | if (++x_idx_[i] < out_shape_[i]) [[likely]] { 96 | index_ += x_broadcast_stride_[i]; 97 | return; 98 | } 99 | // Rollover this dimension 100 | index_ -= (x_idx_[i] - 1) * x_broadcast_stride_[i]; 101 | x_idx_[i] = 0; 102 | } 103 | } 104 | 105 | DSC_INLINE int index() const { 106 | return index_; 107 | } 108 | 109 | private: 110 | int index_ = 0; 111 | const int *x_shape_, *x_stride_, *out_shape_; 112 | int x_broadcast_stride_[DSC_MAX_DIMS]{}, x_idx_[DSC_MAX_DIMS]{}; 113 | }; 114 | 115 | struct dsc_slice_iterator { 116 | dsc_slice_iterator(const dsc_tensor *x, const int n_slices, const dsc_slice *slices) : 117 | shape_(x->shape), stride_(x->stride), n_dim_(x->n_dim) { 118 | for (int i = 0; i < x->n_dim; ++i) { 119 | const int dim_idx = dsc_tensor_dim_idx(x, i); 120 | if (i < n_slices) { 121 | start_[dim_idx] = slices[i].start; 122 | stop_[dim_idx] = slices[i].stop; 123 | step_[dim_idx] = slices[i].step; 124 | } else { 125 | start_[dim_idx] = 0; 126 | stop_[dim_idx] = shape_[dim_idx]; 127 | step_[dim_idx] = 1; 128 | } 129 | 130 | idx_[dim_idx] = start_[dim_idx]; 131 | } 132 | } 133 | 134 | DSC_INLINE bool has_next() const { 135 | return !end_; 136 | } 137 | 138 | DSC_INLINE void next() { 139 | for (int i = DSC_MAX_DIMS - 1; i >= (DSC_MAX_DIMS - n_dim_); --i) { 140 | idx_[i] += step_[i]; 141 | if ((step_[i] > 0 && idx_[i] < stop_[i]) || 142 | (step_[i] < 0 && idx_[i] > stop_[i])) [[likely]] { 143 | return; 144 | } 145 | idx_[i] = start_[i]; 146 | } 147 | end_ = true; 148 | } 149 | 150 | DSC_INLINE int index() const { 151 | return internal::iter::compute_index<>(idx_, stride_); 152 | } 153 | 154 | private: 155 | const int *shape_; 156 | const int *stride_; 157 | int idx_[DSC_MAX_DIMS]{}; 158 | int start_[DSC_MAX_DIMS]{}; 159 | int stop_[DSC_MAX_DIMS]{}; 160 | int step_[DSC_MAX_DIMS]{}; 161 | const int n_dim_; 162 | bool end_ = false; 163 | }; -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++ 2 | NVCC = nvcc 3 | HIPCC = hipcc 4 | 5 | HIPCCFLAGS = -std=c++20 -I$(ROCM)/include -I./dsc/include/ --offload-arch=native -Wall -Wextra -Wformat \ 6 | -Wcast-qual -Wcast-align -Wstrict-aliasing -Wpointer-arith -Wunused -Wdouble-promotion \ 7 | -Wno-missing-braces -Wcast-align -fno-exceptions -fno-rtti 8 | NVCCFLAGS = -std=c++20 -I$(CUDA)/include -I./dsc/include/ -ccbin=$(CXX) -arch=native \ 9 | -forward-unknown-opts -Wall -Wextra -Wformat -Wnoexcept \ 10 | -Wcast-qual -Wcast-align -Wstrict-aliasing -Wpointer-arith -Wunused -Wdouble-promotion \ 11 | -Wlogical-op -Wcast-align -fno-exceptions -fno-rtti 12 | CXXFLAGS = -std=c++20 -I./dsc/include/ -Wall -Wextra -Wformat -Wnoexcept \ 13 | -Wcast-qual -Wcast-align -Wstrict-aliasing -Wpointer-arith -Wunused -Wdouble-promotion \ 14 | -Wlogical-op -Wcast-align -fno-exceptions -fno-rtti -pthread 15 | LDFLAGS = -lm 16 | 17 | UNAME_M := $(shell uname -m) 18 | UNAME_S := $(shell uname -s) 19 | 20 | ifdef DSC_GPU 21 | # Try to detect the GPU vendor based on the available compiler 22 | DSC_CUDA := $(shell which $(NVCC) 2>/dev/null) 23 | 
DSC_HIP := $(shell which $(HIPCC) 2>/dev/null) 24 | ifdef DSC_CUDA 25 | ifneq ($(wildcard /opt/cuda),) 26 | CUDA ?= /opt/cuda 27 | else 28 | CUDA ?= /usr/local/cuda 29 | endif 30 | else 31 | # Check for HIP only if CUDA is not defined 32 | ifdef DSC_HIP 33 | ifneq ($(wildcard /opt/rocm),) 34 | ROCM ?= /opt/rocm 35 | else 36 | ROCM ?= /usr/local/rocm 37 | endif 38 | endif 39 | endif 40 | endif 41 | 42 | # Make sure only one GPU platform is defined 43 | ifdef DSC_CUDA 44 | ifdef DSC_HIP 45 | $(error ERROR: both DSC_CUDA and DSC_HIP are defined - this is not supported) 46 | endif 47 | endif 48 | 49 | ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64)) 50 | # Use all available CPU extensions, x86 only 51 | CXXFLAGS += -march=native -mtune=native 52 | endif 53 | 54 | ifndef DSC_LOG_LEVEL 55 | ifdef DSC_FAST 56 | DSC_LOG_LEVEL := 1 57 | else 58 | DSC_LOG_LEVEL := 0 59 | endif 60 | endif 61 | 62 | CXXFLAGS += -DDSC_LOG_LEVEL=$(DSC_LOG_LEVEL) 63 | NVCCFLAGS += -DDSC_LOG_LEVEL=$(DSC_LOG_LEVEL) 64 | HIPCCFLAGS += -DDSC_LOG_LEVEL=$(DSC_LOG_LEVEL) 65 | 66 | ifdef DSC_FAST 67 | # -Ofast turns on all the unsafe math optimizations, including -ffinite-math-only this is an issue when testing 68 | # because Inf and NaN have different meaning but will be treated as equals when using -ffinite-math-only. 69 | # When inferencing assuming only finite numbers is correct but since it's doesn't actually hurt performance 70 | # let's keep this flag so we can run our tests without worrying about denormal numbers. 71 | CXXFLAGS += -Ofast -fno-finite-math-only -ffp-contract=fast -funroll-loops -flto=auto -fuse-linker-plugin 72 | NVCCFLAGS += -O3 73 | HIPCCFLAGS += -O3 74 | else 75 | CXXFLAGS += -O0 -fno-omit-frame-pointer -g 76 | NVCCFLAGS += -O0 -fno-omit-frame-pointer -g -G 77 | HIPCCFLAGS += -O0 -fno-omit-frame-pointer -g 78 | endif 79 | 80 | ifdef DSC_TRACING 81 | CXXFLAGS += -DDSC_TRACING=1 82 | NVCCFLAGS += -DDSC_TRACING=1 83 | HIPCCFLAGS += -DDSC_TRACING=1 84 | endif 85 | 86 | # If we are not compiling the shared object and are in debug mode then run in ASAN mode 87 | ifeq ($(MAKECMDGOALS),shared) 88 | CXXFLAGS += -fPIC 89 | NVCCFLAGS += -fPIC 90 | HIPCCFLAGS += -fPIC 91 | endif 92 | 93 | GPU_SRCS := $(wildcard dsc/src/gpu/*.cpp) 94 | GPU_OBJS := $(GPU_SRCS:.cpp=.o) 95 | 96 | # Enable CUDA support 97 | ifdef DSC_CUDA 98 | # BF16 is supported in compute capability >= 8.0 (Ampere) 99 | HAS_BF16_GPU := $(shell compute_major=$$(nvidia-smi --query-gpu=compute_cap --format=noheader | cut -d. -f1); \ 100 | if [ "$${compute_major}" -ge 8 ]; then echo 1; fi) 101 | ifeq ($(HAS_BF16_GPU), 1) 102 | NVCCFLAGS += -DDSC_BF16 103 | CXXFLAGS += -DDSC_BF16 104 | endif 105 | 106 | CXXFLAGS += -I$(CUDA)/include -DDSC_CUDA 107 | NVCCFLAGS += -x cu -DDSC_CUDA 108 | LDFLAGS += -L$(CUDA)/lib64 -lcudart -lcublas 109 | 110 | OBJS += $(GPU_OBJS) 111 | 112 | $(GPU_OBJS): %.o: %.cpp 113 | $(NVCC) $(NVCCFLAGS) -c $< -o $@ 114 | endif 115 | 116 | # Enable HIP support 117 | ifdef DSC_HIP 118 | GPU_TARGETS := $(shell ${ROCM_PATH}/bin/rocm_agent_enumerator) 119 | HAS_BF16_GPU := $(shell echo '${GPU_TARGETS}' | grep -q -E "gfx90a|gfx94[0-2]|gfx103[0-6]" && echo 1) 120 | ifeq ($(HAS_BF16_GPU), 1) 121 | HIPCCFLAGS += -DDSC_BF16 122 | CXXFLAGS += -DDSC_BF16 123 | endif 124 | 125 | # TODO: is -D__HIP_PLATFORM_AMD__ required? 
126 | CXXFLAGS += -I$(ROCM)/include -DDSC_HIP -D__HIP_PLATFORM_AMD__ 127 | HIPCCFLAGS += -DDSC_HIP 128 | LDFLAGS += -L$(ROCM)/lib -lamdhip64 -lrocrand -lrocblas 129 | 130 | OBJS += $(GPU_OBJS) 131 | 132 | $(GPU_OBJS): %.o: %.cpp 133 | $(HIPCC) $(HIPCCFLAGS) -c $< -o $@ 134 | endif 135 | 136 | 137 | $(info dsc build info: ) 138 | $(info OS: $(UNAME_S)) 139 | $(info ARCH: $(UNAME_M)) 140 | $(info CXX: $(shell $(CXX) --version | head -n 1)) 141 | $(info CXXFLAGS: $(CXXFLAGS)) 142 | 143 | ifdef DSC_CUDA 144 | $(info NVCC: $(shell $(NVCC) --version | head -n 4 | tail -n 1)) 145 | $(info NVCCFLAGS: $(NVCCFLAGS)) 146 | endif 147 | 148 | ifdef DSC_HIP 149 | $(info HIPCC: $(shell $(HIPCC) --version | head -n 1 | tail -n 1)) 150 | $(info HIPCCFLAGS: $(HIPCCFLAGS)) 151 | endif 152 | 153 | $(info LDFLAGS: $(LDFLAGS)) 154 | $(info ) 155 | 156 | SRCS = $(wildcard dsc/src/*.cpp) 157 | SRCS += $(wildcard dsc/src/cpu/*.cpp) 158 | OBJS += $(SRCS:.cpp=.o) 159 | 160 | SHARED_LIB = python/dsc/libdsc.so 161 | 162 | .PHONY: clean shared 163 | 164 | clean: 165 | rm -rf *.o *.so *.old $(OBJS) $(GPU_OBJS) $(SHARED_LIB) 166 | 167 | shared: $(OBJS) 168 | $(CXX) $(CXXFLAGS) -shared $(OBJS) -o $(SHARED_LIB) $(LDFLAGS) 169 | 170 | %.o: %.cpp 171 | $(CXX) $(CXXFLAGS) -c $< -o $@ 172 | -------------------------------------------------------------------------------- /dsc/include/cpu/dsc_ops.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | #include "dsc.h" 10 | #include 11 | 12 | struct cpu_cast_op { 13 | template 14 | DSC_INLINE DSC_STRICTLY_PURE Tout operator()(const Tin in) const { 15 | union { 16 | f32 f; 17 | u32 i; 18 | } u; 19 | 20 | // TODO: 64 bit conversion f64 -> bf16 21 | if constexpr (dsc_is_type()) { 22 | // Naive way of converting between BF16 and F32, if this has to be applied to a sequence of 23 | // elements it can be vectorized quite easily. 
24 | u.i = (u32) in << 16; 25 | return (Tout) u.f; 26 | } else if constexpr (dsc_is_type() && dsc_is_type()) { 27 | u.f = in; 28 | u.i >>= 16; 29 | return (Tout) u.i; 30 | } else { 31 | return (Tout) in; 32 | } 33 | } 34 | }; 35 | 36 | struct cpu_add_op { 37 | template 38 | DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 39 | if constexpr (dsc_is_type()) { 40 | return xa || xb; 41 | } else { 42 | return xa + xb; 43 | } 44 | } 45 | }; 46 | 47 | struct cpu_sub_op { 48 | template 49 | DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 50 | if constexpr (dsc_is_type()) { 51 | return xa ^ xb; 52 | } else { 53 | return xa - xb; 54 | } 55 | } 56 | }; 57 | 58 | struct cpu_mul_op { 59 | template 60 | DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 61 | if constexpr (dsc_is_type()) { 62 | return xa && xb; 63 | } else { 64 | return xa * xb; 65 | } 66 | } 67 | }; 68 | 69 | struct cpu_div_op { 70 | template 71 | DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 72 | return xa / xb; 73 | } 74 | }; 75 | 76 | struct cpu_pow_op { 77 | DSC_INLINE DSC_STRICTLY_PURE i32 operator()(const i32 base, const i32 exp) const { 78 | i32 acc = 1; 79 | for (int i = 0; i < exp; ++i) acc *= base; 80 | return acc; 81 | } 82 | 83 | DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 base, const f32 exp) const { 84 | return powf(base, exp); 85 | } 86 | 87 | DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 base, const f64 exp) const { 88 | return pow(base, exp); 89 | } 90 | }; 91 | 92 | struct cpu_cos_op { 93 | DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 94 | return cosf(x); 95 | } 96 | 97 | DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 98 | return cos(x); 99 | } 100 | }; 101 | 102 | struct cpu_sin_op { 103 | DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 104 | return sinf(x); 105 | } 106 | 107 | DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 108 | return sin(x); 109 | } 110 | }; 111 | 112 | struct cpu_tanh_op { 113 | DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 114 | return tanhf(x); 115 | } 116 | 117 | DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 118 | return tanh(x); 119 | } 120 | }; 121 | 122 | struct cpu_sqrt_op { 123 | DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 124 | return sqrtf(x); 125 | } 126 | 127 | DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 128 | return sqrt(x); 129 | } 130 | }; 131 | 132 | struct cpu_exp_op { 133 | DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 134 | return expf(x); 135 | } 136 | 137 | DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 138 | return exp(x); 139 | } 140 | }; 141 | 142 | struct cpu_max_op { 143 | template 144 | DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 145 | return DSC_MAX(xa, xb); 146 | } 147 | }; 148 | 149 | struct cpu_min_op { 150 | template 151 | DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 152 | return DSC_MIN(xa, xb); 153 | } 154 | }; 155 | 156 | struct cpu_eq_op { 157 | template 158 | DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 159 | return xa == xb; 160 | } 161 | }; 162 | 163 | struct cpu_ne_op { 164 | template 165 | DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 166 | return !cpu_eq_op()(xa, xb); 167 | } 168 | }; 169 | 170 | struct cpu_lt_op { 171 | template 
172 | DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 173 | return xa < xb; 174 | } 175 | }; 176 | struct cpu_le_op { 177 | template 178 | DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 179 | return xa <= xb; 180 | } 181 | }; 182 | 183 | struct cpu_gt_op { 184 | template 185 | DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 186 | return xa > xb; 187 | } 188 | }; 189 | 190 | struct cpu_ge_op { 191 | template 192 | DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 193 | return xa >= xb; 194 | } 195 | }; 196 | 197 | template 198 | consteval bool is_comparison_op() { 199 | return dsc_is_type() || 200 | dsc_is_type() || 201 | dsc_is_type() || 202 | dsc_is_type() || 203 | dsc_is_type() || 204 | dsc_is_type(); 205 | } 206 | 207 | template 208 | consteval bool is_bool_arith_op() { 209 | return dsc_is_type() || 210 | dsc_is_type() || 211 | dsc_is_type(); 212 | } -------------------------------------------------------------------------------- /dsc/include/gpu/dsc_tracing.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | #include "dsc.h" 10 | #include "dsc_tracing_common.h" 11 | 12 | #if defined(DSC_TRACING) 13 | 14 | #include "gpu/dsc_gpu.h" 15 | 16 | #undef DSC_INSERT_TYPED_TRACE 17 | #undef DSC_INSERT_NAMED_TRACE 18 | 19 | #define DSC_INSERT_TYPED_TRACE(DEV, T, type_, grid_dim_, block_dim_) \ 20 | dsc_gpu_trace_tracker trace__ { (DEV)->trace_ctx, __FUNCTION__, (type_), (grid_dim_), (block_dim_), &args__ } 21 | 22 | #define DSC_INSERT_NAMED_TRACE(DEV, T, type_, name_, grid_dim_, block_dim_) \ 23 | dsc_gpu_trace_tracker trace__ { (DEV)->trace_ctx, (name_), (type_), (grid_dim_), (block_dim_), &args__ } 24 | 25 | 26 | struct dsc_gpu_trace { 27 | dsc_trace_common base; 28 | 29 | gpu_event start_event, stop_event; 30 | dim3 grid_dim, block_dim; 31 | f32 elapsed_ms; 32 | 33 | DSC_INLINE bool to_eval() const { 34 | return this->elapsed_ms <= 0.f; 35 | } 36 | }; 37 | 38 | template 39 | struct dsc_gpu_trace_tracker { 40 | dsc_gpu_trace_tracker(dsc_trace_ctx *ctx, 41 | const char *name, 42 | const dsc_trace_type type, 43 | const dim3 grid_dim, 44 | const dim3 block_dim, 45 | const T *args) { 46 | using namespace internal::tracing; 47 | 48 | if (dsc_tracing_is_enabled()) { 49 | check_if_full(ctx); 50 | trace_ = next_empty_trace(ctx); 51 | fill_trace(&trace_->base, name, type, args); 52 | DSC_GPU_CHECK(gpu_event_create(&trace_->start_event)); 53 | DSC_GPU_CHECK(gpu_event_create(&trace_->stop_event)); 54 | trace_->grid_dim = grid_dim; 55 | trace_->block_dim = block_dim; 56 | trace_->elapsed_ms = 0.f; 57 | DSC_GPU_CHECK(gpu_event_record(trace_->start_event)); 58 | } 59 | } 60 | 61 | ~dsc_gpu_trace_tracker() { 62 | if (trace_) { 63 | DSC_GPU_CHECK(gpu_event_record(trace_->stop_event)); 64 | } 65 | } 66 | 67 | private: 68 | dsc_gpu_trace *trace_ = nullptr; 69 | }; 70 | 71 | static DSC_INLINE dsc_trace_ctx *dsc_gpu_tracing_init() { 72 | return internal::tracing::init(); 73 | } 74 | 75 | static DSC_INLINE void dsc_gpu_tracing_dispose(const dsc_trace_ctx *ctx) { 76 | internal::tracing::dispose(ctx); 77 | } 78 | 79 | static DSC_INLINE void dsc_gpu_tracing_dump(void *trace, FILE *json_file, 80 | const bool to_console, const bool to_json) 
{ 81 | dsc_gpu_trace *gpu_trace = (dsc_gpu_trace *) trace; 82 | 83 | if (gpu_trace->to_eval()) { 84 | // Make sure this is called once for each trace 85 | DSC_GPU_CHECK(gpu_event_synchronize(gpu_trace->stop_event)); 86 | DSC_GPU_CHECK(gpu_event_elapsed(&gpu_trace->elapsed_ms, gpu_trace->start_event,gpu_trace->stop_event)); 87 | } 88 | 89 | const dsc_trace_common *base = &gpu_trace->base; 90 | 91 | const f64 elapsed_ms = (f64) gpu_trace->elapsed_ms; 92 | const u64 elapsed_us = (u64) (elapsed_ms * 1e3); 93 | const f64 bandwidth = (f64) base->rw_bytes / (elapsed_ms * 1e-3 * DSC_GB(1)); 94 | 95 | if (to_console) { 96 | // So that we can align this 97 | char formatted_kernel_name[256]; 98 | snprintf(formatted_kernel_name, 256, 99 | "%s<(%d,%d,%d), (%d,%d,%d)>", 100 | base->name, 101 | gpu_trace->grid_dim.x, 102 | gpu_trace->grid_dim.y, 103 | gpu_trace->grid_dim.z, 104 | gpu_trace->block_dim.x, 105 | gpu_trace->block_dim.y, 106 | gpu_trace->block_dim.z); 107 | 108 | // Console dumping 109 | printf("*** [%ld] \033[38;5;208m%-12s\033[0m %-40s %.2fms (%6ldus)\t|\t%10.2fGB/s (%ldB)\n", 110 | base->ingestion_time_us, 111 | "GPU", 112 | formatted_kernel_name, 113 | elapsed_ms, 114 | elapsed_us, 115 | bandwidth, 116 | base->rw_bytes); 117 | } 118 | 119 | if (to_json) { 120 | fprintf(json_file, R"({"name":"%s","cat":"%s","ph":"X","ts":%ld,"dur":%ld,"pid":0,"tid":0)", 121 | base->name, 122 | DSC_TRACE_CATEGORY[base->type], 123 | base->ingestion_time_us, 124 | elapsed_us); 125 | fprintf(json_file, R"(,"args":{"bandwidth":"%.2fGB/s")", bandwidth); 126 | internal::tracing::dump_trace_base(json_file, base); 127 | fprintf(json_file, R"==(,"launch_config":{"grid":"(%d,%d,%d)","block":"(%d,%d,%d)"}})==", 128 | gpu_trace->grid_dim.x, 129 | gpu_trace->grid_dim.y, 130 | gpu_trace->grid_dim.z, 131 | gpu_trace->block_dim.x, 132 | gpu_trace->block_dim.y, 133 | gpu_trace->block_dim.z); 134 | fprintf(json_file, R"(})" ",\n"); 135 | } 136 | 137 | if (gpu_trace->to_eval()) { 138 | DSC_GPU_CHECK(gpu_event_destroy(gpu_trace->start_event)); 139 | DSC_GPU_CHECK(gpu_event_destroy(gpu_trace->stop_event)); 140 | } 141 | } 142 | 143 | static DSC_INLINE void dsc_gpu_next_trace(dsc_trace_ctx *ctx) { 144 | internal::tracing::advance_current_trace(ctx); 145 | } 146 | 147 | static DSC_INLINE void dsc_gpu_dump_json_metadata(FILE *json_file, void *extra_info) { 148 | const dsc_gpu_dev_info *dev_info = (dsc_gpu_dev_info *) extra_info; 149 | fprintf(json_file, R"({"name":"process_name","ph":"M","pid":%d,"tid":0,"args":{"name":"%s"},"process_sort_index":100})" ",\n", 150 | dev_info->dev_idx, 151 | dev_info->name); 152 | fprintf(json_file, R"({"name":"thread_name","ph":"M","pid":%d,"tid":0,"args":{"name":"Stream"},"thread_sort_index":101})" ",\n", 153 | dev_info->dev_idx); 154 | } 155 | 156 | #else 157 | 158 | static DSC_INLINE dsc_trace_ctx *dsc_gpu_tracing_init() { return nullptr; } 159 | static DSC_INLINE void dsc_gpu_tracing_dispose(const dsc_trace_ctx *) {} 160 | static DSC_INLINE void dsc_gpu_tracing_dump(void *, FILE *, bool, bool) {} 161 | static DSC_INLINE void dsc_gpu_next_trace(dsc_trace_ctx *) {} 162 | static DSC_INLINE void dsc_gpu_dump_json_metadata(FILE *, void *) {} 163 | 164 | #endif // DSC_TRACING -------------------------------------------------------------------------------- /dsc/include/cpu/dsc_cpu.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 
3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | #include "dsc.h" 10 | 11 | 12 | struct dsc_device; 13 | 14 | 15 | // ============================================================ 16 | // CPU-specific operations 17 | // 18 | 19 | extern void dsc_cpu_cast(dsc_device *dev, 20 | const dsc_tensor *DSC_RESTRICT x, 21 | dsc_tensor *DSC_RESTRICT out); 22 | 23 | extern void dsc_cpu_arange(dsc_device *dev, 24 | dsc_tensor *DSC_RESTRICT x, 25 | f64 start, f64 step); 26 | 27 | extern void dsc_cpu_repeat(dsc_device *dev, 28 | const dsc_tensor *DSC_RESTRICT x, 29 | dsc_tensor *DSC_RESTRICT out, 30 | int repeats, int axis_idx); 31 | 32 | extern void dsc_cpu_randn(dsc_device *dev, dsc_tensor *DSC_RESTRICT x); 33 | 34 | extern void dsc_cpu_kth(dsc_device *dev, 35 | const dsc_tensor *DSC_RESTRICT x, 36 | dsc_tensor *DSC_RESTRICT out, 37 | int k); 38 | 39 | extern void dsc_cpu_multinomial(dsc_device *dev, 40 | const dsc_tensor *DSC_RESTRICT x, 41 | dsc_tensor *DSC_RESTRICT out, 42 | int num_samples); 43 | 44 | extern void dsc_cpu_concat(dsc_device *dev, 45 | dsc_tensor **to_concat, 46 | int tensors, 47 | dsc_tensor *DSC_RESTRICT out, 48 | int axis_idx); 49 | 50 | extern void dsc_cpu_transpose(dsc_device *dev, 51 | const dsc_tensor *DSC_RESTRICT x, 52 | dsc_tensor *DSC_RESTRICT out, 53 | const int *new_shape, 54 | const int *new_stride); 55 | 56 | extern void dsc_cpu_tril(dsc_device *dev, 57 | const dsc_tensor *DSC_RESTRICT x, 58 | int diagonal, 59 | dsc_tensor *DSC_RESTRICT out); 60 | 61 | // ============================================================ 62 | // Indexing and Slicing 63 | // 64 | 65 | extern void dsc_cpu_get_slice(dsc_device *dev, 66 | const dsc_tensor *DSC_RESTRICT x, 67 | dsc_tensor *DSC_RESTRICT out, 68 | int n_slices, const dsc_slice *slices, 69 | bool whole); 70 | 71 | extern void dsc_cpu_set_slice(dsc_device *dev, 72 | dsc_tensor *DSC_RESTRICT xa, 73 | bool xa_scalar, 74 | const dsc_tensor *DSC_RESTRICT xb, 75 | bool xb_scalar, 76 | int n_slices, 77 | const dsc_slice *slices, 78 | bool whole); 79 | 80 | // ============================================================ 81 | // Binary Operations 82 | 83 | extern void dsc_cpu_add(dsc_device *dev, 84 | const dsc_tensor *xa, 85 | const dsc_tensor *xb, 86 | dsc_tensor *out); 87 | 88 | extern void dsc_cpu_sub(dsc_device *dev, 89 | const dsc_tensor *xa, 90 | const dsc_tensor *xb, 91 | dsc_tensor *out); 92 | 93 | extern void dsc_cpu_mul(dsc_device *dev, 94 | const dsc_tensor *xa, 95 | const dsc_tensor *xb, 96 | dsc_tensor *out); 97 | 98 | extern void dsc_cpu_div(dsc_device *dev, 99 | const dsc_tensor *xa, 100 | const dsc_tensor *xb, 101 | dsc_tensor *out); 102 | 103 | extern void dsc_cpu_pow(dsc_device *dev, 104 | const dsc_tensor *xa, 105 | const dsc_tensor *xb, 106 | dsc_tensor *out); 107 | 108 | extern void dsc_cpu_matmul(dsc_device *devdev, 109 | const dsc_tensor *DSC_RESTRICT xa, 110 | const dsc_tensor *DSC_RESTRICT xb, 111 | bool trans_b, 112 | dsc_tensor *DSC_RESTRICT out); 113 | 114 | extern void dsc_cpu_compare(dsc_device *dev, 115 | const dsc_tensor *xa, 116 | const dsc_tensor *xb, 117 | dsc_comparison_op comp, 118 | dsc_tensor *out); 119 | 120 | extern void dsc_cpu_masked_fill(dsc_device *dev, 121 | dsc_tensor *x, 122 | const dsc_tensor *mask, 123 | f64 value); 124 | 125 | extern void dsc_cpu_outer(dsc_device *dev, 126 | const dsc_tensor *DSC_RESTRICT xa, 127 | const dsc_tensor *DSC_RESTRICT xb, 128 | dsc_tensor *DSC_RESTRICT 
out); 129 | 130 | extern void dsc_cpu_where(dsc_device *dev, 131 | const dsc_tensor *DSC_RESTRICT condition, 132 | const dsc_tensor *DSC_RESTRICT input, 133 | const dsc_tensor *DSC_RESTRICT other, 134 | dsc_tensor *DSC_RESTRICT out); 135 | 136 | // ============================================================ 137 | // Unary Operations 138 | 139 | extern void dsc_cpu_cos(dsc_device *dev, 140 | const dsc_tensor *DSC_RESTRICT x, 141 | dsc_tensor *DSC_RESTRICT out); 142 | 143 | extern void dsc_cpu_sin(dsc_device *dev, 144 | const dsc_tensor *DSC_RESTRICT x, 145 | dsc_tensor *DSC_RESTRICT out); 146 | 147 | extern void dsc_cpu_tanh(dsc_device *devdev, 148 | const dsc_tensor *DSC_RESTRICT x, 149 | dsc_tensor *DSC_RESTRICT out); 150 | 151 | extern void dsc_cpu_exp(dsc_device *dev, 152 | const dsc_tensor *DSC_RESTRICT x, 153 | dsc_tensor *DSC_RESTRICT out); 154 | 155 | extern void dsc_cpu_sqrt(dsc_device *dev, 156 | const dsc_tensor *DSC_RESTRICT x, 157 | dsc_tensor *DSC_RESTRICT out); 158 | 159 | // ============================================================ 160 | // Unary Operations Along Axis 161 | 162 | extern void dsc_cpu_sum(dsc_device *dev, 163 | const dsc_tensor *DSC_RESTRICT x, 164 | dsc_tensor *DSC_RESTRICT out, 165 | int axis_idx); 166 | 167 | extern void dsc_cpu_min(dsc_device *dev, 168 | const dsc_tensor *DSC_RESTRICT x, 169 | dsc_tensor *DSC_RESTRICT out, 170 | int axis_idx); 171 | 172 | extern void dsc_cpu_max(dsc_device *dev, 173 | const dsc_tensor *DSC_RESTRICT x, 174 | dsc_tensor *DSC_RESTRICT out, 175 | int axis_idx); -------------------------------------------------------------------------------- /python/tests/test_ops_cpu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 
6 | 7 | import pytest 8 | from random import randint, random 9 | from utils_cpu import * 10 | 11 | 12 | @pytest.fixture(scope='session', autouse=True) 13 | def session_fixture(): 14 | # This is invoked once before starting the test session 15 | dsc.init(int(2**30)) 16 | dsc.set_default_device('cpu') 17 | yield 18 | 19 | 20 | @pytest.fixture(autouse=True) 21 | def teardown_fixture(): 22 | # This is invoked automatically after each test 23 | yield 24 | 25 | 26 | class TestOps: 27 | def test_binary(self): 28 | ops = { 29 | 'add': (np.add, dsc.add), 30 | 'sub': (np.subtract, dsc.sub), 31 | 'mul': (np.multiply, dsc.mul), 32 | 'div': (np.true_divide, dsc.true_div), 33 | 'power': (np.power, dsc.power), 34 | 'equal': (np.equal, dsc.equal), 35 | 'not_equal': (np.not_equal, dsc.not_equal), 36 | 'less': (np.less, dsc.less), 37 | 'less_equal': (np.less_equal, dsc.less_equal), 38 | 'greater': (np.greater, dsc.greater), 39 | 'greater_equal': (np.greater_equal, dsc.greater_equal), 40 | } 41 | for op_name in ops.keys(): 42 | np_op, dsc_op = ops[op_name] 43 | for dtype in DTYPES: 44 | if op_name == 'sub': 45 | np_op = np.bitwise_xor if is_bool(dtype) else np.subtract 46 | 47 | print(f'Testing operator {op_name} with {dtype.__name__}') 48 | shape = [randint(2, 10) for _ in range(4)] 49 | x = random_nd(shape, dtype=dtype) 50 | x_dsc = dsc.from_numpy(x) 51 | 52 | # Same shape 53 | y = random_nd(shape, dtype=dtype) 54 | y_dsc = dsc.from_numpy(y) 55 | 56 | res_np = np_op(x, y) 57 | res_dsc = dsc_op(x_dsc, y_dsc) 58 | r_res_np = np_op(y, x) 59 | r_res_dsc = dsc_op(y_dsc, x_dsc) 60 | assert all_close(res_dsc, res_np) 61 | assert all_close(r_res_dsc, r_res_np) 62 | 63 | # Broadcasting 64 | collapse_idx = randint(0, 3) 65 | shape[collapse_idx] = 1 66 | y_b = random_nd(shape, dtype=dtype) 67 | y_dsc_b = dsc.from_numpy(y_b) 68 | res_np_b = np_op(x, y_b) 69 | res_dsc_b = dsc_op(x_dsc, y_dsc_b) 70 | r_res_np_b = np_op(y_b, x) 71 | r_res_dsc_b = dsc_op(y_dsc_b, x_dsc) 72 | assert all_close(res_dsc_b, res_np_b) 73 | assert all_close(r_res_dsc_b, r_res_np_b) 74 | 75 | # Scalar 76 | if is_float(dtype): 77 | y_s = random() 78 | elif is_bool(dtype): 79 | y_s = bool(randint(0, 1)) 80 | else: 81 | y_s = randint(0, 10) 82 | 83 | res_np_s = np_op(x, y_s) 84 | res_dsc_s = dsc_op(x_dsc, y_s) 85 | r_res_np_s = np_op(y_s, x) 86 | r_res_dsc_s = dsc_op(y_s, x_dsc) 87 | 88 | assert all_close(res_dsc_s, res_np_s) 89 | assert all_close(r_res_dsc_s, r_res_np_s) 90 | 91 | def test_outer(self): 92 | for dtype in DTYPES: 93 | for _ in range(10): 94 | xa = random_nd([randint(2, 50)], dtype) 95 | xb = random_nd([randint(2, 50)], dtype) 96 | xa_dsc = dsc.from_numpy(xa) 97 | xb_dsc = dsc.from_numpy(xb) 98 | 99 | out = np.outer(xa, xb) 100 | out_dsc = dsc.outer(xa_dsc, xb_dsc) 101 | assert all_close(out_dsc, out) 102 | 103 | def test_matmul(self): 104 | def _mnk() -> tuple[int, int, int]: 105 | return randint(50, 100), randint(50, 100), randint(50, 100) 106 | 107 | def _test_matmul(shape_a: List[int], shape_b: List[int], dt: np.dtype): 108 | print(f'Testing {shape_a} @ {shape_b} with {dt.__name__}') 109 | xa = random_nd(shape_a, dtype=dt) 110 | xb = random_nd(shape_b, dtype=dt) 111 | xa_dsc = dsc.from_numpy(xa) 112 | xb_dsc = dsc.from_numpy(xb) 113 | 114 | res = xa @ xb 115 | res_dsc = xa_dsc @ xb_dsc 116 | assert all_close(res_dsc, res) 117 | 118 | for dtype in DSC_DTYPES: 119 | if is_bool(dtype): 120 | continue 121 | # 2D matrices 122 | for _ in range(5): 123 | m, n, k = _mnk() 124 | _test_matmul([m, k], [k, n], dtype) 125 | 126 | # Batched 
case 127 | for _ in range(5): 128 | batch_1, batch_2 = randint(2, 10), randint(2, 10) 129 | m, n, k = _mnk() 130 | _test_matmul([batch_1, batch_2, m, k], [batch_1, batch_2, k, n], dtype) 131 | 132 | # Batched case with broadcasting 133 | for batch_1 in range(1, 6): 134 | for batch_2 in range(1, 6): 135 | m, n, k = _mnk() 136 | _test_matmul([batch_1 if batch_1%2 == 0 else 1, 137 | batch_2 if batch_2%2 == 0 else 1, m, k], 138 | [batch_1 if batch_1%2 == 1 else 1, 139 | batch_2 if batch_2%2 == 1 else 1, k, n], 140 | dtype) 141 | 142 | def test_unary(self): 143 | ops = { 144 | 'sin': (np.sin, dsc.sin), 145 | 'cos': (np.cos, dsc.cos), 146 | 'tanh': (np.tanh, dsc.tanh), 147 | 'exp': (np.exp, dsc.exp), 148 | 'sqrt': (np.sqrt, dsc.sqrt), 149 | } 150 | for op_name in ops.keys(): 151 | np_op, dsc_op = ops[op_name] 152 | for dtype in DTYPES: 153 | print(f'Testing {op_name} with {dtype.__name__}') 154 | x = random_nd([randint(1, 10) for _ in range(4)], dtype=dtype) 155 | x_dsc = dsc.from_numpy(x) 156 | 157 | res_np = np_op(x) 158 | res_dsc = dsc_op(x_dsc) 159 | # There are precision issues when working with non-float types 160 | assert all_close(res_dsc, res_np, 1e-5 if is_float(dtype) else 1e-3) 161 | 162 | def test_unary_axis(self): 163 | ops = { 164 | 'sum': (np.sum, dsc.sum), 165 | 'mean': (np.mean, dsc.mean), 166 | 'var': (np.var, dsc.var), 167 | 'max': (np.max, dsc.max), 168 | 'min': (np.min, dsc.min), 169 | } 170 | for op_name in ops.keys(): 171 | np_op, dsc_op = ops[op_name] 172 | for dtype in DTYPES: 173 | for axis in range(-4, 4): 174 | print(f'Testing {op_name} with {dtype.__name__} along axis {axis}') 175 | x = random_nd( 176 | [randint(1, 10) for _ in range(4)], dtype=dtype 177 | ) 178 | x_dsc = dsc.from_numpy(x) 179 | 180 | res_np = np_op(x, axis=axis, keepdims=True) 181 | res_dsc = dsc_op(x_dsc, axis=axis, keepdims=True) 182 | assert all_close(res_dsc, res_np) 183 | 184 | res_np_2 = np_op(x, axis=axis, keepdims=False) 185 | res_dsc_2 = dsc_op(x_dsc, axis=axis, keepdims=False) 186 | assert all_close(res_dsc_2, res_np_2) 187 | -------------------------------------------------------------------------------- /python/dsc/nn/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | 7 | from ..tensor import Tensor, power, matmul, rsqrt 8 | from ..dtype import Dtype 9 | from ..device import Device 10 | from .._bindings import _dsc_new_tensor 11 | from ..context import _get_ctx 12 | from ..profiler import trace 13 | from typing import Iterator, Dict, Iterable, Any, Tuple, Callable, Optional, OrderedDict, Mapping, List 14 | from abc import ABC, abstractmethod 15 | from tqdm import tqdm 16 | from .utils import safe_load 17 | from . import functional 18 | 19 | 20 | class Parameter(Tensor): 21 | def __init__(self, shape: Tuple[int, ...], dtype: Dtype = Dtype.F32, device: Device = Device.DEFAULT, on_load: Optional[Callable[[Tensor], Tensor]] = None): 22 | # Parameters are lazy tensors (i.e. 
Tensors that don't have an underlying buffer) 23 | super().__init__(_dsc_new_tensor(_get_ctx(), shape, dtype, device, True)) 24 | self._on_load = on_load 25 | 26 | def load(self, x: Tensor): 27 | if self._on_load is not None: 28 | x = self._on_load(x) 29 | super().load(x) 30 | 31 | class Module(ABC): 32 | def __init__(self): 33 | super().__init__() 34 | self._parameters = {} 35 | self._modules = {} 36 | 37 | def register_parameter(self, name: str, param: Parameter): 38 | if name in self._parameters: 39 | raise RuntimeError(f'parameter "{name}" already registered') 40 | 41 | self._parameters[name] = param 42 | 43 | def register_module(self, name: str, module: 'Module'): 44 | if name in self._modules: 45 | raise RuntimeError(f'module "{name}" already registered') 46 | 47 | self._modules[name] = module 48 | 49 | def __setattr__(self, name: str, value: Any): 50 | if isinstance(value, Parameter): 51 | self.register_parameter(name, value) 52 | elif isinstance(value, Module): 53 | self.register_module(name, value) 54 | 55 | super().__setattr__(name, value) 56 | 57 | def parameters(self) -> Iterator[Parameter]: 58 | for param in self._parameters.values(): 59 | yield param 60 | for module in self._modules.values(): 61 | yield from module.parameters() 62 | 63 | def named_parameters(self, prefix: str = '') -> Iterator[Tuple[str, Parameter]]: 64 | for name, param in self._parameters.items(): 65 | yield prefix + ('.' if prefix else '') + name, param 66 | for module_name, module in self._modules.items(): 67 | submodule_prefix = prefix + ('.' if prefix else '') + module_name 68 | yield from module.named_parameters(submodule_prefix) 69 | 70 | def state_dict(self) -> OrderedDict[str, Parameter]: 71 | res = OrderedDict[str, Parameter]() 72 | for name, param in self.named_parameters(): 73 | res[name] = param 74 | return res 75 | 76 | def from_state(self, state_dict: Dict[str, Tensor], 77 | on_hook: Optional[List[Tuple[List[str], Callable[[Tensor], Tensor]]]] = None, 78 | tied: Optional[Dict[str, str]] = None): 79 | with tqdm(total=len(state_dict), desc='Loading model parameters') as pbar: 80 | for name, param in self.named_parameters(): 81 | real_name = name 82 | if tied is not None and name in tied: 83 | name = tied[name] 84 | 85 | if name not in state_dict: 86 | pbar.write(f'{name} not found in DSC model') 87 | pbar.update(1) 88 | continue 89 | 90 | tensor = state_dict[name] 91 | if on_hook: 92 | # on_hook defines transformations on tensors that are called before loading the tensors in DSC 93 | for keys, hook in on_hook: 94 | # If any of the keys starts with 'name' I'll apply the hook 95 | if any(name.endswith(key) for key in keys): 96 | tensor = hook(tensor) 97 | 98 | pbar.set_description(f'{real_name if real_name == name else f"{real_name} (tied to {name})"} {tensor.shape} {tensor.dtype}') 99 | param.load(tensor) 100 | pbar.update(1) 101 | 102 | @abstractmethod 103 | def forward(self, *args, **kwargs): 104 | pass 105 | 106 | def __call__(self, *args, **kwargs): 107 | return self.forward(*args, **kwargs) 108 | 109 | class ModuleList(Module): 110 | def __init__(self, modules: Iterable[Module]): 111 | super().__init__() 112 | for i, module in enumerate(modules): 113 | self.register_module(str(i), module) 114 | 115 | def __len__(self) -> int: 116 | return len(self._modules) 117 | 118 | def __iter__(self) -> Iterator[Module]: 119 | return iter(self._modules.values()) 120 | 121 | def forward(self): 122 | raise NotImplementedError('forward() is not supported in ModuleList') 123 | 124 | class ModuleDict(Module): 
125 | def __init__(self, modules: Mapping[str, Module]): 126 | super().__init__() 127 | for name, module in modules.items(): 128 | setattr(self, name, module) 129 | 130 | def __len__(self) -> int: 131 | return len(self._modules) 132 | 133 | def __iter__(self) -> Iterator[str]: 134 | return iter(self._modules) 135 | 136 | def forward(self): 137 | raise NotImplementedError('forward() is not supported in ModuleDict') 138 | 139 | class Linear(Module): 140 | def __init__(self, in_features: int, out_features: int, bias: bool = True, dtype: Dtype = Dtype.F32): 141 | super().__init__() 142 | self.weight = Parameter((out_features, in_features), dtype=dtype) 143 | self.bias = Parameter((out_features, ), dtype=dtype) if bias else None 144 | 145 | @trace('Linear') 146 | def forward(self, x: Tensor) -> Tensor: 147 | out = matmul(x, self.weight, trans_b=True) 148 | if self.bias: 149 | out += self.bias 150 | return out 151 | 152 | class LayerNorm(Module): 153 | def __init__(self, n_features: int, epsilon: float = 1e-5, dtype: Dtype = Dtype.F32): 154 | super().__init__() 155 | self.epsilon = epsilon 156 | self.weight = Parameter((n_features, ), dtype=dtype) 157 | self.bias = Parameter((n_features, ), dtype=dtype) 158 | 159 | @trace('LayerNorm') 160 | def forward(self, x: Tensor) -> Tensor: 161 | mean = x.mean(-1, keepdims=True) 162 | var = x.var(-1, keepdims=True) 163 | 164 | out = (x - mean) / (var + self.epsilon) ** 0.5 165 | out = out * self.weight + self.bias 166 | 167 | return out 168 | 169 | class RMSNorm(Module): 170 | def __init__(self, in_shape: int, epsilon: float = 1e-6, dtype: Dtype = Dtype.F32): 171 | super().__init__() 172 | self.epsilon = epsilon 173 | self.weight = Parameter((in_shape, ), dtype=dtype) 174 | 175 | @trace('RMSNorm') 176 | def forward(self, x: Tensor) -> Tensor: 177 | var = power(x, 2).mean(-1, keepdims=True) 178 | out = x * rsqrt(var + self.epsilon) 179 | return out * self.weight 180 | 181 | class Embedding(Module): 182 | def __init__(self, num_embeddings: int, embedding_size: int, dtype: Dtype = Dtype.F32): 183 | super().__init__() 184 | self.weight = Parameter((num_embeddings, embedding_size), dtype=dtype) 185 | 186 | @trace('Embedding') 187 | def forward(self, x: Tensor) -> Tensor: 188 | return self.weight[x] 189 | -------------------------------------------------------------------------------- /dsc/include/dsc_device.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 
6 | 7 | #pragma once 8 | 9 | #include "dsc.h" 10 | #include 11 | 12 | #define dsc_node_is_free(PTR) ((PTR)->data == nullptr && (PTR)->next == nullptr && (PTR)->size == 0) 13 | #define dsc_node_mark_free(PTR) \ 14 | do { \ 15 | (PTR)->data = nullptr; \ 16 | (PTR)->next = nullptr; \ 17 | (PTR)->size = 0; \ 18 | } while (0) 19 | 20 | struct dsc_data_buffer { 21 | void *data; 22 | usize size; 23 | int refs; 24 | }; 25 | 26 | struct dsc_free_node { 27 | void *data; 28 | dsc_free_node *next; 29 | usize size; 30 | }; 31 | 32 | enum dsc_memcpy_dir : u8 { 33 | UNUSED, 34 | FROM_DEVICE, 35 | TO_DEVICE, 36 | ON_DEVICE 37 | }; 38 | 39 | 40 | static constexpr dsc_memcpy_dir DSC_MEMCPY_DIRECTIONS_LOOKUP[DSC_MAX_DEVICES][DSC_MAX_DEVICES] = { 41 | {UNUSED, TO_DEVICE}, 42 | {FROM_DEVICE, ON_DEVICE}, 43 | }; 44 | 45 | 46 | struct dsc_device { 47 | dsc_data_buffer used_nodes[DSC_MAX_OBJS]; 48 | dsc_free_node free_nodes[DSC_MAX_OBJS]; 49 | dsc_free_node *head; 50 | void *device_mem; 51 | usize alignment; 52 | 53 | // Extra device-specific infos 54 | void *extra_info; 55 | 56 | dsc_trace_ctx *trace_ctx; 57 | 58 | usize mem_size, used_mem; 59 | dsc_device_type type; 60 | 61 | void (*memcpy) (void *dst, const void *src, usize nb, dsc_memcpy_dir dir); 62 | void (*memset) (void *dst, int c, usize nb); 63 | void (*dispose) (dsc_device *dev); 64 | 65 | // Iterator method to get to the next trace. Will update trace_ctx->current_trace 66 | void (*next_trace) (dsc_trace_ctx *ctx); 67 | void (*dump_trace) (void *trace, FILE *json_file, bool to_console, bool to_json); 68 | // Dump device-specific json metadata events (i.e. set processor name). This is purely cosmetic 69 | void (*dump_json_metadata) (FILE *json_file, void *extra_info); 70 | }; 71 | 72 | namespace internal::alloc { 73 | DSC_INLINE dsc_free_node *find_best(dsc_device *dev, 74 | const usize required_size, 75 | dsc_free_node **prev) { 76 | dsc_free_node *node = dev->head; 77 | dsc_free_node *best = node->size >= required_size ? 
node : nullptr; 78 | dsc_free_node *prev_node = nullptr; 79 | 80 | while (node->next != nullptr) { 81 | if (node->next->size >= required_size && 82 | (best == nullptr || best->size >= node->next->size)) { 83 | prev_node = node; 84 | best = node->next; 85 | } 86 | node = node->next; 87 | } 88 | 89 | *prev = prev_node; 90 | 91 | return best; 92 | } 93 | 94 | DSC_INLINE void node_insert(dsc_free_node **head, 95 | dsc_free_node *prev, 96 | dsc_free_node *to_insert) { 97 | if (prev == nullptr) { 98 | if (*head != nullptr) { 99 | to_insert->next = *head; 100 | } 101 | *head = to_insert; 102 | } else { 103 | if (prev->next == nullptr) { 104 | prev->next = to_insert; 105 | to_insert->next = nullptr; 106 | } else { 107 | to_insert->next = prev->next; 108 | prev->next = to_insert; 109 | } 110 | } 111 | } 112 | 113 | DSC_INLINE void node_remove(dsc_free_node **head, 114 | dsc_free_node *prev, 115 | dsc_free_node *to_remove) { 116 | if (prev == nullptr) { 117 | *head = to_remove->next; 118 | } else { 119 | prev->next = to_remove->next; 120 | } 121 | 122 | dsc_node_mark_free(to_remove); 123 | } 124 | 125 | DSC_INLINE dsc_free_node *next_free_node(dsc_device *dev) { 126 | for (int i = 0; i < DSC_MAX_OBJS; ++i) { 127 | if (dsc_free_node *bin = &dev->free_nodes[i]; dsc_node_is_free(bin)) { 128 | return bin; 129 | } 130 | } 131 | return nullptr; 132 | } 133 | } 134 | 135 | using namespace internal::alloc; 136 | 137 | static DSC_INLINE dsc_data_buffer *dsc_data_alloc(dsc_device *dev, usize nb) { 138 | DSC_ASSERT(dev != nullptr); 139 | DSC_ASSERT(nb > 0); 140 | 141 | nb = DSC_ALIGN(nb, dev->alignment); 142 | 143 | dsc_free_node *prev = nullptr; 144 | dsc_free_node *node = find_best(dev, nb, &prev); 145 | if (node == nullptr) { 146 | DSC_LOG_FATAL("error allocating %.2fKB on %s", DSC_B_TO_KB(nb), DSC_DEVICE_NAMES[dev->type]); 147 | } 148 | 149 | if (const usize left = node->size - nb; left >= dev->alignment) { 150 | dsc_free_node *new_node = next_free_node(dev); 151 | if (new_node == nullptr) { 152 | DSC_LOG_FATAL("memory reached critical fragmentation!"); 153 | } 154 | 155 | node->size = nb; 156 | new_node->size = left; 157 | // The data for the new bin starts after the previous one 158 | new_node->data = (byte *) node->data + node->size; 159 | node_insert(&dev->head, node, new_node); 160 | } 161 | 162 | dsc_data_buffer *data_buf = nullptr; 163 | for (int i = 0; i < DSC_MAX_OBJS; ++i) { 164 | if (dsc_data_buffer *buf = &dev->used_nodes[i]; buf->refs == 0) { 165 | data_buf = buf; 166 | break; 167 | } 168 | } 169 | if (!data_buf) { 170 | DSC_LOG_FATAL("can't allocate any more objects!"); 171 | } 172 | 173 | data_buf->data = node->data; 174 | data_buf->refs = 1; 175 | data_buf->size = node->size; 176 | dev->used_mem += nb; 177 | 178 | node_remove(&dev->head, prev, node); 179 | 180 | return data_buf; 181 | } 182 | 183 | static DSC_INLINE void dsc_data_free(dsc_device *dev, dsc_data_buffer *ptr) { 184 | DSC_ASSERT(dev != nullptr); 185 | DSC_ASSERT(ptr != nullptr); 186 | DSC_ASSERT(ptr->refs > 0); 187 | 188 | ptr->refs--; 189 | 190 | if (ptr->refs > 0) return; 191 | 192 | DSC_LOG_DEBUG("%p will be freed", ptr); 193 | 194 | const uintptr_t ptr_addr = (uintptr_t) ptr->data; 195 | dsc_free_node *new_node = next_free_node(dev); 196 | 197 | dsc_free_node *node = dev->head, *prev = nullptr; 198 | while (node != nullptr) { 199 | if (const uintptr_t node_addr = (uintptr_t) node->data; ptr_addr < node_addr) { 200 | new_node->size = ptr->size; 201 | new_node->next = nullptr; 202 | new_node->data = ptr->data; 203 | 
node_insert(&dev->head, prev, new_node); 204 | break; 205 | } 206 | 207 | prev = node; 208 | node = node->next; 209 | } 210 | 211 | dev->used_mem -= new_node->size; 212 | 213 | // Coalescence 214 | if (new_node->next != nullptr && 215 | (uintptr_t) ((byte *) new_node->data + new_node->size) == (uintptr_t) new_node->next->data) { 216 | new_node->size += new_node->next->size; 217 | node_remove(&dev->head, new_node, new_node->next); 218 | } 219 | 220 | if (prev != nullptr && prev->next != nullptr && 221 | (uintptr_t) ((byte *) prev->data + prev->size) == (uintptr_t) new_node->data) { 222 | prev->size += new_node->size; 223 | node_remove(&dev->head, prev, new_node); 224 | } 225 | } 226 | 227 | extern dsc_device *dsc_cpu_device(usize mem_size); 228 | 229 | #if defined(DSC_CUDA) || defined(DSC_HIP) 230 | extern dsc_device *dsc_gpu_device(usize mem_size, int dev_idx); 231 | #else 232 | static DSC_INLINE dsc_device *dsc_gpu_device(usize, int) { 233 | return nullptr; 234 | } 235 | #endif 236 | 237 | #undef dsc_node_is_free 238 | #undef dsc_node_mark_free -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | Logo 3 | 4 |

5 | DSC 6 |

7 | 8 | [![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) 9 | [![Unit Tests](https://github.com/nirw4nna/dsc/actions/workflows/tests.yml/badge.svg)](https://github.com/nirw4nna/dsc/actions/workflows/tests.yml) 10 | 11 |
12 | 13 | --- 14 | 15 | ## About 16 | DSC is a PyTorch-compatible tensor library and inference framework for machine learning models. 17 | It features a C-compatible low-level API that is wrapped in a modern Python API very similar to NumPy / PyTorch but 18 | with some nice usability improvements. 19 | 20 | 21 | Some key features of DSC include: 22 | - **Intuitive API**: DSC Python API closely resembles NumPy / PyTorch. 23 | 24 | 25 | - **Built-in neural networks support**: DSC comes with `nn.Module` built-in. Porting a model from PyTorch to DSC 26 | is trivial (check out the [examples](https://github.com/nirw4nna/dsc/tree/main/examples/models)). 27 | 28 | 29 | - **Multiple backends**: DSC supports both **CPU** and **CUDA** with other backends being worked on. 30 | Programs written using DSC can seamlessly switch between backends by simply adding a `dsc.set_default_device('...')` 31 | instruction, no changes needed. 32 | 33 | 34 | - **Minimal external dependencies**: DSC doesn't require external libraries to be efficient. 35 | On CPU the core operations are written from scratch in portable C++, this makes code written using DSC extremely portable. 36 | 37 | 38 | - **No runtime allocations**: DSC has its own custom memory allocator, memory is pre-allocated 39 | only once so no extra calls to `malloc()` or `free()` are required. It's also possible 40 | to switch to a linear allocator to remove the (minimal) overhead introduced by a general purpose allocator. 41 | 42 | 43 | --- 44 | 45 | 46 | ## Quick start 47 | Getting started with DSC is very simple. The only requirements are: 48 | - A compiler with good support for C++20 49 | - GNU Make for building 50 | 51 | On a Linux-based system these can be obtained with: 52 | ```shell 53 | sudo apt update 54 | sudo apt install build-essential 55 | ``` 56 | 57 | ### Installation 58 | The recommended way to install DSC is from source: 59 | ```shell 60 | git clone git@github.com:nirw4nna/dsc.git 61 | cd dsc/ 62 | python3 -m venv venv 63 | source venv/bin/activate 64 | python3 -m pip install -e . 65 | ``` 66 | 67 | To build the C++ library: 68 | ```shell 69 | make clean; make shared DSC_FAST=1 70 | ``` 71 | This will compile DSC without any debug information, you can specify different options 72 | to enable/disable specific features: 73 | 74 | | Option | Description | 75 | |---------------|------------------------------------------------------------------------------| 76 | | DSC_LOG_LEVEL | Configure the logging level (values: [0-3] with 0 meaning everything on) | 77 | | DSC_FAST | Turn off logging (level=2) and compile with the highest optimisation level | 78 | | DSC_GPU | Enable GPU support (**default=0**) | 79 | | DSC_MAX_OBJS | Max number of DSC tensors that can be used at the same time (**default=1K**) | 80 | | DSC_TRACING | Enable tracing (**default=0**) | 81 | 82 | To verify that everything worked out as expected try a simple operation: 83 | ```shell 84 | python3 -c "import dsc; x = dsc.arange(10); print(x)" 85 | ``` 86 | 87 | ### Environment Variables 88 | | Variable | Description | 89 | |-----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 90 | | DSC_NUM_THREADS | Use multiple threads (CPU). If you set it to -1 it will use half of your available cores (**default=1**) | 91 | | TRACE | Enable tracing DSC kernels. 
Values: [0-3] with 0 meaning no tracing, 1 dump only to the console, 2 dump only as Perfetto-compatible json, 3 dump as both console and Perfetto. This option requires DSC to be compiled with tracing support enabled (**default=0**) | 92 | 93 | ### Notes on GPU support 94 | DSC supports both AMD and NVIDIA GPUs. If compiled with `DSC_GPU=1` it will automatically detect the appropriate backend. 95 | You can see which backend has been selected by checking the output of the Makefile or, once the compilation is done, 96 | use the Python API: 97 | ```python 98 | import dsc 99 | 100 | if dsc.gpu.is_available(): # If a GPU backend has been detected you can check if it's ROCm or CUDA 101 | dsc.gpu.is_rocm() 102 | dsc.gpu.is_cuda() 103 | ``` 104 | 105 | ### CUDA backend 106 | This provides GPU acceleration on NVIDIA GPUs. To get started make sure to have the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) 107 | installed. 108 | 109 | To build the C++ library with CUDA enabled simply specify `DSC_GPU=1`. CUDA will be detected automatically if you installed it. 110 | 111 | **Note:** if you see errors when compiling with CUDA support make sure that the CUDA installation path specified in the Makefile 112 | is correct. If this is not the case you have to manually update the Makefile or set the `CUDA` environment variable before calling `make`. 113 | 114 | To verify that the CUDA backend is working try: 115 | ```shell 116 | python3 -c "import dsc; print(dsc.gpu.is_available() and dsc.gpu.is_cuda())" 117 | ``` 118 | 119 | ### HIP backend 120 | This provides GPU acceleration on AMD GPUs. To get started make sure to have the [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html#rocm-install-quick) 121 | installed. 122 | 123 | To build the C++ library with ROCm enabled simply specify `DSC_GPU=1`. ROCm will be detected automatically if you installed it. 124 | 125 | **Note:** if you see errors when compiling with ROCm support make sure that the ROCm installation path specified in the Makefile 126 | is correct. If this is not the case you have to manually update the Makefile or set the `ROCM` environment variable before calling `make`. 127 | 128 | To verify that the ROCm backend is working try: 129 | ```shell 130 | python3 -c "import dsc; print(dsc.gpu.is_available() and dsc.gpu.is_rocm())" 131 | ``` 132 | 133 | ## Setting a default device 134 | The default device in DSC is the CPU. This means that, if you don't specify anything, all the operations will be 135 | performed on the CPU even when a GPU device is available. To set a different device as default you can use 136 | ```python 137 | dsc.set_default_device('gpu') 138 | ``` 139 | This will make the GPU the default device and DSC will perform all the operations there by default. 140 | 141 | ## Running tests 142 | DSC uses `pytest` to run unit tests against NumPy which is the reference for correctness. 143 | 144 | The tests are structured as follows: 145 | - `test_ops_common` and `test_indexing` are used to test operations both on CPU and GPU using NumPy as reference 146 | - `test_ops_cpu` are CPU-specific 147 | - `test_ops_gpu` are GPU-specific and they use PyTorch as reference 148 | 149 | The device on which tests are run can be configured by setting the environment variable `DSC_DEVICE` before calling pytest. 150 | 151 | **Note:** to use PyTorch with a ROCm-compatible GPU please refer to https://pytorch.org/get-started/locally/. 
152 | 153 | To run all the tests simple do: 154 | ```bash 155 | cd python/tests/ 156 | pytest -s .py --no-header --no-summary -q 157 | ``` 158 | **Note:** there are quite a few tests so to run them it's better to compile DSC with `DSC_FAST=1`. 159 | 160 | ## License 161 | BSD-3-Clause 162 | -------------------------------------------------------------------------------- /dsc/include/cpu/dsc_tracing.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | #include "dsc.h" 10 | #include "dsc_tracing_common.h" 11 | 12 | #if defined(DSC_TRACING) 13 | 14 | #include // getpid() 15 | #include // pthread_self() 16 | 17 | 18 | #undef DSC_INSERT_TYPED_TRACE 19 | #undef DSC_INSERT_NAMED_TRACE 20 | 21 | #define DSC_INSERT_TYPED_TRACE(DEV, T, type_) \ 22 | dsc_cpu_trace_tracker trace__ { (DEV)->trace_ctx, __FUNCTION__, (type_), &args__ } 23 | 24 | #define DSC_INSERT_NAMED_TRACE(DEV, T, type_, name_) \ 25 | dsc_cpu_trace_tracker trace__ { (DEV)->trace_ctx, (name_), (type_), &args__ } 26 | 27 | 28 | struct dsc_cpu_trace { 29 | dsc_trace_common base; 30 | 31 | u64 tid, start_us, stop_us; 32 | int pid; 33 | }; 34 | 35 | template 36 | struct dsc_cpu_trace_tracker { 37 | dsc_cpu_trace_tracker(dsc_trace_ctx *ctx, 38 | const char *name, 39 | const dsc_trace_type type, 40 | const T *args) { 41 | using namespace internal::tracing; 42 | 43 | // Memory allocations are CPU-only 44 | static const bool filter_alloc = dsc_get_env("TRACE_ALLOC", 0) == 0; 45 | if (dsc_tracing_is_enabled() && (!filter_alloc || (type != DSC_TENSOR_ALLOC && type != DSC_TENSOR_FREE))) { 46 | check_if_full(ctx); 47 | trace_ = next_empty_trace(ctx); 48 | fill_trace(&trace_->base, name, type, args); 49 | trace_->pid = getpid(); 50 | trace_->tid = pthread_self(); 51 | trace_->start_us = time_us(); 52 | } 53 | } 54 | 55 | ~dsc_cpu_trace_tracker() { 56 | using namespace internal::tracing; 57 | if (trace_) { 58 | trace_->stop_us= time_us(); 59 | } 60 | } 61 | 62 | private: 63 | dsc_cpu_trace *trace_ = nullptr; 64 | }; 65 | 66 | static DSC_INLINE dsc_trace_ctx *dsc_cpu_tracing_init() { 67 | return internal::tracing::init(); 68 | } 69 | 70 | static DSC_INLINE void dsc_cpu_tracing_dispose(const dsc_trace_ctx *ctx) { 71 | internal::tracing::dispose(ctx); 72 | } 73 | 74 | static DSC_INLINE void dsc_cpu_tracing_dump(void *trace, FILE *json_file, 75 | const bool to_console, const bool to_json) { 76 | static constexpr const char *COLOR_NONE = "\033[0m"; 77 | static constexpr const char *COLOR_CUSTOM = "\033[38;5;51m"; // cyan 78 | static constexpr const char *COLOR_COPY = "\033[38;5;201m"; // deep magenta 79 | 80 | const dsc_cpu_trace *cpu_trace = (dsc_cpu_trace *) trace; 81 | 82 | const dsc_trace_common *base = &cpu_trace->base; 83 | const u64 elapsed_us = cpu_trace->stop_us - cpu_trace->start_us; 84 | const f64 bandwidth = (f64) base->rw_bytes / ((f64) elapsed_us * 1e-6 * DSC_GB(1)); 85 | 86 | if (to_console) { 87 | char device_str[16]; 88 | if (base->type == DSC_COPY_OP) { 89 | snprintf(device_str, sizeof(device_str), "%s <- %s", 90 | DSC_DEVICE_NAMES[base->copy.x.device], 91 | DSC_DEVICE_NAMES[base->copy.data_device]); 92 | } else if (base->type == DSC_TO_OP) { 93 | snprintf(device_str, sizeof(device_str), "%s <- %s", 94 | DSC_DEVICE_NAMES[base->to.new_device], 95 | 
DSC_DEVICE_NAMES[base->to.x.device]); 96 | } else if (base->type == DSC_GET_IDX) { 97 | snprintf(device_str, sizeof(device_str), "%s <- %s", 98 | DSC_DEVICE_NAMES[base->get_idx.x.device], 99 | DSC_DEVICE_NAMES[base->get_idx.x.device]); 100 | } else if (base->type == DSC_GET_TENSOR) { 101 | snprintf(device_str, sizeof(device_str), "%s <- %s", 102 | DSC_DEVICE_NAMES[base->get_tensor.x.device], 103 | DSC_DEVICE_NAMES[base->get_tensor.x.device]); 104 | } else { 105 | snprintf(device_str, sizeof(device_str), "CPU"); 106 | } 107 | 108 | const char *ansi_color_1 = "", *ansi_color_2 = ""; 109 | switch (base->type) { 110 | case DSC_COPY_OP: 111 | case DSC_TO_OP: 112 | case DSC_GET_IDX: 113 | case DSC_GET_TENSOR: 114 | ansi_color_1 = COLOR_COPY; 115 | ansi_color_2 = COLOR_NONE; 116 | break; 117 | case DSC_TRACE_CUSTOM: 118 | ansi_color_1 = COLOR_CUSTOM; 119 | ansi_color_2 = COLOR_NONE; 120 | break; 121 | default: 122 | break; 123 | } 124 | 125 | printf("*** [%ld] %-12s %s%-40s%s %.2fms (%6ldus)\t|", 126 | base->ingestion_time_us, 127 | device_str, 128 | ansi_color_1, 129 | base->name, 130 | ansi_color_2, 131 | (f64) elapsed_us * 1e-3, 132 | elapsed_us); 133 | 134 | // Don't show bandwidth for custom traces 135 | if (base->type != DSC_TRACE_CUSTOM && base->type != DSC_TENSOR_ALLOC && base->type != DSC_TENSOR_FREE) { 136 | printf("\t%10.2fGB/s (%ldB)", 137 | bandwidth, 138 | base->rw_bytes); 139 | } 140 | 141 | printf("\n"); 142 | } 143 | 144 | if (to_json) { 145 | fprintf(json_file, R"({"name":"%s","cat":"%s","ph":"X","ts":%ld,"dur":%ld,"pid":%d,"tid":%ld)", 146 | base->name, 147 | DSC_TRACE_CATEGORY[base->type], 148 | base->ingestion_time_us, 149 | elapsed_us, 150 | cpu_trace->pid, 151 | cpu_trace->tid); 152 | 153 | fprintf(json_file, R"(,"args":{)"); 154 | 155 | if (base->type != DSC_TRACE_CUSTOM) fprintf(json_file, R"("bandwidth":"%.2fGB/s")", bandwidth); 156 | 157 | internal::tracing::dump_trace_base(json_file, base); 158 | fprintf(json_file, R"(}})" ",\n"); 159 | } 160 | } 161 | 162 | static DSC_INLINE void dsc_cpu_next_trace(dsc_trace_ctx *ctx) { 163 | internal::tracing::advance_current_trace(ctx); 164 | } 165 | 166 | static DSC_INLINE void dsc_cpu_dump_json_metadata(FILE *json_file, void *) { 167 | fprintf(json_file, R"({"name":"process_name","ph":"M","pid":%d,"tid":%ld,"args":{"name":"CPU"},"process_sort_index":0})" ",\n", getpid(), pthread_self()); 168 | fprintf(json_file, R"({"name":"thread_name","ph":"M","pid":%d,"tid":%ld,"args":{"name":"Main Thread"},"thread_sort_index":1})" ",\n", getpid(), pthread_self()); 169 | } 170 | 171 | static DSC_INLINE void dsc_cpu_insert_user_trace(dsc_trace_ctx *ctx, 172 | const char *name, 173 | const u64 start, 174 | const u64 duration) { 175 | if (!dsc_tracing_is_enabled()) return; 176 | 177 | dsc_cpu_trace *trace = internal::tracing::next_empty_trace(ctx); 178 | internal::tracing::fill_trace(&trace->base, name, DSC_TRACE_CUSTOM); 179 | // For user-generated traces the ingestion time must be inserted manually 180 | trace->base.ingestion_time_us = start; 181 | trace->start_us = start; 182 | trace->stop_us = start + duration; 183 | // NOTE: maybe it's a good idea to put these kind of events on a separate pid/tid? 
184 | trace->pid = getpid(); 185 | trace->tid = pthread_self(); 186 | } 187 | 188 | #else 189 | 190 | static DSC_INLINE dsc_trace_ctx *dsc_cpu_tracing_init() { return nullptr; } 191 | static DSC_INLINE void dsc_cpu_tracing_dispose(const dsc_trace_ctx *) {} 192 | static DSC_INLINE void dsc_cpu_tracing_dump(void *, FILE *, bool, bool) {} 193 | static DSC_INLINE void dsc_cpu_next_trace(dsc_trace_ctx *) {} 194 | static DSC_INLINE void dsc_cpu_dump_json_metadata(FILE *, void *) {} 195 | static DSC_INLINE void dsc_cpu_insert_user_trace(dsc_trace_ctx *, const char *, const u64, const u64) {} 196 | 197 | #endif // DSC_TRACING -------------------------------------------------------------------------------- /python/tests/test_indexing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | 7 | import pytest 8 | from utils_cpu import * 9 | from random import randint, random 10 | from typing import List 11 | import math 12 | 13 | 14 | @pytest.fixture(scope='session', autouse=True) 15 | def session_fixture(): 16 | # This is invoked once before starting the test session 17 | dsc.init(int(2**30)) 18 | print(f'Running tests on {DEVICE}') 19 | dsc.set_default_device(DEVICE) 20 | yield 21 | 22 | 23 | @pytest.fixture(autouse=True) 24 | def teardown_fixture(): 25 | # This is invoked automatically after each test 26 | yield 27 | 28 | 29 | class TestIndexing: 30 | def test_get_idx(self): 31 | # The idea is to start with 1D tensors and then, for all dtypes, test with a growing number of indexes 32 | # from 1 up to the number of dimensions (to select a scalar value). Given the number of indexes we generate 33 | # a bunch of random pairs to try and cover most use cases. 
34 | for n_dim in range(4): 35 | for dtype in DTYPES: 36 | x = random_nd([10 for _ in range(n_dim + 1)], dtype=dtype) 37 | x_dsc = dsc.from_numpy(x) 38 | 39 | for indexes in range(n_dim + 1): 40 | for _ in range(10): 41 | idx = tuple(randint(-10, 9) for _ in range(indexes + 1)) 42 | res = x[idx] 43 | res_dsc = x_dsc[idx] 44 | if isinstance(res_dsc, dsc.Tensor): 45 | assert all_close(res_dsc, res) 46 | else: 47 | assert np.isclose(res, res_dsc) 48 | def test_get_tensor(self): 49 | for dtype in DTYPES: 50 | rows = randint(1, 100) 51 | cols = randint(1, 100) 52 | x = random_nd([rows, cols], dtype=dtype) 53 | x_dsc = dsc.from_numpy(x) 54 | 55 | indexes = np.array([randint(0, rows - 1) for _ in range(randint(1, rows))]).astype(np.int32) 56 | # Indexes are always on CPU 57 | indexes_dsc = dsc.from_numpy(indexes, device='cpu') 58 | 59 | res = x[indexes] 60 | res_dsc = x_dsc[indexes_dsc] 61 | assert all_close(res_dsc, res) 62 | 63 | 64 | @staticmethod 65 | def _validate_slice(sl: slice, max_dim: int) -> bool: 66 | s_start = sl.start 67 | s_stop = sl.stop 68 | s_step = sl.step 69 | san_start = s_start if s_start >= 0 else s_start + max_dim 70 | san_stop = s_stop if s_stop >= 0 else s_stop + max_dim 71 | # Some of these checks should probably be handles gracefully by DSC 72 | if s_step == 0 or san_start == san_stop: 73 | return False 74 | if (s_step > 0 and san_stop < san_start) or ( 75 | s_step < 0 and san_stop > san_start 76 | ): 77 | return False 78 | return True 79 | 80 | def test_get_slice(self): 81 | # Note: this should probably be more exhaustive 82 | x_1d = random_nd([10], np.float32) 83 | x_1d_dsc = dsc.from_numpy(x_1d) 84 | 85 | for start in range(-10, 10): 86 | for stop in range(-10, 10): 87 | for step in range(-10, 10): 88 | s = slice(start, stop, step) 89 | if not TestIndexing._validate_slice(s, 10): 90 | continue 91 | assert all_close(x_1d_dsc[s], x_1d[s]) 92 | 93 | x_2d = random_nd([5, 5], np.float32) 94 | x_2d_dsc = dsc.from_numpy(x_2d) 95 | for start in range(-5, 5): 96 | for stop in range(-5, 5): 97 | for step in range(-5, 5): 98 | s = slice(start, stop, step) 99 | if not TestIndexing._validate_slice(s, 5): 100 | continue 101 | assert all_close( 102 | x_2d_dsc[(slice(None, None, None), s)], 103 | x_2d[(slice(None, None, None), s)], 104 | ) 105 | 106 | for extra_dim in range(-5, 5): 107 | for start in range(-5, 5): 108 | for stop in range(-5, 5): 109 | for step in range(-5, 5): 110 | s = slice(start, stop, step) 111 | if not TestIndexing._validate_slice(s, 5): 112 | continue 113 | 114 | x_dsc_1 = x_2d_dsc[(extra_dim, s)] 115 | x_np_1 = x_2d[(extra_dim, s)] 116 | assert all_close(x_dsc_1, x_np_1) 117 | 118 | x_dsc_2 = x_2d_dsc[(s, extra_dim)] 119 | x_np_2 = x_2d[(s, extra_dim)] 120 | assert all_close(x_dsc_2, x_np_2) 121 | 122 | def test_set_idx(self): 123 | for n_dim in range(1, 5): 124 | for dtype in DTYPES: 125 | x = random_nd([10 for _ in range(n_dim)], dtype=dtype) 126 | x_dsc = dsc.from_numpy(x) 127 | 128 | for indexes in range(1, n_dim): 129 | for _ in range(10): 130 | idx = tuple(randint(-10, 9) for _ in range(indexes)) 131 | val = ( 132 | random() + 1 133 | if indexes == n_dim 134 | else random_nd( 135 | [10 for _ in range(n_dim - indexes)], dtype=dtype 136 | ) 137 | ) 138 | x[idx] = val 139 | x_dsc[idx] = val 140 | assert all_close(x_dsc, x) 141 | 142 | def test_set_slice(self): 143 | def _shape_from_slice(sl: slice, max_dim: int) -> List[int]: 144 | real_start = sl.start if sl.start >= 0 else sl.start + max_dim 145 | real_stop = sl.stop if sl.stop >= 0 else sl.stop + 
max_dim 146 | return [math.ceil(math.fabs(real_start - real_stop) / math.fabs(sl.step))] 147 | 148 | # This is not exhaustive, but it's good enough for now 149 | x_1d = random_nd([10], np.float32) 150 | x_1d_dsc = dsc.from_numpy(x_1d) 151 | 152 | x_1d[:] = np.ones(10, dtype=np.float32) 153 | x_1d_dsc[:] = np.ones(10, dtype=np.float32) 154 | assert all_close(x_1d_dsc, x_1d) 155 | 156 | for start in range(-10, 10): 157 | for stop in range(-10, 10): 158 | for step in range(-10, 10): 159 | s = slice(start, stop, step) 160 | if not TestIndexing._validate_slice(s, 10): 161 | continue 162 | x_1d[s] = 1516.0 163 | x_1d_dsc[s] = 1516.0 164 | assert all_close(x_1d_dsc, x_1d) 165 | 166 | val_shape = _shape_from_slice(s, 10) 167 | val = random_nd(val_shape, dtype=np.float32) 168 | x_1d[s] = val 169 | x_1d_dsc[s] = val 170 | assert all_close(x_1d_dsc, x_1d) 171 | 172 | x_2d = random_nd([5, 5], np.float32) 173 | x_2d_dsc = dsc.from_numpy(x_2d) 174 | 175 | for extra_dim in range(-5, 5): 176 | for start in range(-5, 5): 177 | for stop in range(-5, 5): 178 | for step in range(-5, 5): 179 | s = slice(start, stop, step) 180 | if not TestIndexing._validate_slice(s, 5): 181 | continue 182 | 183 | x_2d[(extra_dim, s)] = 12.0 184 | x_2d_dsc[(extra_dim, s)] = 12.0 185 | assert all_close(x_2d_dsc, x_2d) 186 | 187 | x_2d[(s, extra_dim)] = -1.55 188 | x_2d_dsc[(s, extra_dim)] = -1.55 189 | assert all_close(x_2d_dsc, x_2d) 190 | 191 | val_shape = _shape_from_slice(s, 5) 192 | val = random_nd(val_shape, np.float32) 193 | x_2d[(extra_dim, s)] = val 194 | x_2d_dsc[(extra_dim, s)] = val 195 | assert all_close(x_2d_dsc, x_2d) 196 | 197 | val = random_nd(val_shape, np.float32) 198 | x_2d[(s, extra_dim)] = val 199 | x_2d_dsc[(s, extra_dim)] = val 200 | assert all_close(x_2d_dsc, x_2d) -------------------------------------------------------------------------------- /python/tests/test_ops_common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 
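# Note: DTYPES, DSC_DTYPES, DEVICE, random_nd and all_close come from utils_cpu (imported below with *).
# DEVICE is presumably resolved from the DSC_DEVICE environment variable described in the README, so the
# NumPy-backed assertions in this file run unchanged on either the CPU or the GPU backend.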
6 | 7 | import pytest 8 | from random import randint, random 9 | from itertools import permutations 10 | from utils_cpu import * 11 | 12 | 13 | @pytest.fixture(scope='session', autouse=True) 14 | def session_fixture(): 15 | # This is invoked once before starting the test session 16 | dsc.init(int(2**30)) 17 | print(f'Running tests on {DEVICE}') 18 | dsc.set_default_device(DEVICE) 19 | yield 20 | 21 | 22 | @pytest.fixture(autouse=True) 23 | def teardown_fixture(): 24 | # This is invoked automatically after each test 25 | yield 26 | 27 | class TestInit: 28 | def test_arange(self): 29 | for _ in range(10): 30 | n = randint(1, 10_000) 31 | for dtype in DTYPES: 32 | if is_bool(dtype): 33 | continue 34 | print(f'Tensing arange with N={n} and dtype={dtype.__name__} ') 35 | res_np = np.arange(n, dtype=dtype) 36 | res_dsc = dsc.arange(n, dtype=DSC_DTYPES[dtype]) 37 | assert all_close(res_dsc, res_np) 38 | 39 | def test_random(self): 40 | for _ in range(10): 41 | shape = tuple([randint(1, 10) for _ in range(4)]) 42 | for dtype in DTYPES: 43 | if not is_float(dtype): 44 | continue 45 | print(f'Tensing randn with dtype={dtype.__name__} ') 46 | 47 | res_np = np.random.randn(*shape).astype(dtype) 48 | res_dsc = dsc.randn(*shape, dtype=DSC_DTYPES[dtype]) 49 | res_dsc_np = res_dsc.numpy() 50 | 51 | assert res_dsc_np.dtype == res_np.dtype 52 | assert res_dsc_np.shape == res_np.shape 53 | 54 | def test_creation(): 55 | for n_dim in range(4): 56 | for dtype in DTYPES: 57 | shape = tuple(randint(1, 20) for _ in range(n_dim + 1)) 58 | fill = random() 59 | 60 | x = np.full(shape, fill_value=fill, dtype=dtype) 61 | x_dsc = dsc.full(shape, fill_value=fill, dtype=DSC_DTYPES[dtype]) 62 | assert all_close(x_dsc, x) 63 | 64 | like = np.ones([randint(1, 10) for _ in range(n_dim + 1)]) 65 | 66 | x = np.full_like(like, fill_value=fill, dtype=dtype) 67 | x_dsc = dsc.full_like(like, fill_value=fill, dtype=DSC_DTYPES[dtype]) 68 | assert all_close(x_dsc, x) 69 | 70 | x = np.ones(shape, dtype=dtype) 71 | x_dsc = dsc.ones(shape, dtype=DSC_DTYPES[dtype]) 72 | assert all_close(x_dsc, x) 73 | 74 | x = np.ones_like(like, dtype=dtype) 75 | x_dsc = dsc.ones_like(like, dtype=DSC_DTYPES[dtype]) 76 | assert all_close(x_dsc, x) 77 | 78 | x = np.zeros(shape, dtype=dtype) 79 | x_dsc = dsc.zeros(shape, dtype=DSC_DTYPES[dtype]) 80 | assert all_close(x_dsc, x) 81 | 82 | x = np.zeros_like(like, dtype=dtype) 83 | x_dsc = dsc.zeros_like(like, dtype=DSC_DTYPES[dtype]) 84 | assert all_close(x_dsc, x) 85 | 86 | def test_reshape(): 87 | x = np.ones((10, 10)) 88 | x_dsc = dsc.from_numpy(x) 89 | assert all_close(x_dsc.reshape(4, 5, 5), x.reshape(4, 5, 5)) 90 | assert all_close(x_dsc.reshape([4, 5, 5]), x.reshape([4, 5, 5])) 91 | assert all_close(x_dsc.reshape((4, 5, 5)), x.reshape((4, 5, 5))) 92 | 93 | assert all_close(x_dsc.reshape(-1, 5), x.reshape(-1, 5)) 94 | assert all_close(x_dsc.reshape([-1, 5]), x.reshape([-1, 5])) 95 | assert all_close(x_dsc.reshape((-1, 5)), x.reshape((-1, 5))) 96 | 97 | def test_concat(): 98 | for n_dim in range(1, 5): 99 | for dtype in DTYPES: 100 | shape = [randint(2, 10) for _ in range(n_dim)] 101 | for axis_idx in range(n_dim): 102 | print( 103 | f'Testing concat with {n_dim}-dimensional tensors of type {dtype.__name__} on axis {axis_idx}' 104 | ) 105 | shape_x1 = list(shape) 106 | shape_x1[axis_idx] = randint(2, 10) 107 | shape_x2 = list(shape) 108 | shape_x2[axis_idx] = randint(2, 10) 109 | x1 = random_nd(shape_x1, dtype) 110 | x2 = random_nd(shape_x2, dtype) 111 | x1_dsc = dsc.from_numpy(x1) 112 | x2_dsc = 
dsc.from_numpy(x2) 113 | 114 | res_np = np.concat((x1, x2), axis_idx) 115 | res_dsc = dsc.concat((x1_dsc, x2_dsc), axis_idx) 116 | assert all_close(res_dsc, res_np) 117 | 118 | # Test flatten 119 | res_np_flat = np.concat((x1, x2), None) 120 | res_dsc_flat = dsc.concat((x1_dsc, x2_dsc), None) 121 | assert all_close(res_dsc_flat, res_np_flat) 122 | 123 | def test_split(): 124 | for n_dim in range(1, 5): 125 | for dtype in DTYPES: 126 | for axis_idx in range(n_dim): 127 | shape = [randint(2, 10) for _ in range(n_dim)] 128 | print(f'Testing split with {n_dim}-dimensional tensors of type {dtype.__name__} on axis {axis_idx}') 129 | ne = shape[axis_idx] 130 | multi = randint(1, 10) 131 | shape[axis_idx] *= multi 132 | x = random_nd(shape, dtype) 133 | x_dsc = dsc.from_numpy(x) 134 | 135 | res = np.split(x, multi, axis=axis_idx) 136 | res_dsc = dsc.split(x_dsc, ne, axis=axis_idx) 137 | assert len(res) == len(res_dsc) 138 | for r_np, r_dsc in zip(res, res_dsc): 139 | assert all_close(r_dsc, r_np) 140 | 141 | def test_repeat(): 142 | for n_dim in range(1, 5): 143 | for dtype in DTYPES: 144 | shape = [randint(2, 10) for _ in range(n_dim)] 145 | for axis_idx in range(n_dim): 146 | print(f'Testing repeat with {n_dim}-dimensional tensors of type {dtype.__name__} on axis {axis_idx}') 147 | x = random_nd(shape, dtype) 148 | x_dsc = dsc.from_numpy(x) 149 | repeats = randint(2, 5) 150 | res = np.repeat(x, repeats, axis=axis_idx) 151 | res_dsc = dsc.repeat(x_dsc, repeats, axis=axis_idx) 152 | assert all_close(res_dsc, res) 153 | 154 | def test_where(): 155 | for n_dim in range(1, 5): 156 | for dtype in DTYPES: 157 | print(f'Testing where with {n_dim}-dimensional condition tensor and values of type {dtype.__name__}') 158 | x = np.random.choice([True, False], size=tuple([randint(1, 10) for _ in range(n_dim)])) 159 | values = random_nd([2], dtype=dtype) 160 | this = values[0]; that = values[1] 161 | x_dsc = dsc.from_numpy(x) 162 | res = np.where(x, this, that) 163 | res_dsc = dsc.where(x_dsc, this, that) 164 | assert all_close(res_dsc, res) 165 | 166 | def test_transpose(): 167 | for n_dim in range(2, 5): 168 | for dtype in DTYPES: 169 | print( 170 | f'Testing transpose with {n_dim}-dimensional tensors of type {dtype.__name__}' 171 | ) 172 | shape = [randint(2, 10) for _ in range(n_dim)] 173 | x = random_nd(shape, dtype) 174 | x_dsc = dsc.from_numpy(x) 175 | # Simple transpose 176 | res_np_simple = np.transpose(x) 177 | res_dsc_simple = dsc.transpose(x_dsc) 178 | assert all_close(res_dsc_simple, res_np_simple) 179 | 180 | # Test with all the permutations of axes, both positive and negative 181 | for axes in permutations(range(-n_dim, 0), n_dim): 182 | res_np = np.transpose(x, axes) 183 | res_dsc = dsc.transpose(x_dsc, axes) 184 | assert all_close(res_dsc, res_np) 185 | 186 | for axes in permutations(range(0, n_dim), n_dim): 187 | res_np = np.transpose(x, axes) 188 | res_dsc = dsc.transpose(x_dsc, axes) 189 | assert all_close(res_dsc, res_np) 190 | 191 | def test_tril(): 192 | for n_dim in range(2, 5): 193 | for dtype in DTYPES: 194 | x = random_nd([randint(1, 10) for _ in range(n_dim)], dtype) 195 | x_dsc = dsc.from_numpy(x) 196 | for k in range(-1, 2): 197 | print(f'Testing tril with {n_dim}-dimensional tensors of type {dtype.__name__} k={k}') 198 | res = np.tril(x, k) 199 | res_dsc = dsc.tril(x_dsc, k) 200 | assert all_close(res_dsc, res) 201 | 202 | def test_masked_fill(): 203 | for n_dim in range(1, 5): 204 | for dtype in DTYPES: 205 | if not is_float(dtype): 206 | continue 207 | 208 | print(f'Testing 
masked_fill with {n_dim}-dimensional tensors of type {dtype.__name__}') 209 | x = random_nd([randint(1, 10) for _ in range(n_dim)], dtype) 210 | mask = random_nd(x.shape, np.bool) 211 | x_dsc = dsc.from_numpy(x) 212 | mask_dsc = dsc.from_numpy(mask) 213 | fill = random() 214 | 215 | x[mask] = fill 216 | res_dsc = x_dsc.masked_fill(mask_dsc, fill) 217 | assert all_close(res_dsc, x) 218 | -------------------------------------------------------------------------------- /dsc/include/gpu/dsc_gpu.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | #include "dsc.h" 10 | // #include "dsc_device.h" 11 | 12 | #if defined(DSC_CUDA) && defined(DSC_HIP) 13 | # error "DSC can't be compiled with both CUDA and HIP support" 14 | #endif 15 | 16 | 17 | #if defined(DSC_CUDA) || defined(DSC_HIP) 18 | 19 | #if defined(DSC_CUDA) 20 | # include "platform/dsc_cuda_platform.h" 21 | #endif 22 | 23 | #if defined(DSC_HIP) 24 | # include "platform/dsc_hip_platform.h" 25 | #endif 26 | 27 | #define DSC_GPU_KERNEL __global__ 28 | #define DSC_GPU_FUNC __device__ 29 | #define DSC_GPU_DEFAULT_THREADS ((uint) 256) 30 | #define DSC_GPU_MAX_BLOCKS ((uint) 256) 31 | 32 | #define DSC_GPU_BLOCKS(n) DSC_MIN(DSC_GPU_MAX_BLOCKS, DSC_CEIL(n, DSC_GPU_DEFAULT_THREADS)) 33 | #define DSC_GPU_TID() const int tid = (int) (threadIdx.x + blockIdx.x * blockDim.x) 34 | #define DSC_GPU_STRIDE() const int stride = (int) (blockDim.x * gridDim.x) 35 | 36 | struct dsc_device; 37 | 38 | struct dsc_gpu_dev_info { 39 | char name[256]; 40 | gpu_rand_state *rand_state; 41 | gpu_blas_handle blas_handle; 42 | int dev_idx; 43 | dsc_gpu_platform platform; 44 | }; 45 | 46 | // ============================================================ 47 | // Utilities 48 | // 49 | 50 | static DSC_INLINE int dsc_gpu_devices() { 51 | int devices; 52 | DSC_GPU_CHECK(gpu_get_device_count(&devices)); 53 | return devices; 54 | } 55 | 56 | static DSC_INLINE int dsc_gpu_dev_capability(const int dev) { 57 | gpu_device_props prop{}; 58 | DSC_GPU_CHECK(gpu_get_device_properties(&prop, dev)); 59 | return prop.major * 100 + prop.minor * 10; 60 | } 61 | 62 | static DSC_INLINE void dsc_gpu_dev_name(const int dev, char *dst) { 63 | gpu_device_props prop{}; 64 | DSC_GPU_CHECK(gpu_get_device_properties(&prop, dev)); 65 | strncpy(dst, prop.name, 256); 66 | } 67 | 68 | static DSC_INLINE usize dsc_gpu_dev_mem(const int dev) { 69 | gpu_device_props prop{}; 70 | DSC_GPU_CHECK(gpu_get_device_properties(&prop, dev)); 71 | return prop.totalGlobalMem; 72 | } 73 | 74 | static DSC_INLINE void dsc_gpu_sync() { 75 | DSC_GPU_CHECK(gpu_device_sync()); 76 | } 77 | 78 | static DSC_INLINE bool dsc_gpu_has_bf16() { 79 | #if defined(DSC_BF16) 80 | return true; 81 | #else 82 | return false; 83 | #endif 84 | } 85 | 86 | // ============================================================ 87 | // GPU-specific operations 88 | // 89 | 90 | extern void dsc_gpu_cast(dsc_device *dev, const dsc_tensor *DSC_RESTRICT x, 91 | dsc_tensor *DSC_RESTRICT out); 92 | 93 | extern void dsc_gpu_arange(dsc_device *dev, dsc_tensor *DSC_RESTRICT x, 94 | f64 start, f64 step); 95 | 96 | extern void dsc_gpu_repeat(dsc_device *dev, 97 | const dsc_tensor *DSC_RESTRICT x, 98 | dsc_tensor *DSC_RESTRICT out, 99 | int repeats, int axis_idx); 100 | 101 | extern void 
dsc_gpu_randn(dsc_device *dev, dsc_tensor *DSC_RESTRICT x); 102 | 103 | extern void dsc_gpu_concat(dsc_device *dev, 104 | dsc_tensor **to_concat, 105 | int tensors, 106 | dsc_tensor *DSC_RESTRICT out, 107 | int axis_idx); 108 | 109 | extern void dsc_gpu_transpose(dsc_device *dev, 110 | const dsc_tensor *DSC_RESTRICT x, 111 | dsc_tensor *DSC_RESTRICT out, 112 | const int *new_shape, 113 | const int *new_stride); 114 | 115 | extern void dsc_gpu_tril(dsc_device *dev, 116 | const dsc_tensor *DSC_RESTRICT x, 117 | int diagonal, 118 | dsc_tensor *DSC_RESTRICT out); 119 | 120 | // ============================================================ 121 | // Indexing and Slicing 122 | 123 | extern void dsc_gpu_get_slice(dsc_device *dev, 124 | const dsc_tensor *DSC_RESTRICT x, 125 | dsc_tensor *DSC_RESTRICT out, 126 | int n_slices, const dsc_slice *slices, 127 | bool whole); 128 | 129 | extern void dsc_gpu_set_slice(dsc_device *dev, 130 | dsc_tensor *DSC_RESTRICT xa, 131 | bool xa_scalar, 132 | const dsc_tensor *DSC_RESTRICT xb, 133 | bool xb_scalar, 134 | int n_slices, 135 | const dsc_slice *slices, 136 | bool whole); 137 | 138 | // ============================================================ 139 | // Binary Operations 140 | 141 | extern void dsc_gpu_add(dsc_device *dev, 142 | const dsc_tensor *xa, 143 | const dsc_tensor *xb, 144 | dsc_tensor *out); 145 | 146 | extern void dsc_gpu_sub(dsc_device *dev, 147 | const dsc_tensor *xa, 148 | const dsc_tensor *xb, 149 | dsc_tensor *out); 150 | 151 | extern void dsc_gpu_mul(dsc_device *dev, 152 | const dsc_tensor *xa, 153 | const dsc_tensor *xb, 154 | dsc_tensor *out); 155 | 156 | extern void dsc_gpu_div(dsc_device *dev, 157 | const dsc_tensor *xa, 158 | const dsc_tensor *xb, 159 | dsc_tensor *out); 160 | 161 | extern void dsc_gpu_pow(dsc_device *dev, 162 | const dsc_tensor *xa, 163 | const dsc_tensor *xb, 164 | dsc_tensor *out); 165 | 166 | extern void dsc_gpu_matmul(dsc_device *dev, 167 | const dsc_tensor *DSC_RESTRICT xa, 168 | const dsc_tensor *DSC_RESTRICT xb, 169 | bool trans_b, 170 | dsc_tensor *DSC_RESTRICT out); 171 | 172 | extern void dsc_gpu_compare(dsc_device *dev, 173 | const dsc_tensor *xa, 174 | const dsc_tensor *xb, 175 | dsc_comparison_op comp, 176 | dsc_tensor *out); 177 | 178 | extern void dsc_gpu_masked_fill(dsc_device *dev, 179 | dsc_tensor *DSC_RESTRICT x, 180 | const dsc_tensor *DSC_RESTRICT mask, 181 | f64 value); 182 | 183 | extern void dsc_gpu_outer(dsc_device *dev, 184 | const dsc_tensor *DSC_RESTRICT xa, 185 | const dsc_tensor *DSC_RESTRICT xb, 186 | dsc_tensor *DSC_RESTRICT out); 187 | 188 | extern void dsc_gpu_where(dsc_device *dev, 189 | const dsc_tensor *DSC_RESTRICT condition, 190 | const dsc_tensor *DSC_RESTRICT input, 191 | const dsc_tensor *DSC_RESTRICT other, 192 | dsc_tensor *DSC_RESTRICT out); 193 | 194 | // ============================================================ 195 | // Unary Operations 196 | 197 | extern void dsc_gpu_cos(dsc_device *dev, 198 | const dsc_tensor *DSC_RESTRICT x, 199 | dsc_tensor *DSC_RESTRICT out); 200 | 201 | extern void dsc_gpu_sin(dsc_device *dev, 202 | const dsc_tensor *DSC_RESTRICT x, 203 | dsc_tensor *DSC_RESTRICT out); 204 | 205 | extern void dsc_gpu_tanh(dsc_device *dev, 206 | const dsc_tensor *DSC_RESTRICT x, 207 | dsc_tensor *DSC_RESTRICT out); 208 | 209 | extern void dsc_gpu_exp(dsc_device *dev, 210 | const dsc_tensor *DSC_RESTRICT x, 211 | dsc_tensor *DSC_RESTRICT out); 212 | 213 | extern void dsc_gpu_sqrt(dsc_device *dev, 214 | const dsc_tensor *DSC_RESTRICT x, 215 | dsc_tensor 
*DSC_RESTRICT out); 216 | 217 | // ============================================================ 218 | // Unary Operations Along Axis 219 | 220 | extern void dsc_gpu_sum(dsc_device *dev, 221 | const dsc_tensor *DSC_RESTRICT x, 222 | dsc_tensor *DSC_RESTRICT out, 223 | int axis_idx); 224 | 225 | extern void dsc_gpu_min(dsc_device *dev, 226 | const dsc_tensor *DSC_RESTRICT x, 227 | dsc_tensor *DSC_RESTRICT out, 228 | int axis_idx); 229 | 230 | extern void dsc_gpu_max(dsc_device *dev, 231 | const dsc_tensor *DSC_RESTRICT x, 232 | dsc_tensor *DSC_RESTRICT out, 233 | int axis_idx); 234 | 235 | #else 236 | 237 | #define DSC_GPU_PLATFORM NONE 238 | 239 | static DSC_INLINE int dsc_gpu_devices() { 240 | return 0; 241 | } 242 | 243 | static DSC_INLINE int dsc_gpu_dev_capability(const int) { 244 | return 0; 245 | } 246 | 247 | static DSC_INLINE void dsc_gpu_dev_name(const int, char *) {} 248 | 249 | static DSC_INLINE usize dsc_gpu_dev_mem(const int) { 250 | return 0; 251 | } 252 | 253 | static DSC_INLINE void dsc_gpu_sync() {} 254 | 255 | static DSC_INLINE bool dsc_gpu_has_bf16() { 256 | return false; 257 | } 258 | 259 | #endif // DSC_CUDA || DSC_HIP -------------------------------------------------------------------------------- /dsc/include/gpu/dsc_ops.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | #include "gpu/dsc_gpu.h" 10 | 11 | #define atomic_cas_f32(PTR, VAL) \ 12 | do { \ 13 | uint *addr = (uint *) (PTR); \ 14 | uint old = *addr, assumed; \ 15 | do { \ 16 | assumed = old; \ 17 | const f32 assumed_val = __int_as_float(assumed); \ 18 | const f32 new_val = VAL; \ 19 | old = atomicCAS(addr, assumed, __float_as_int(new_val)); \ 20 | } while (old != assumed); \ 21 | } while (0) 22 | 23 | #define atomic_cas_f64(PTR, VAL) \ 24 | do { \ 25 | unsigned long long *addr = (unsigned long long *) (PTR); \ 26 | unsigned long long old = *addr, assumed; \ 27 | do { \ 28 | assumed = old; \ 29 | const f64 assumed_val = __longlong_as_double(assumed); \ 30 | const f64 new_val = VAL; \ 31 | old = atomicCAS(addr, assumed, __double_as_longlong(new_val)); \ 32 | } while (old != assumed); \ 33 | } while (0) 34 | 35 | 36 | struct gpu_cast_op { 37 | template 38 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE Tout operator()(const Tin in) const { 39 | #if defined(DSC_BF16) 40 | return (Tout) in; 41 | #else 42 | if constexpr (dsc_is_type()) { 43 | // If BF16 is not supported use the same logic as the CPU 44 | union { 45 | f32 f; 46 | u32 i; 47 | } u; 48 | u.i = (u32) in << 16; 49 | return (Tout) u.f; 50 | } else { 51 | return (Tout) in; 52 | } 53 | #endif 54 | } 55 | }; 56 | 57 | struct gpu_add_op { 58 | template 59 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 60 | if constexpr (dsc_is_type()) { 61 | return xa || xb; 62 | } else { 63 | return xa + xb; 64 | } 65 | } 66 | }; 67 | 68 | struct gpu_atomic_add_op { 69 | template 70 | DSC_GPU_FUNC DSC_INLINE void operator()(T *x, const T val) const { 71 | if constexpr (dsc_is_type()) { 72 | atomicOr(x, val); 73 | } else { 74 | atomicAdd(x, val); 75 | } 76 | } 77 | }; 78 | 79 | struct gpu_sub_op { 80 | template 81 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 82 | if constexpr (dsc_is_type()) { 83 | return xa ^ xb; 84 | } else { 
85 | return xa - xb; 86 | } 87 | } 88 | }; 89 | 90 | struct gpu_mul_op { 91 | template 92 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 93 | if constexpr (dsc_is_type()) { 94 | return xa && xb; 95 | } else { 96 | return xa * xb; 97 | } 98 | } 99 | }; 100 | 101 | struct gpu_div_op { 102 | template 103 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 104 | return xa / xb; 105 | } 106 | }; 107 | 108 | struct gpu_pow_op { 109 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE i32 operator()(const i32 base, const i32 exp) const { 110 | i32 acc = 1; 111 | for (int i = 0; i < exp; ++i) acc *= base; 112 | return acc; 113 | } 114 | 115 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bf16 operator()(const bf16 base, const bf16 exp) const { 116 | return gpu_pow_op()((f32) base, (f32) exp); 117 | } 118 | 119 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 base, const f32 exp) const { 120 | return powf(base, exp); 121 | } 122 | 123 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 base, const f64 exp) const { 124 | return pow(base, exp); 125 | } 126 | }; 127 | 128 | struct gpu_cos_op { 129 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bf16 operator()(const bf16 x) const { 130 | return gpu_cos_op()((f32) x); 131 | } 132 | 133 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 134 | return cosf(x); 135 | } 136 | 137 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 138 | return cos(x); 139 | } 140 | }; 141 | 142 | struct gpu_sin_op { 143 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bf16 operator()(const bf16 x) const { 144 | return gpu_sin_op()((f32) x); 145 | } 146 | 147 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 148 | return sinf(x); 149 | } 150 | 151 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 152 | return sin(x); 153 | } 154 | }; 155 | 156 | struct gpu_tanh_op { 157 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bf16 operator()(const bf16 x) const { 158 | return gpu_tanh_op()((f32) x); 159 | } 160 | 161 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 162 | return tanhf(x); 163 | } 164 | 165 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 166 | return tanh(x); 167 | } 168 | }; 169 | 170 | struct gpu_sqrt_op { 171 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bf16 operator()(const bf16 x) const { 172 | return gpu_sqrt_op()((f32) x); 173 | } 174 | 175 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 176 | return sqrtf(x); 177 | } 178 | 179 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 180 | return sqrt(x); 181 | } 182 | }; 183 | 184 | struct gpu_exp_op { 185 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bf16 operator()(const bf16 x) const { 186 | return gpu_exp_op()((f32) x); 187 | } 188 | 189 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 190 | return expf(x); 191 | } 192 | 193 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 194 | return exp(x); 195 | } 196 | }; 197 | 198 | struct gpu_max_op { 199 | template 200 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 201 | return DSC_MAX(xa, xb); 202 | } 203 | }; 204 | 205 | struct gpu_atomic_max_op { 206 | DSC_GPU_FUNC DSC_INLINE void operator()(f32 *x, const f32 val) const { 207 | atomic_cas_f32(x, 
DSC_MAX(val, assumed_val)); 208 | } 209 | 210 | DSC_GPU_FUNC DSC_INLINE void operator()(f64 *x, const f64 val) const { 211 | atomic_cas_f64(x, DSC_MAX(val, assumed_val)); 212 | } 213 | }; 214 | 215 | struct gpu_min_op { 216 | template 217 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 218 | return DSC_MIN(xa, xb); 219 | } 220 | }; 221 | 222 | struct gpu_atomic_min_op { 223 | DSC_GPU_FUNC DSC_INLINE void operator()(f32 *x, const f32 val) const { 224 | atomic_cas_f32(x, DSC_MIN(val, assumed_val)); 225 | } 226 | 227 | DSC_GPU_FUNC DSC_INLINE void operator()(f64 *x, const f64 val) const { 228 | atomic_cas_f64(x, DSC_MIN(val, assumed_val)); 229 | } 230 | }; 231 | 232 | struct gpu_eq_op { 233 | template 234 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 235 | return xa == xb; 236 | } 237 | }; 238 | 239 | struct gpu_ne_op { 240 | template 241 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 242 | return !gpu_eq_op()(xa, xb); 243 | } 244 | }; 245 | 246 | struct gpu_lt_op { 247 | template 248 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 249 | return xa < xb; 250 | } 251 | }; 252 | struct gpu_le_op { 253 | template 254 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 255 | return xa <= xb; 256 | } 257 | }; 258 | 259 | struct gpu_gt_op { 260 | template 261 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 262 | return xa > xb; 263 | } 264 | }; 265 | 266 | struct gpu_ge_op { 267 | template 268 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 269 | return xa >= xb; 270 | } 271 | }; 272 | 273 | 274 | template 275 | consteval bool is_comparison_op() { 276 | return dsc_is_type() || 277 | dsc_is_type() || 278 | dsc_is_type() || 279 | dsc_is_type() || 280 | dsc_is_type() || 281 | dsc_is_type(); 282 | } 283 | 284 | template 285 | consteval bool is_bool_arith_op() { 286 | return dsc_is_type() || 287 | dsc_is_type() || 288 | dsc_is_type(); 289 | } -------------------------------------------------------------------------------- /examples/models/gpt2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 
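# Example invocation (a sketch: the path and prompt are illustrative, the flags mirror the argparse
# definitions at the bottom of this file):
#
#   python3 examples/models/gpt2.py "The meaning of life is" -n 50 --device gpu --dtype bf16
#
# -n sets the number of tokens to generate (default 100), --no-cache disables the KV cache and
# --device / --dtype choose where and in which precision inference runs.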
6 | 7 | 8 | import dsc 9 | import dsc.nn as nn 10 | import dsc.nn.functional as F 11 | from dataclasses import dataclass 12 | from transformers import GPT2Tokenizer 13 | from time import perf_counter 14 | import argparse 15 | 16 | 17 | @dataclass 18 | class GPT2Hparams: 19 | # default hyperparameters for GPT-2 small 20 | n_layers: int = 12 21 | n_heads: int = 12 22 | emb_size: int = 768 23 | block_size: int = 1024 24 | vocab_size: int = 50257 25 | 26 | 27 | class MultiHeadAttention(nn.Module): 28 | def __init__(self, hparams: GPT2Hparams, use_cache: bool = True, dtype: dsc.Dtype = dsc.f32): 29 | super().__init__() 30 | self.block_size = hparams.block_size 31 | self.emb_size = hparams.emb_size 32 | self.n_heads = hparams.n_heads 33 | # Stacked attention, contains the projections of both Q, K and V 34 | self.c_attn = nn.Linear(self.emb_size, 3 * self.emb_size, dtype=dtype) 35 | self.c_proj = nn.Linear(self.emb_size, self.emb_size, dtype=dtype) 36 | # Causal mask 37 | self.tril = dsc.tril(dsc.ones((self.block_size, self.block_size))) 38 | 39 | # KV cache 40 | self.use_cache = use_cache 41 | self.cache_k = None 42 | self.cache_v = None 43 | 44 | @dsc.trace('MultiHeadAttention') 45 | def forward(self, x: dsc.Tensor) -> dsc.Tensor: 46 | B, T, C = x.shape # (block size, context size, emb size) 47 | attn = self.c_attn(x) 48 | 49 | q, k, v = attn.split(self.emb_size, axis=2) # (B, T, C) 50 | q = q.reshape(B, T, self.n_heads, self.emb_size // self.n_heads).transpose((0, 2, 1, 3)) # (B, nh, T, hs) given EMB_SIZE is a multiple of N_HEADS 51 | k = k.reshape(B, T, self.n_heads, self.emb_size // self.n_heads).transpose((0, 2, 1, 3)) # (B, nh, T, hs) 52 | v = v.reshape(B, T, self.n_heads, self.emb_size // self.n_heads).transpose((0, 2, 1, 3)) # (B, nh, T, hs) 53 | 54 | if self.use_cache: 55 | if self.cache_k is not None: 56 | k = dsc.concat([self.cache_k, k], axis=2) 57 | 58 | if self.cache_v is not None: 59 | v = dsc.concat([self.cache_v, v], axis=2) 60 | 61 | self.cache_k = k 62 | self.cache_v = v 63 | 64 | seq_len = k.size(2) 65 | k_t = k.transpose((0, 1, 3, 2)) 66 | 67 | # Self Attention (B, nh, T, hs) @ (B, nh, hs, T) = (B, nh, T, T) 68 | q_k = q @ k_t 69 | attention = q_k * q.size(-1) ** -0.5 70 | 71 | if not self.use_cache or seq_len == T: 72 | # Masking is needed when we are not using the cache or when using the cache and we are processing the prompt 73 | attention = attention.masked_fill(self.tril[:T, :T] == 0, float('-inf')) 74 | 75 | attention = F.softmax(attention, axis=-1) 76 | out = attention @ v # (B, nh, T, T) @ (B, nh, T, hs) = (B, nh, T, hs) 77 | out = out.transpose((0, 2, 1, 3)).reshape(B, T, C) 78 | 79 | return self.c_proj(out) 80 | 81 | 82 | class TransformerBlock(nn.Module): 83 | def __init__(self, hparams: GPT2Hparams, use_cache: bool = True, dtype: dsc.Dtype = dsc.f32): 84 | super().__init__() 85 | self.ln_1 = nn.LayerNorm(hparams.emb_size, dtype=dtype) 86 | self.attn = MultiHeadAttention(hparams, use_cache, dtype=dtype) 87 | self.ln_2 = nn.LayerNorm(hparams.emb_size, dtype=dtype) 88 | self.mlp = nn.ModuleDict(dict( 89 | c_fc = nn.Linear(hparams.emb_size, hparams.emb_size * 4, dtype=dtype), 90 | c_proj = nn.Linear(hparams.emb_size * 4, hparams.emb_size, dtype=dtype), 91 | )) 92 | 93 | @dsc.trace('TransformerBlock') 94 | def forward(self, x: dsc.Tensor) -> dsc.Tensor: 95 | m = self.mlp 96 | x = x + self.attn(self.ln_1(x)) 97 | return x + m.c_proj(F.gelu(m.c_fc(self.ln_2(x)))) 98 | 99 | 100 | class GPT2(nn.Module): 101 | def __init__(self, hparams: GPT2Hparams, use_cache: bool = 
True, dtype: dsc.Dtype = dsc.f32): 102 | super().__init__() 103 | self.hparams = hparams 104 | self.wpe = nn.Embedding(hparams.block_size, hparams.emb_size, dtype=dtype) 105 | self.wte = nn.Embedding(hparams.vocab_size, hparams.emb_size, dtype=dtype) 106 | self.h = nn.ModuleList([TransformerBlock(hparams, use_cache, dtype=dtype) for _ in range(hparams.n_layers)]) 107 | self.ln_f = nn.LayerNorm(hparams.emb_size, dtype=dtype) 108 | self.lm_head = nn.Linear(hparams.emb_size, hparams.vocab_size, bias=False, dtype=dtype) 109 | self.use_cache = use_cache 110 | self.kv_pos = 0 111 | 112 | n_params = sum([p.ne for p in self.parameters()]) 113 | print(f'Model has {round(n_params / 1e6)}M parameters') 114 | 115 | @staticmethod 116 | def from_pretrained(hparams: GPT2Hparams = GPT2Hparams(), use_cache: bool = True, dtype: dsc.Dtype = dsc.f32) -> 'GPT2': 117 | # GPT2 uses Conv1D instead of a Linear layer which means we have to transpose the weights 118 | state_dict = nn.safe_load('https://huggingface.co/openai-community/gpt2/resolve/main/model.safetensors', 119 | use_dtype=dtype) 120 | for i in range(hparams.n_layers): 121 | # The causal mask doesn't need to be loaded, so I'll just remove it 122 | del state_dict[f'h.{i}.attn.bias'] 123 | 124 | to_transpose = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight'] 125 | my_model = GPT2(hparams, use_cache, dtype=dtype) 126 | my_model.from_state(state_dict, 127 | on_hook=[(to_transpose, lambda x: x.transpose())], 128 | tied={'lm_head.weight': 'wte.weight'}) # lm_head and wte weights are tied 129 | del state_dict 130 | dsc.print_mem_usage() 131 | return my_model 132 | 133 | @dsc.trace('GPT2') 134 | def forward(self, idx: dsc.Tensor) -> dsc.Tensor: 135 | B, T = idx.shape 136 | tok_emb = self.wte(idx) 137 | if self.use_cache: 138 | pos_emb = self.wpe(dsc.arange(T, device='cpu') + self.kv_pos) 139 | else: 140 | pos_emb = self.wpe(dsc.arange(T, device='cpu')) 141 | 142 | x = tok_emb + pos_emb 143 | for block in self.h: 144 | x = block(x) 145 | 146 | x = self.ln_f(x) 147 | logits = self.lm_head(x) 148 | self.kv_pos += T 149 | return logits 150 | 151 | def generate(self, idx: dsc.Tensor, tokenizer, max_new_tokens: int, temp: float = 1) -> dsc.Tensor: 152 | assert max_new_tokens < self.hparams.block_size 153 | # Include the input in the response 154 | generated = idx 155 | # The first time process the entire prompt then only the last token 156 | idx_next = idx 157 | sampling_start = None; sampling_stop = None 158 | generation_start = None; generation_stop = None 159 | for counter in range(max_new_tokens): 160 | if counter == 0: 161 | sampling_start = perf_counter() 162 | elif counter == 1: 163 | generation_start = perf_counter() 164 | 165 | if self.use_cache: 166 | logits = self(idx_next) 167 | else: 168 | logits = self(generated) 169 | # Apply temperature to the last row of each bach 170 | logits = logits[:, -1, :] * (1 / temp) 171 | 172 | kth_value = dsc.kth(logits.reshape(-1), 10) 173 | logits = logits.masked_fill(logits < kth_value, -float('Inf')) 174 | probs = F.softmax(logits, axis=-1) 175 | 176 | idx_next = dsc.multinomial(probs, num_samples=1) 177 | 178 | idx_next = idx_next.to('cpu') 179 | print(tokenizer.decode(idx_next[0]), end='', flush=True) 180 | generated = dsc.concat([generated, idx_next], axis=1) 181 | if counter == 0: 182 | sampling_stop = perf_counter() 183 | 184 | generation_stop = perf_counter() 185 | print('\n') 186 | 187 | # Report metrics 188 | prompt_processing_time_ms = (sampling_stop - sampling_start) * 1e3 
189 | generation_processing_time_ms = (generation_stop - generation_start) * 1e3 190 | total_processing_time_ms = (generation_stop - sampling_start) * 1e3 191 | print(f'prompt processing time\t= {round(prompt_processing_time_ms, 1)}ms') 192 | print(f'generation time\t\t= {round(generation_processing_time_ms, 1)} ms | {round(generation_processing_time_ms / max_new_tokens, 2)} ms/tok') 193 | print(f'total time\t\t= {round(total_processing_time_ms, 1)} ms | {round(max_new_tokens / (total_processing_time_ms / 1e3), 2)} tok/s') 194 | return generated 195 | 196 | 197 | if __name__ == '__main__': 198 | cli = argparse.ArgumentParser(description='GPT2 inference CLI') 199 | cli.add_argument('prompt', type=str, help='Model prompt') 200 | cli.add_argument('-n', type=int, default=100, help='Tokens to generate (default=100)') 201 | cli.add_argument('--no-cache', action='store_true', help='Disable KV cache') 202 | cli.add_argument('--device', choices=['cpu', 'gpu'], default='cpu', help='Device on which to run the model') 203 | cli.add_argument('--dtype', choices=['f32', 'bf16'], default='f32', help='Dtype to use for inference') 204 | 205 | args = cli.parse_args() 206 | 207 | dsc.set_default_device(args.device) 208 | use_kv_cache = not args.no_cache 209 | prompt = args.prompt 210 | max_tokens = args.n 211 | 212 | dtype = dsc.f32 213 | if args.dtype == 'bf16': 214 | dtype = dsc.bf16 215 | 216 | print(f'Running model on {args.device} using {dtype}') 217 | 218 | model = GPT2.from_pretrained(use_cache=use_kv_cache, dtype=dtype) 219 | 220 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 221 | print(prompt, end='', flush=True) 222 | 223 | idx = tokenizer.encode(prompt) 224 | model.generate(dsc.tensor(idx, dtype=dsc.i32, device='cpu').reshape(1, -1), tokenizer=tokenizer, max_new_tokens=max_tokens) 225 | -------------------------------------------------------------------------------- /python/tests/test_ops_gpu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 
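# These tests compare DSC's GPU backend against PyTorch tensors allocated with device='cuda' and are
# skipped automatically when dsc.gpu.is_available() is False (see session_fixture below). On AMD
# hardware this is assumed to work with the ROCm build of PyTorch, which exposes the same 'cuda'
# device string; see the README note on installing PyTorch for ROCm.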
6 | 7 | 8 | import torch 9 | import dsc 10 | from random import randint, random 11 | from typing import List 12 | import pytest 13 | 14 | 15 | DTYPES = [torch.bool, torch.int32, torch.float32, torch.float64] 16 | TORCH_TO_DSC_DTYPES = { 17 | torch.bool: dsc.bool_, 18 | torch.int32: dsc.i32, 19 | torch.float32: dsc.f32, 20 | torch.float64: dsc.f64, 21 | } 22 | 23 | if dsc.gpu.has_bf16(): 24 | DTYPES = [torch.bool, torch.int32, torch.bfloat16, torch.float32, torch.float64] 25 | TORCH_TO_DSC_DTYPES = { 26 | torch.bool: dsc.bool_, 27 | torch.int32: dsc.i32, 28 | torch.bfloat16: dsc.bf16, 29 | torch.float32: dsc.f32, 30 | torch.float64: dsc.f64, 31 | } 32 | 33 | DSC_TO_TORCH_DTYPES = {v: k for k, v in TORCH_TO_DSC_DTYPES.items()} 34 | 35 | 36 | def is_float(dtype: torch.dtype) -> bool: 37 | return dtype == torch.bfloat16 or dtype == torch.float32 or dtype == torch.float64 38 | 39 | 40 | def is_bool(dtype: torch.dtype) -> bool: 41 | return dtype == torch.bool 42 | 43 | 44 | def is_integer(dtype: torch.dtype) -> bool: 45 | return dtype == torch.int32 46 | 47 | 48 | @pytest.fixture(scope='session', autouse=True) 49 | def session_fixture(): 50 | if not dsc.gpu.is_available(): 51 | pytest.skip('GPU not available - skipping all GPU tests', allow_module_level=True) 52 | dsc.init(int(2**30)) 53 | # This is invoked once before starting the test session 54 | dsc.set_default_device('gpu') 55 | yield 56 | 57 | 58 | @pytest.fixture(autouse=True) 59 | def teardown_fixture(): 60 | # This is invoked automatically after each test 61 | yield 62 | 63 | 64 | def random_nd(shape: List[int], dtype: torch.dtype = torch.float64) -> (torch.Tensor, dsc.Tensor): 65 | torch_tensor = None 66 | if dtype == torch.bool: 67 | torch_tensor = torch.randint(0, 2, size=tuple(shape), device='cuda').to(dtype) 68 | elif dtype == torch.int32: 69 | # Return a positive integer tensor if the dtype is int32 so that we don't have issues 70 | # with power 71 | torch_tensor = torch.randint(0, 10, size=tuple(shape), device='cuda').to(dtype) 72 | else: 73 | torch_tensor = torch.randn(*tuple(shape), device='cuda').to(dtype) 74 | 75 | return (torch_tensor, 76 | dsc.frombuffer(torch_tensor.shape, TORCH_TO_DSC_DTYPES[torch_tensor.dtype], 77 | torch_tensor.data_ptr(), device='gpu', data_device='gpu')) 78 | 79 | 80 | def all_close(actual: dsc.Tensor, target: torch.Tensor, atol: float = 1e-4, rtol: float = 1e-4) -> bool: 81 | torch.cuda.synchronize() 82 | dsc.gpu.synchronize() 83 | actual_dtype = DSC_TO_TORCH_DTYPES[actual.dtype] 84 | actual_torch = torch.as_tensor(actual, device='cuda').view(actual_dtype) 85 | if is_float(actual_dtype) and not is_float(target.dtype): 86 | target = target.to(dtype=actual_dtype) 87 | return torch.allclose(actual_torch, target, atol=atol, rtol=rtol, equal_nan=True) 88 | 89 | 90 | class TestOps: 91 | def test_binary(self): 92 | ops = { 93 | 'add': (torch.add, dsc.add), 94 | 'sub': (torch.subtract, dsc.sub), 95 | 'mul': (torch.multiply, dsc.mul), 96 | 'div': (torch.true_divide, dsc.true_div), 97 | 'pow': (torch.pow, dsc.power), 98 | 'equal': (torch.eq, dsc.equal), 99 | 'not_equal': (torch.ne, dsc.not_equal), 100 | 'less': (torch.lt, dsc.less), 101 | 'less_equal': (torch.le, dsc.less_equal), 102 | 'greater': (torch.gt, dsc.greater), 103 | 'greater_equal': (torch.ge, dsc.greater_equal), 104 | } 105 | for op_name in ops.keys(): 106 | torch_op, dsc_op = ops[op_name] 107 | for dtype in DTYPES: 108 | if op_name == 'sub': 109 | torch_op = torch.bitwise_xor if is_bool(dtype) else torch.subtract 110 | if op_name == 'pow' and 
is_bool(dtype): 111 | # Pow on CUDA is not implemented in torch for bool 112 | continue 113 | 114 | atol = 1e-4; rtol = 1e-4 115 | if dtype == torch.bfloat16: 116 | atol = 1e-1; rtol = 1e-2 117 | print(f'Testing operator {op_name} with {dtype}') 118 | shape = [randint(2, 10) for _ in range(4)] 119 | x, x_dsc = random_nd(shape, dtype=dtype) 120 | 121 | # Same shape 122 | y, y_dsc = random_nd(shape, dtype=dtype) 123 | 124 | res_torch = torch_op(x, y) 125 | res_dsc = dsc_op(x_dsc, y_dsc) 126 | r_res_torch = torch_op(y, x) 127 | r_res_dsc = dsc_op(y_dsc, x_dsc) 128 | assert all_close(res_dsc, res_torch, atol, rtol), f'Error testing ({x.shape} {op_name} {y.shape}) dtype={dtype}' 129 | assert all_close(r_res_dsc, r_res_torch, atol, rtol), f'Error testing ({y.shape} {op_name} {x.shape}) dtype={dtype}' 130 | 131 | # Broadcasting 132 | collapse_idx = randint(0, 3) 133 | shape[collapse_idx] = 1 134 | y_b, y_dsc_b = random_nd(shape, dtype=dtype) 135 | 136 | res_torch_b = torch_op(x, y_b) 137 | res_dsc_b = dsc_op(x_dsc, y_dsc_b) 138 | r_res_torch_b = torch_op(y_b, x) 139 | r_res_dsc_b = dsc_op(y_dsc_b, x_dsc) 140 | assert all_close(res_dsc_b, res_torch_b, atol, rtol), f'Error testing ({x.shape} {op_name} {y_b.shape}) dtype={dtype}' 141 | assert all_close(r_res_dsc_b, r_res_torch_b, atol, rtol), f'Error testing ({y_b.shape} {op_name} {x.shape}) dtype={dtype}' 142 | 143 | # Scalar 144 | if is_float(dtype): 145 | y_s = random() 146 | 147 | elif is_bool(dtype): 148 | y_s = bool(randint(0, 1)) 149 | else: 150 | y_s = randint(0, 10) 151 | 152 | res_torch_s = torch_op(x, y_s) 153 | res_dsc_s = dsc_op(x_dsc, y_s) 154 | if 'equal' in op_name or op_name == 'less' or op_name == 'greater': 155 | # For comparison ops torch requires the first argument to be a tensor 156 | continue 157 | r_res_torch_s = torch_op(y_s, x) 158 | r_res_dsc_s = dsc_op(y_s, x_dsc) 159 | assert all_close(res_dsc_s, res_torch_s, atol, rtol), f'Error testing ({x.shape} {op_name} {y_s}) dtype={dtype}' 160 | assert all_close(r_res_dsc_s, r_res_torch_s, atol, rtol), f'Error testing ({y_s} {op_name} {x.shape}) dtype={dtype}' 161 | 162 | def test_unary(self): 163 | ops = { 164 | 'sin': (torch.sin, dsc.sin), 165 | 'cos': (torch.cos, dsc.cos), 166 | 'tanh': (torch.tanh, dsc.tanh), 167 | 'exp': (torch.exp, dsc.exp), 168 | 'sqrt': (torch.sqrt, dsc.sqrt), 169 | } 170 | for op_name in ops.keys(): 171 | torch_op, dsc_op = ops[op_name] 172 | for dtype in DTYPES: 173 | print(f'Testing {op_name} with {dtype}') 174 | x, x_dsc = random_nd([randint(1, 10) for _ in range(4)], dtype=dtype) 175 | 176 | res_torch = torch_op(x) 177 | res_dsc = dsc_op(x_dsc) 178 | 179 | assert all_close(res_dsc, res_torch), f'Error testing {op_name} shape={x.shape} dtype={dtype}' 180 | 181 | def test_unary_axis(self): 182 | ops = { 183 | 'sum': (torch.sum, dsc.sum), 184 | 'mean': (torch.mean, dsc.mean), 185 | 'var': (torch.var, dsc.var), 186 | 'max': (torch.amax, dsc.max), 187 | 'min': (torch.amin, dsc.min), 188 | } 189 | for op_name in ops.keys(): 190 | torch_op, dsc_op = ops[op_name] 191 | for dtype in DTYPES: 192 | for axis in range(-4, 4): 193 | rtol = 1e-4; atol = 1e-4 194 | params_torch = { 195 | 'dim': axis, 196 | 'keepdim': True 197 | } 198 | if op_name == 'mean' or op_name == 'var': 199 | if not is_float(dtype) or dtype == torch.bfloat16: 200 | continue 201 | atol = 1e-3; rtol = 1e-2 202 | if op_name == 'var': 203 | params_torch['correction'] = 0 204 | 205 | print(f'Testing {op_name} with {dtype} along axis {axis}') 206 | x, x_dsc = random_nd( 207 | [randint(1, 10) for _ 
in range(4)], dtype=dtype 208 | ) 209 | 210 | res_torch = torch_op(x, **params_torch) 211 | res_dsc = dsc_op(x_dsc, axis=axis, keepdims=True) 212 | assert all_close(res_dsc, res_torch, atol, rtol), f'Error testing {op_name} shape={x.shape} dtype={x.dtype} keepdims=True' 213 | 214 | params_torch['keepdim'] = False 215 | res_torch_2 = torch_op(x, **params_torch) 216 | res_dsc_2 = dsc_op(x_dsc, axis=axis, keepdims=False) 217 | assert all_close(res_dsc_2, res_torch_2, atol, rtol), f'Error testing {op_name} shape={x.shape} dtype={x.dtype} keepdims=False' 218 | 219 | def test_matmul(self): 220 | def _mnk() -> tuple[int, int, int]: 221 | return randint(50, 100), randint(50, 100), randint(50, 100) 222 | 223 | def _test_matmul(shape_a: List[int], shape_b: List[int], dt: torch.dtype): 224 | print(f'Testing {shape_a} @ {shape_b} on with {dt}') 225 | xa, xa_dsc = random_nd(shape_a, dtype=dt) 226 | xb, xb_dsc = random_nd(shape_b, dtype=dt) 227 | res = xa @ xb 228 | res_dsc = xa_dsc @ xb_dsc 229 | # TODO: it looks like BF16 has a lower precision, should check what torch actually does. For now fix the tolerance at 1% 230 | assert all_close(res_dsc, res, atol=1e-1, rtol=1e-2), f'Error testing {shape_a} @ {shape_b} with {dt}' 231 | 232 | for dtype in DTYPES: 233 | if not is_float(dtype): 234 | continue 235 | # 2D GEMM 236 | m, n, k = _mnk() 237 | _test_matmul([m, k], [k, n], dtype) 238 | # GEVM 239 | _test_matmul([1, k], [k, n], dtype) 240 | 241 | # Batched case 242 | for _ in range(5): 243 | batch_1, batch_2 = randint(2, 10), randint(2, 10) 244 | m, n, k = _mnk() 245 | _test_matmul([batch_1, batch_2, m, k], [batch_1, batch_2, k, n], dtype) 246 | 247 | # Batched case with broadcasting 248 | for batch_1 in range(1, 6): 249 | for batch_2 in range(1, 6): 250 | m, n, k = _mnk() 251 | _test_matmul([batch_1 if batch_1%2 == 0 else 1, 252 | batch_2 if batch_2%2 == 0 else 1, m, k], 253 | [batch_1 if batch_1%2 == 1 else 1, 254 | batch_2 if batch_2%2 == 1 else 1, k, n], 255 | dtype) 256 | def test_outer(self): 257 | for dtype in DTYPES: 258 | for _ in range(10): 259 | xa, xa_dsc = random_nd([randint(2, 50)], dtype) 260 | xb, xb_dsc = random_nd([randint(2, 50)], dtype) 261 | 262 | out = torch.outer(xa, xb) 263 | out_dsc = dsc.outer(xa_dsc, xb_dsc) 264 | assert all_close(out_dsc, out) 265 | -------------------------------------------------------------------------------- /examples/models/qwen2_5.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 
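# Qwen 2.5 uses grouped-query attention: with the default Config below (num_attention_heads=14,
# num_key_value_heads=2) each key/value head is shared by 7 query heads, so _repeat_kv expands the
# K/V tensors by n_rep = num_heads // num_kv_heads before the attention matmuls in Attention.forward.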
6 | 7 | 8 | import dsc 9 | import dsc.nn as nn 10 | import dsc.nn.functional as F 11 | from dataclasses import dataclass 12 | from time import perf_counter 13 | import argparse 14 | from transformers import AutoTokenizer 15 | from typing import Tuple, Optional, List 16 | import math 17 | import numpy as np 18 | 19 | 20 | CacheEntry = Tuple[dsc.Tensor, dsc.Tensor] 21 | Cache = List[CacheEntry] 22 | 23 | 24 | # Default config for Qwen 2.5 0.5B 25 | @dataclass 26 | class Config: 27 | vocab_size: int = 151936 28 | hidden_size: int = 896 29 | intermediate_size: int = 4864 30 | num_hidden_layers: int = 24 31 | num_attention_heads: int = 14 32 | num_key_value_heads: int = 2 33 | max_position_embeddings: int = 1024 34 | rms_norm_eps: float = 1e-6 35 | tie_word_embeddings: bool = True 36 | rope_theta: float = 1000000.0 37 | sliding_window: int = 4096 38 | max_window_layers: int = 28 39 | bos_token_id: int = 151643 40 | eos_token_id: int = 151645 41 | 42 | 43 | class MLP(nn.Module): 44 | def __init__(self, config: Config, dtype: dsc.Dtype = dsc.f32): 45 | super().__init__() 46 | self.hidden_size = config.hidden_size 47 | self.intermediate_size = config.intermediate_size 48 | self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False, dtype=dtype) 49 | self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False, dtype=dtype) 50 | self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False, dtype=dtype) 51 | 52 | @dsc.trace('MLP') 53 | def forward(self, x: dsc.Tensor) -> dsc.Tensor: 54 | return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x)) 55 | 56 | 57 | def _pre_compute_freqs(dim: int, theta: float, max_seq_len: int, dtype: dsc.Dtype = dsc.f32) -> Tuple[dsc.Tensor, dsc.Tensor]: 58 | freqs = 1.0 / (theta ** ((dsc.arange(start=0, stop=dim, step=2)[: (dim // 2)]).cast(dtype) / dim)) 59 | t = dsc.arange(stop=max_seq_len, dtype=dtype) 60 | freqs = dsc.outer(t, freqs) 61 | cos_cache_half = dsc.cos(freqs) 62 | sin_cache_half = dsc.sin(freqs) 63 | 64 | cos_cache = dsc.concat([cos_cache_half, cos_cache_half], axis=-1) 65 | sin_cache = dsc.concat([sin_cache_half, sin_cache_half], axis=-1) 66 | return cos_cache, sin_cache 67 | 68 | 69 | def _rotate_half(x: dsc.Tensor) -> dsc.Tensor: 70 | lim = x.size(-1) // 2 71 | x1 = x[:, :, :, :lim] 72 | x2 = x[:, :, :, lim:] 73 | return dsc.concat([-x2, x1], axis=-1) 74 | 75 | 76 | @dsc.trace('RoPE') 77 | def _apply_rope(q: dsc.Tensor, k: dsc.Tensor, 78 | freq_cos: dsc.Tensor, 79 | freq_sin: dsc.Tensor, 80 | position_ids: dsc.Tensor) -> Tuple[dsc.Tensor, dsc.Tensor]: 81 | cos = freq_cos[position_ids] 82 | sin = freq_sin[position_ids] 83 | 84 | batch_size, seq_len, head_size = cos.shape 85 | 86 | cos = cos.reshape(batch_size, 1, seq_len, head_size) 87 | sin = sin.reshape(batch_size, 1, seq_len, head_size) 88 | 89 | q_embed = (q * cos) + (_rotate_half(q) * sin) 90 | k_embed = (k * cos) + (_rotate_half(k) * sin) 91 | return q_embed, k_embed 92 | 93 | 94 | def _repeat_kv(x: dsc.Tensor, n_rep: int) -> dsc.Tensor: 95 | if n_rep == 1: 96 | return x 97 | return dsc.repeat(x, n_rep, axis=1) 98 | 99 | 100 | class Attention(nn.Module): 101 | def __init__(self, config: Config, dtype: dsc.Dtype = dsc.f32): 102 | super().__init__() 103 | self.head_size = config.hidden_size // config.num_attention_heads 104 | self.num_heads = config.num_attention_heads 105 | self.num_kv_heads = config.num_key_value_heads 106 | self.n_rep = self.num_heads // self.num_kv_heads 107 | self.sliding_window = config.sliding_window 108 | 
self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_size, dtype=dtype) 109 | self.k_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.head_size, dtype=dtype) 110 | self.v_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.head_size, dtype=dtype) 111 | self.o_proj = nn.Linear(self.num_heads * self.head_size, config.hidden_size, bias=False, dtype=dtype) 112 | 113 | @dsc.trace('Attention') 114 | def forward( 115 | self, x: dsc.Tensor, 116 | freq_cos_cache: dsc.Tensor, 117 | freq_sin_cache: dsc.Tensor, 118 | position_ids: dsc.Tensor, 119 | past_key_value: Optional[CacheEntry] = None 120 | ) -> Tuple[dsc.Tensor, CacheEntry]: 121 | 122 | block_size, seq_len, _ = x.shape 123 | q, k_cur, v_cur = self.q_proj(x), self.k_proj(x), self.v_proj(x) 124 | 125 | q = q.reshape(block_size, seq_len, self.num_heads, self.head_size).transpose((0, 2, 1, 3)) 126 | k_cur = k_cur.reshape(block_size, seq_len, self.num_kv_heads, self.head_size).transpose((0, 2, 1, 3)) 127 | v_cur = v_cur.reshape(block_size, seq_len, self.num_kv_heads, self.head_size).transpose((0, 2, 3, 1)) 128 | 129 | q, k_cur = _apply_rope(q, k_cur, freq_cos_cache, freq_sin_cache, position_ids) 130 | 131 | if past_key_value is not None: 132 | past_k, past_v = past_key_value 133 | k = dsc.concat([past_k, k_cur], axis=2) 134 | v = dsc.concat([past_v, v_cur], axis=3) 135 | else: 136 | k = k_cur 137 | v = v_cur 138 | 139 | present_key_value = (k, v) 140 | 141 | k = _repeat_kv(k, self.n_rep) 142 | v = _repeat_kv(v, self.n_rep) 143 | 144 | scores = dsc.matmul(q, k, trans_b=True) * (1.0 / math.sqrt(self.head_size)) 145 | 146 | q_len = q.size(2) 147 | k_len = k.size(2) 148 | 149 | # SWA 150 | k_pos_indices = dsc.arange(k_len).reshape(1, -1) 151 | q_pos_indices = dsc.arange(start=(k_len - q_len), stop=k_len).reshape(-1, 1) 152 | causal_mask = k_pos_indices <= q_pos_indices # shape (q_len, k_len) 153 | window_mask = (q_pos_indices - k_pos_indices) < self.sliding_window 154 | 155 | should_attend = causal_mask * window_mask # shape (q_len, k_len) 156 | 157 | additive_mask = dsc.where( 158 | should_attend, 159 | 0.0, 160 | float('-inf') 161 | ).reshape(1, 1, q_len, k_len).cast(scores.dtype) 162 | masked_scores = scores + additive_mask 163 | 164 | attn_weights = F.softmax(masked_scores, axis=-1) 165 | out = dsc.matmul(attn_weights, v, trans_b=True).transpose((0, 2, 1, 3)).reshape(block_size, seq_len, -1) 166 | 167 | return self.o_proj(out), present_key_value 168 | 169 | 170 | class DecoderLayer(nn.Module): 171 | def __init__(self, config: Config, dtype: dsc.Dtype = dsc.f32): 172 | super().__init__() 173 | self.self_attn = Attention(config, dtype=dtype) 174 | self.mlp = MLP(config, dtype=dtype) 175 | self.input_layernorm = nn.RMSNorm(config.hidden_size, epsilon=config.rms_norm_eps, dtype=dtype) 176 | self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, epsilon=config.rms_norm_eps, dtype=dtype) 177 | 178 | @dsc.trace('DecoderLayer') 179 | def forward( 180 | self, 181 | x: dsc.Tensor, 182 | freq_cos: dsc.Tensor, 183 | freq_sin: dsc.Tensor, 184 | position_ids: dsc.Tensor, 185 | past_key_value: Optional[CacheEntry] = None 186 | ) -> Tuple[dsc.Tensor, CacheEntry]: 187 | 188 | ln_out = self.input_layernorm(x) 189 | attn_out, present_kv = self.self_attn(ln_out, freq_cos, freq_sin, position_ids, past_key_value) 190 | h = x + attn_out 191 | return h + self.mlp(self.post_attention_layernorm(h)), present_kv 192 | 193 | 194 | class Qwen25Model(nn.Module): 195 | def __init__(self, config: Config, dtype: dsc.Dtype = dsc.f32): 
196 | super().__init__() 197 | self.config = config 198 | self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, dtype=dtype) 199 | self.layers = nn.ModuleList([DecoderLayer(config, dtype=dtype) for _ in range(config.num_hidden_layers)]) 200 | self.norm = nn.RMSNorm(config.hidden_size, config.rms_norm_eps, dtype=dtype) 201 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False, dtype=dtype) 202 | cos_cache, sin_cache = _pre_compute_freqs(config.hidden_size // config.num_attention_heads, config.rope_theta, config.max_position_embeddings, dtype=dtype) 203 | self.cos_cache = cos_cache 204 | self.sin_cache = sin_cache 205 | 206 | 207 | @staticmethod 208 | def from_pretrained(config: Config = Config(), dtype: dsc.Dtype = dsc.f32) -> 'Qwen25Model': 209 | state_dict = nn.safe_load('https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct/resolve/main/model.safetensors', 210 | trim_prefix='model.', 211 | use_dtype=dtype) 212 | model = Qwen25Model(config, dtype) 213 | model.from_state(state_dict, 214 | tied={'lm_head.weight': 'embed_tokens.weight'}) 215 | del state_dict 216 | dsc.print_mem_usage() 217 | return model 218 | 219 | @dsc.trace('Qwen2_5') 220 | def forward(self, 221 | x: dsc.Tensor, 222 | position_ids: dsc.Tensor, 223 | past_key_values: Optional[Cache] = None, 224 | use_cache: bool = True 225 | ) -> Tuple[dsc.Tensor, Optional[Cache]]: 226 | h = self.embed_tokens(x.to('cpu')) 227 | 228 | next_kv_caches = [] if use_cache else None 229 | for i, layer in enumerate(self.layers): 230 | layer_cache = past_key_values[i] if past_key_values is not None else None 231 | h, present_kv = layer( 232 | h, 233 | self.cos_cache, 234 | self.sin_cache, 235 | position_ids=position_ids, 236 | past_key_value=layer_cache 237 | ) 238 | if use_cache: 239 | next_kv_caches.append(present_kv) 240 | 241 | h = self.norm(h) 242 | return self.lm_head(h), next_kv_caches 243 | 244 | def generate(self, idx: dsc.Tensor, tokenizer, max_new_tokens: int, top_k: int = 10): 245 | prompt_processing_start = perf_counter() 246 | prompt_tokens = idx.reshape(1, -1) 247 | prompt_len = prompt_tokens.size(1) 248 | 249 | prompt_position_ids = dsc.arange(stop=prompt_len, device='cpu').reshape(1, -1) 250 | # Run forward without caching 251 | logits, past_key_values = self(prompt_tokens, position_ids=prompt_position_ids, past_key_values=None) 252 | next_token_logits = logits[:, -1, :] 253 | generated_tokens = [] 254 | current_len = prompt_len 255 | prompt_processing_ms = (perf_counter() - prompt_processing_start) * 1e3 256 | 257 | # Loop 258 | generation_start = perf_counter() 259 | for _ in range(max_new_tokens): 260 | k_th_value = dsc.kth(next_token_logits.reshape(-1), top_k) 261 | next_token_logits = next_token_logits.masked_fill(next_token_logits < k_th_value, float('-inf')) 262 | probs = F.softmax(next_token_logits, axis=-1) 263 | 264 | next_token_id = dsc.multinomial(probs, num_samples=1) 265 | tok_id_scalar = next_token_id[0, 0] 266 | if tok_id_scalar == self.config.eos_token_id: 267 | print('\n[EOS]', flush=True) 268 | break 269 | 270 | generated_tokens.append(tok_id_scalar) 271 | print(tokenizer.decode(tok_id_scalar, skip_special_tokens=True), end='', flush=True) 272 | 273 | input_ids = next_token_id 274 | position_ids = dsc.tensor([current_len], dtype=dsc.i32, device='cpu').reshape(1, -1) 275 | 276 | # Run forward with caching 277 | logits, next_past_key_values = self( 278 | input_ids, 279 | position_ids=position_ids, 280 | past_key_values=past_key_values 281 | ) 282 | past_key_values = 
next_past_key_values 283 | next_token_logits = logits[:, -1, :] # Note: this is probably useless 284 | current_len += 1 285 | 286 | generation_stop = perf_counter() 287 | total_processing_ms = (generation_stop - prompt_processing_start) * 1e3 288 | generation_processing_ms = (generation_stop - generation_start) * 1e3 289 | print() 290 | 291 | print(f'prompt processing time\t= {round(prompt_processing_ms, 1)}ms') 292 | print(f'generation time\t\t= {round(generation_processing_ms, 1)} ms | {round(generation_processing_ms / max_new_tokens, 2)} ms/tok') 293 | print(f'total time\t\t= {round(total_processing_ms, 1)} ms | {round(max_new_tokens / (total_processing_ms / 1e3), 2)} tok/s') 294 | return generated_tokens 295 | 296 | 297 | if __name__ == '__main__': 298 | cli = argparse.ArgumentParser(description='QWEN 2.5 inference CLI') 299 | cli.add_argument('prompt', type=str, help='Model prompt') 300 | cli.add_argument('-n', type=int, default=100, help='Tokens to generate (default=100)') 301 | cli.add_argument('-top-k', type=int, default=10, help='Top K sampling (default=10)') 302 | cli.add_argument('--device', choices=['cpu', 'gpu'], default='cpu', help='Device on which to run the model') 303 | cli.add_argument('--dtype', choices=['f32', 'bf16'], default='f32', help='Dtype to use for inference') 304 | 305 | args = cli.parse_args() 306 | 307 | dsc.set_default_device(args.device) 308 | prompt = args.prompt 309 | max_tokens = args.n 310 | top_k = args.top_k 311 | dtype = dsc.f32 312 | if args.dtype == 'bf16': 313 | dtype = dsc.bf16 314 | 315 | print(f'Running model on {args.device} using {dtype}') 316 | 317 | model = Qwen25Model.from_pretrained(dtype=dtype) 318 | tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-Coder-0.5B-Instruct') 319 | messages = [ 320 | {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."}, 321 | {"role": "user", "content": prompt} 322 | ] 323 | tokens = tokenizer.apply_chat_template( 324 | messages, 325 | tokenize=False, 326 | add_generation_prompt=True, 327 | ) 328 | model_inputs = tokenizer([tokens], return_tensors="np") 329 | 330 | model_input_ids = dsc.from_numpy(model_inputs.input_ids.astype(np.int32), device='cpu') 331 | 332 | model.generate(model_input_ids, tokenizer, max_new_tokens=max_tokens, top_k=top_k) 333 | -------------------------------------------------------------------------------- /dsc/include/dsc.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | // =============================================================== // 10 | // =========================== Notepad =========================== // 11 | // =============================================================== // 12 | // (1) It's probably a good idea to add a struct for the arguments // 13 | // of dsc_new_tensor, mixing new params with defaults can lead // 14 | // to nasty bugs // 15 | // (2) Create a macro to validate tensors? It's probably a good // 16 | // idea to always check by defaults both tensor != nullptr and // 17 | // tensor->buf != nullptr // 18 | // (3) Sometimes the context in Python is freed before all the // 19 | // associated tensors are freed. This will SEGFAULT! 
It makes // 20 | // sense to just not free the context in Python for now // 21 | // (4) Evaluate the iterator approach (check codegen with godbolt) // 22 | // (5) Scratch buffer to allocate temporary results on a device // 23 | // (6) Use the same approach to pass shape as `full` // 24 | // =============================================================== // 25 | 26 | #include // getenv, atoi 27 | #include 28 | #include "dsc_dtype.h" 29 | 30 | 31 | #if !defined(DSC_MAX_OBJS) 32 | # define DSC_MAX_OBJS ((int) 1'000) 33 | #endif 34 | 35 | #define DSC_MAX_DEVICES ((int) 2) 36 | #define DSC_DEFAULT_DEVICE CPU 37 | #define DSC_COMPARISON_OPS ((int) 6) 38 | #define DSC_TRACE_NAME_MAX ((int) 32) 39 | #define DSC_TRACE_CAT_MAX ((int) 16) 40 | 41 | #if !defined(DSC_MAX_TRACES_PER_CHUNK) 42 | # define DSC_MAX_TRACES_PER_CHUNK 1'000'000 43 | #endif 44 | 45 | #if !defined(DSC_MAX_CHUNKS) 46 | # define DSC_MAX_CHUNKS 100 47 | #endif 48 | 49 | 50 | static_assert(DSC_MAX_DEVICES == 2, "DSC_MAX_DEVICES != 2 - update the code"); 51 | static_assert(DSC_COMPARISON_OPS == 6, "DSC_COMPARISON_OPS != 6 - update the code"); 52 | 53 | #define DSC_ASSERT(x) \ 54 | do { \ 55 | if (!(x)) { \ 56 | fprintf(stderr, "DSC_ASSERT: %s:%d %s\n", __FILE__, __LINE__, #x); \ 57 | exit(EXIT_FAILURE); \ 58 | } \ 59 | } while(0) 60 | 61 | #define DSC_LOG_FATAL(format, ...) \ 62 | do { \ 63 | fprintf(stderr, "[FATAL] %s: " format "\n", __func__, ##__VA_ARGS__); \ 64 | exit(EXIT_FAILURE); \ 65 | } while (0) 66 | 67 | #if DSC_LOG_LEVEL >= 3 68 | # define DSC_LOG_DEBUG(format, ...) ((void) 0) 69 | # define DSC_LOG_INFO(format, ...) ((void) 0) 70 | # define DSC_LOG_ERR(format, ...) ((void) 0) 71 | #elif DSC_LOG_LEVEL >= 2 72 | # define DSC_LOG_DEBUG(format, ...) ((void) 0) 73 | # define DSC_LOG_INFO(format, ...) ((void) 0) 74 | # define DSC_LOG_ERR(format, ...) fprintf(stderr, "[ERROR] %s: " format"\n",__func__, ##__VA_ARGS__) 75 | #elif DSC_LOG_LEVEL >= 1 76 | # define DSC_LOG_DEBUG(format, ...) ((void) 0) 77 | # define DSC_LOG_INFO(format, ...) fprintf(stdout, "[INFO ] %s: " format"\n",__func__, ##__VA_ARGS__) 78 | # define DSC_LOG_ERR(format, ...) fprintf(stderr, "[ERROR] %s: " format"\n",__func__, ##__VA_ARGS__) 79 | #else 80 | # define DSC_LOG_DEBUG(format, ...) fprintf(stdout, "[DEBUG] %s: " format"\n",__func__, ##__VA_ARGS__) 81 | # define DSC_LOG_INFO(format, ...) fprintf(stdout, "[INFO ] %s: " format"\n",__func__, ##__VA_ARGS__) 82 | # define DSC_LOG_ERR(format, ...) fprintf(stderr, "[ERROR] %s: " format"\n",__func__, ##__VA_ARGS__) 83 | #endif 84 | 85 | #define DSC_INVALID_CASE(format, ...) \ 86 | default: \ 87 | DSC_LOG_FATAL(format, ##__VA_ARGS__) 88 | 89 | #define DSC_UNUSED(x) ((void) (x)) 90 | // Compute the next value of X aligned to Y 91 | #define DSC_ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1)) 92 | #define DSC_MAX(x, y) ((x) > (y) ? (x) : (y)) 93 | #define DSC_MIN(x, y) ((x) < (y) ? (x) : (y)) 94 | #define DSC_CEIL(x, y) (((x) + ((y) - 1)) / (y)) 95 | #define DSC_B_TO_KB(b) ((f64) (b) / 1024.) 96 | #define DSC_B_TO_MB(b) ((f64) (b) / (1024. 
* 1024.)) 97 | #define DSC_GB(gb) ((usize) ((gb) * 1024ULL * 1024ULL * 1024ULL)) 98 | #define DSC_MB(mb) ((usize) ((mb) * 1024ULL * 1024ULL)) 99 | #define DSC_KB(kb) ((usize) ((kb) * 1024ULL)) 100 | 101 | // A 'strictly pure' function is one whose return value depends only on its arguments: it must not read 102 | // global variables that may change between calls, and it must not read data through its pointer 103 | // parameters (only the pointer values themselves count as arguments). 104 | // A 'pure' function relaxes this: it may also read global state and the data reachable through its 105 | // pointer parameters, even if those values change between invocations, as long as it has no observable 106 | // side effects other than computing its return value. 107 | #if defined(__NVCC__) 108 | # define DSC_INLINE __forceinline__ 109 | # define DSC_STRICTLY_PURE __attribute__((const)) 110 | # define DSC_PURE __attribute__((pure)) 111 | #elif defined(__HIPCC__) 112 | # define DSC_INLINE __forceinline__ 113 | # define DSC_STRICTLY_PURE __attribute__((const)) 114 | # define DSC_PURE __attribute__((pure)) 115 | #elif defined(__GNUC__) 116 | # define DSC_INLINE inline __attribute__((always_inline)) 117 | # define DSC_STRICTLY_PURE __attribute__((const)) 118 | # define DSC_PURE __attribute__((pure)) 119 | #else 120 | # define DSC_INLINE inline 121 | # define DSC_STRICTLY_PURE 122 | # define DSC_PURE 123 | #endif 124 | 125 | #define DSC_RESTRICT __restrict 126 | 127 | #if !defined(DSC_MAX_DIMS) 128 | # define DSC_MAX_DIMS ((int) 4) 129 | #endif 130 | 131 | static_assert(DSC_MAX_DIMS == 4, "DSC_MAX_DIMS != 4 - update the code"); 132 | 133 | #define DSC_VALUE_NONE INT32_MAX 134 | #define DSC_DATA_ALIAS(T, X) T *X##_data = (T *) (X)->buf->data 135 | #define DSC_DATA(T, X) T *DSC_RESTRICT X##_data = (T *) (X)->buf->data 136 | 137 | #define dsc_tensor_dim_idx(X, dim) (((dim) < 0) ? (DSC_MAX_DIMS + (dim)) : (DSC_MAX_DIMS - (X)->n_dim + (dim))) 138 | // Note: dsc_tensor_get_dim() MUST NOT be used with the result of dsc_tensor_dim_idx()!
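// (dsc_tensor_dim_idx() already returns an absolute index into the DSC_MAX_DIMS-wide shape array: with n_dim == 2, both dim == 1 and dim == -1 map to index 3, so feeding that result back through dsc_tensor_get_dim() would remap it again, e.g. 4 - 2 + 3 = 5, past the end of shape.)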
139 | #define dsc_tensor_get_dim(X, dim) ((X)->shape[dsc_tensor_dim_idx((X), (dim))]) 140 | #define dsc_tensor_get_stride(X, dim) ((X)->stride[dsc_tensor_dim_idx((X), (dim))]) 141 | #define dsc_new_like(CTX, X) (dsc_new_tensor((CTX), (X)->n_dim, &dsc_tensor_get_dim(X, 0), (X)->dtype, (X)->device)) 142 | #define dsc_copy_of(CTX, X, dev) (dsc_new_tensor((CTX), (X)->n_dim, &dsc_tensor_get_dim(X, 0), (X)->dtype, (dev), nullptr, false, (X)->buf->data, (X)->device)) 143 | #define dsc_new_view(CTX, X) (dsc_new_tensor((CTX), (X)->n_dim, &dsc_tensor_get_dim(X, 0), (X)->dtype, (X)->device, (X)->buf)) 144 | #define dsc_for(idx, X) for (int idx = 0; idx < (X)->ne; ++idx) 145 | #define dsc_is_scalar(X) (X)->ne == 1 146 | 147 | #if defined(__cplusplus) 148 | extern "C" { 149 | #endif 150 | 151 | struct dsc_data_buffer; 152 | struct dsc_trace_ctx; 153 | struct dsc_device; 154 | 155 | enum dsc_device_type : i8 { 156 | DEFAULT = -1, 157 | CPU, 158 | GPU, 159 | }; 160 | 161 | enum dsc_gpu_platform : i8 { 162 | NONE = -1, 163 | CUDA, 164 | ROCM, 165 | }; 166 | 167 | static constexpr const char *DSC_DEVICE_NAMES[DSC_MAX_DEVICES] = { 168 | "CPU", 169 | "GPU", 170 | }; 171 | 172 | static constexpr const char *DSC_GPU_PLATFORM_NAMES[2] = { 173 | "CUDA", 174 | "ROCm", 175 | }; 176 | 177 | enum dsc_comparison_op : u8 { 178 | EQ, 179 | NE, 180 | LT, 181 | LE, 182 | GT, 183 | GE 184 | }; 185 | 186 | struct dsc_tensor { 187 | // The shape of this tensor, right-aligned. For example a 1D tensor T of 4 elements 188 | // will have dim = [1, 1, 1, 4]. 189 | int shape[DSC_MAX_DIMS]; 190 | // Stride for a given dimension expressed in number of elements. 191 | int stride[DSC_MAX_DIMS]; 192 | dsc_data_buffer *buf; 193 | int ne; 194 | int n_dim; 195 | dsc_dtype dtype; 196 | dsc_device_type device; 197 | }; 198 | 199 | struct dsc_ctx { 200 | dsc_device *devices[DSC_MAX_DEVICES]; 201 | dsc_tensor *tensors; 202 | dsc_device_type default_device; 203 | }; 204 | 205 | struct dsc_slice { 206 | union { 207 | int d[3]; 208 | struct { 209 | int start, stop, step; 210 | }; 211 | }; 212 | }; 213 | 214 | // ============================================================ 215 | // Helpers 216 | 217 | static DSC_INLINE int dsc_get_env(const char *env, int value = 0) { 218 | if (const char *str = std::getenv(env)) { 219 | value = std::atoi(str); 220 | } 221 | 222 | return value; 223 | } 224 | 225 | // ============================================================ 226 | // Initialization 227 | 228 | extern dsc_ctx *dsc_ctx_init(usize mem_size); 229 | 230 | // ============================================================ 231 | // Cleanup/Teardown 232 | 233 | extern void dsc_ctx_free(dsc_ctx *ctx); 234 | 235 | extern void dsc_tensor_free(dsc_ctx *ctx, dsc_tensor *x); 236 | 237 | // ============================================================ 238 | // Utilities 239 | 240 | extern usize dsc_used_mem(dsc_ctx *ctx); 241 | 242 | extern void dsc_print_mem_usage(dsc_ctx *ctx); 243 | 244 | extern void dsc_set_default_device(dsc_ctx *ctx, dsc_device_type device); 245 | 246 | // ============================================================ 247 | // GPU Utilities 248 | 249 | extern dsc_gpu_platform dsc_get_gpu_platform(dsc_ctx *ctx); 250 | 251 | extern void dsc_gpu_set_device(dsc_ctx *ctx, int device); 252 | 253 | extern bool dsc_gpu_available(dsc_ctx *); 254 | 255 | extern int dsc_gpu_devices(dsc_ctx *); 256 | 257 | extern int dsc_gpu_dev_capability(dsc_ctx *, int device); 258 | 259 | extern usize dsc_gpu_dev_mem(dsc_ctx *, int device); 260 | 261 | extern void 
dsc_gpu_sync(dsc_ctx *); 262 | 263 | extern bool dsc_gpu_has_bf16(dsc_ctx *); 264 | 265 | // ============================================================ 266 | // Tracing 267 | 268 | extern bool dsc_tracing_enabled(); 269 | 270 | extern void dsc_insert_trace(dsc_ctx *ctx, 271 | const char *name, 272 | u64 start, 273 | u64 duration); 274 | 275 | extern void dsc_dump_traces(dsc_ctx *ctx); 276 | 277 | // ============================================================ 278 | // Tensor Creation 279 | 280 | extern void dsc_tensor_set_buffer(dsc_ctx *, 281 | dsc_tensor *DSC_RESTRICT x, 282 | dsc_data_buffer *buf); 283 | 284 | // TODO: (1) (2) 285 | extern dsc_tensor *dsc_new_tensor(dsc_ctx *ctx, 286 | int n_dim, 287 | const int *shape, 288 | dsc_dtype dtype, 289 | dsc_device_type device = DEFAULT, 290 | dsc_data_buffer *buf = nullptr, 291 | bool lazy = false, 292 | const void *DSC_RESTRICT data = nullptr, 293 | dsc_device_type data_device = DEFAULT); 294 | 295 | extern dsc_tensor *dsc_view(dsc_ctx *ctx, 296 | const dsc_tensor *x); 297 | 298 | extern dsc_tensor *dsc_tensor_1d(dsc_ctx *ctx, 299 | dsc_dtype dtype, 300 | int dim1, 301 | dsc_device_type device = DEFAULT, 302 | const void *DSC_RESTRICT data = nullptr, 303 | dsc_device_type data_device = DEFAULT); 304 | 305 | extern dsc_tensor *dsc_tensor_2d(dsc_ctx *ctx, 306 | dsc_dtype dtype, 307 | int dim1, int dim2, 308 | dsc_device_type device = DEFAULT, 309 | const void *DSC_RESTRICT data = nullptr, 310 | dsc_device_type data_device = DEFAULT); 311 | 312 | extern dsc_tensor *dsc_tensor_3d(dsc_ctx *ctx, 313 | dsc_dtype dtype, 314 | int dim1, int dim2, 315 | int dim3, 316 | dsc_device_type device = DEFAULT, 317 | const void *DSC_RESTRICT data = nullptr, 318 | dsc_device_type data_device = DEFAULT); 319 | 320 | extern dsc_tensor *dsc_tensor_4d(dsc_ctx *ctx, 321 | dsc_dtype dtype, 322 | int dim1, int dim2, 323 | int dim3, int dim4, 324 | dsc_device_type device = DEFAULT, 325 | const void *DSC_RESTRICT data = nullptr, 326 | dsc_device_type data_device = DEFAULT); 327 | 328 | extern dsc_tensor *dsc_wrap_bool(dsc_ctx *ctx, 329 | bool val, 330 | dsc_device_type device = DEFAULT); 331 | 332 | extern dsc_tensor *dsc_wrap_i32(dsc_ctx *ctx, 333 | i32 val, 334 | dsc_device_type device = DEFAULT); 335 | 336 | extern dsc_tensor *dsc_wrap_f32(dsc_ctx *ctx, 337 | f32 val, 338 | dsc_device_type device = DEFAULT, 339 | bool as_bf16 = false); 340 | 341 | extern dsc_tensor *dsc_wrap_f64(dsc_ctx *ctx, 342 | f64 val, 343 | dsc_device_type device = DEFAULT); 344 | 345 | extern dsc_tensor *dsc_arange(dsc_ctx *ctx, 346 | f64 stop, 347 | f64 start = 0, 348 | f64 step = 1, 349 | dsc_dtype dtype = I32, 350 | dsc_device_type device = DEFAULT); 351 | 352 | extern dsc_tensor *dsc_repeat(dsc_ctx *ctx, 353 | const dsc_tensor *DSC_RESTRICT x, 354 | int repeats, 355 | int axis = -1); 356 | 357 | extern dsc_tensor *dsc_randn(dsc_ctx *ctx, 358 | int n_dim, 359 | const int *shape, 360 | dsc_dtype dtype = DSC_DEFAULT_TYPE, 361 | dsc_device_type device = DEFAULT); 362 | 363 | extern dsc_tensor *dsc_kth(dsc_ctx *ctx, 364 | const dsc_tensor *DSC_RESTRICT x, 365 | int k); 366 | 367 | extern dsc_tensor *dsc_multinomial(dsc_ctx *ctx, 368 | const dsc_tensor *DSC_RESTRICT x, 369 | int num_samples); 370 | 371 | extern dsc_tensor *dsc_cast(dsc_ctx *ctx, 372 | dsc_tensor *DSC_RESTRICT x, 373 | dsc_dtype new_dtype); 374 | 375 | // ============================================================ 376 | // Tensor Manipulation 377 | 378 | extern void dsc_copy(dsc_ctx *ctx, 379 | dsc_tensor *DSC_RESTRICT x, 380 
| void *DSC_RESTRICT data, 381 | usize nb, 382 | dsc_device_type data_device = DEFAULT); 383 | 384 | extern dsc_tensor *dsc_to(dsc_ctx *ctx, 385 | dsc_tensor *DSC_RESTRICT x, 386 | dsc_device_type new_device); 387 | 388 | extern dsc_tensor *dsc_reshape(dsc_ctx *ctx, 389 | const dsc_tensor *DSC_RESTRICT x, 390 | int dimensions...); 391 | 392 | extern dsc_tensor *dsc_concat(dsc_ctx *ctx, 393 | int axis, 394 | int tensors...); 395 | 396 | extern dsc_tensor *dsc_transpose(dsc_ctx *ctx, 397 | const dsc_tensor *DSC_RESTRICT x, 398 | int axes...); 399 | 400 | extern dsc_tensor *dsc_tril(dsc_ctx *ctx, 401 | const dsc_tensor *DSC_RESTRICT x, 402 | int diagonal = 0, 403 | dsc_tensor *DSC_RESTRICT out = nullptr); 404 | 405 | // ============================================================ 406 | // Indexing and Slicing 407 | // 408 | // All indexing and slicing operations will return a new tensor. 409 | // If the number of indexes passed to dsc_tensor_get_idx is equal to the number of 410 | // dimensions of x then a new tensor will be allocated with a single element, 411 | // the caller must take care of unwrapping it if needed. 412 | extern dsc_tensor *dsc_tensor_get_idx(dsc_ctx *ctx, 413 | const dsc_tensor *DSC_RESTRICT x, 414 | int indexes...); 415 | 416 | extern dsc_tensor *dsc_tensor_get_slice(dsc_ctx *ctx, 417 | const dsc_tensor *DSC_RESTRICT x, 418 | int slices...); 419 | 420 | extern dsc_tensor *dsc_tensor_get_tensor(dsc_ctx *ctx, 421 | const dsc_tensor *DSC_RESTRICT x, 422 | const dsc_tensor *DSC_RESTRICT indexes); 423 | 424 | extern void dsc_tensor_set_idx(dsc_ctx *ctx, 425 | dsc_tensor *DSC_RESTRICT xa, 426 | const dsc_tensor *DSC_RESTRICT xb, 427 | int indexes...); 428 | 429 | extern void dsc_tensor_set_slice(dsc_ctx *ctx, 430 | dsc_tensor *DSC_RESTRICT xa, 431 | const dsc_tensor *DSC_RESTRICT xb, 432 | int slices...); 433 | 434 | // ============================================================ 435 | // Binary Operations 436 | 437 | extern dsc_tensor *dsc_add(dsc_ctx *ctx, 438 | dsc_tensor *xa, 439 | dsc_tensor *xb, 440 | dsc_tensor *out = nullptr); 441 | 442 | extern dsc_tensor *dsc_sub(dsc_ctx *ctx, 443 | dsc_tensor *xa, 444 | dsc_tensor *xb, 445 | dsc_tensor *out = nullptr); 446 | 447 | extern dsc_tensor *dsc_mul(dsc_ctx *ctx, 448 | dsc_tensor *xa, 449 | dsc_tensor *xb, 450 | dsc_tensor *out = nullptr); 451 | 452 | extern dsc_tensor *dsc_div(dsc_ctx *ctx, 453 | dsc_tensor *xa, 454 | dsc_tensor *xb, 455 | dsc_tensor *out = nullptr); 456 | 457 | extern dsc_tensor *dsc_pow(dsc_ctx *ctx, 458 | dsc_tensor *xa, 459 | dsc_tensor *xb, 460 | dsc_tensor *out = nullptr); 461 | 462 | extern dsc_tensor *dsc_matmul(dsc_ctx *ctx, 463 | dsc_tensor *DSC_RESTRICT xa, 464 | dsc_tensor *DSC_RESTRICT xb, 465 | bool trans_b = false, 466 | dsc_tensor *DSC_RESTRICT out = nullptr); 467 | 468 | extern dsc_tensor *dsc_compare(dsc_ctx *ctx, 469 | const dsc_tensor *xa, 470 | const dsc_tensor *xb, 471 | dsc_comparison_op comp, 472 | dsc_tensor *out = nullptr); 473 | 474 | extern void dsc_masked_fill(dsc_ctx *ctx, 475 | dsc_tensor *DSC_RESTRICT x, 476 | const dsc_tensor *DSC_RESTRICT mask, 477 | f64 value); 478 | 479 | extern dsc_tensor *dsc_outer(dsc_ctx *ctx, 480 | dsc_tensor *DSC_RESTRICT xa, 481 | dsc_tensor *DSC_RESTRICT xb, 482 | dsc_tensor *DSC_RESTRICT out = nullptr); 483 | 484 | extern dsc_tensor *dsc_where(dsc_ctx *ctx, 485 | const dsc_tensor *DSC_RESTRICT condition, 486 | const dsc_tensor *DSC_RESTRICT input, 487 | const dsc_tensor *DSC_RESTRICT other, 488 | dsc_tensor *DSC_RESTRICT out = nullptr); 489 | 490 
| // ============================================================ 491 | // Unary Operations 492 | 493 | extern dsc_tensor *dsc_cos(dsc_ctx *ctx, 494 | dsc_tensor *DSC_RESTRICT x, 495 | dsc_tensor *DSC_RESTRICT out = nullptr); 496 | 497 | extern dsc_tensor *dsc_sin(dsc_ctx *ctx, 498 | dsc_tensor *DSC_RESTRICT x, 499 | dsc_tensor *DSC_RESTRICT out = nullptr); 500 | 501 | extern dsc_tensor *dsc_tanh(dsc_ctx *ctx, 502 | dsc_tensor *DSC_RESTRICT x, 503 | dsc_tensor *DSC_RESTRICT out = nullptr); 504 | 505 | extern dsc_tensor *dsc_exp(dsc_ctx *ctx, 506 | dsc_tensor *DSC_RESTRICT x, 507 | dsc_tensor *DSC_RESTRICT out = nullptr); 508 | 509 | extern dsc_tensor *dsc_sqrt(dsc_ctx *ctx, 510 | dsc_tensor *DSC_RESTRICT x, 511 | dsc_tensor *DSC_RESTRICT out = nullptr); 512 | 513 | // ============================================================ 514 | // Unary Operations Along Axis 515 | 516 | extern dsc_tensor *dsc_sum(dsc_ctx *ctx, 517 | dsc_tensor *DSC_RESTRICT x, 518 | dsc_tensor *DSC_RESTRICT out = nullptr, 519 | int axis = -1, 520 | bool keep_dims = true); 521 | 522 | extern dsc_tensor *dsc_max(dsc_ctx *ctx, 523 | dsc_tensor *DSC_RESTRICT x, 524 | dsc_tensor *DSC_RESTRICT out = nullptr, 525 | int axis = -1, 526 | bool keep_dims = true); 527 | 528 | extern dsc_tensor *dsc_min(dsc_ctx *ctx, 529 | dsc_tensor *DSC_RESTRICT x, 530 | dsc_tensor *DSC_RESTRICT out = nullptr, 531 | int axis = -1, 532 | bool keep_dims = true); 533 | 534 | #if defined(__cplusplus) 535 | } 536 | #endif 537 | --------------------------------------------------------------------------------
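The header above exposes a plain, context-driven C-style API. Below is a minimal usage sketch composed only from declarations visible in dsc.h (dsc_ctx_init, dsc_randn, dsc_matmul, dsc_print_mem_usage, dsc_tensor_free, dsc_ctx_free) and the DSC_MB macro; it is not part of the repository, and the context size and reliance on the declared dtype/device defaults are assumptions for illustration.

#include "dsc.h"

int main() {
    // Reserve a fixed arena for tensor allocations (512 MB chosen arbitrarily here).
    dsc_ctx *ctx = dsc_ctx_init(DSC_MB(512));

    // Two random matrices using the declared defaults (DSC_DEFAULT_TYPE, default device).
    const int shape_a[] = {64, 128};
    const int shape_b[] = {128, 32};
    dsc_tensor *a = dsc_randn(ctx, 2, shape_a);
    dsc_tensor *b = dsc_randn(ctx, 2, shape_b);

    // (64, 128) @ (128, 32) -> (64, 32); trans_b and out keep their default values.
    dsc_tensor *c = dsc_matmul(ctx, a, b);

    dsc_print_mem_usage(ctx);

    dsc_tensor_free(ctx, a);
    dsc_tensor_free(ctx, b);
    dsc_tensor_free(ctx, c);
    dsc_ctx_free(ctx);
    return 0;
}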