├── python ├── dsc │ ├── py.typed │ ├── nn │ │ ├── functional.py │ │ ├── utils.py │ │ └── __init__.py │ ├── device.py │ ├── __init__.py │ ├── gpu │ │ └── __init__.py │ ├── context.py │ ├── dtype.py │ └── profiler.py └── tests │ ├── utils_cpu.py │ ├── test_ops_cpu.py │ ├── test_indexing.py │ ├── test_ops_common.py │ └── test_ops_gpu.py ├── docs └── logo.png ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ └── bug_report.md └── workflows │ └── tests.yml ├── .gitignore ├── requirements.txt ├── pyproject.toml ├── LICENSE ├── setup.py ├── dsc ├── include │ ├── cpu │ │ ├── dsc_blas.h │ │ ├── dsc_iter.h │ │ ├── dsc_ops.h │ │ ├── dsc_cpu.h │ │ └── dsc_tracing.h │ ├── dsc_dtype.h │ ├── gpu │ │ ├── platform │ │ │ ├── dsc_cuda_platform.h │ │ │ └── dsc_hip_platform.h │ │ ├── dsc_tracing.h │ │ ├── dsc_gpu.h │ │ └── dsc_ops.h │ ├── dsc_device.h │ └── dsc.h └── src │ ├── cpu │ └── dsc_device_cpu.cpp │ └── gpu │ └── dsc_device_gpu.cpp ├── .clang-format ├── .clang-tidy ├── IDEAS.md ├── Makefile ├── README.md └── examples └── models ├── gpt2.py └── qwen2_5.py /python/dsc/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nirw4nna/dsc/HEAD/docs/logo.png -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: nirw4nna 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .vscode/ 3 | 4 | venv/ 5 | 6 | *.egg-info* 7 | __pycache__/ 8 | .pytest*/ 9 | .ruff*/ 10 | 11 | # Traces 12 | *.json 13 | 14 | *.o 15 | *.so 16 | *.dll 17 | *.a -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Required by the dsc python library 2 | numpy 3 | psutil 4 | tqdm 5 | 6 | 7 | # Only used for testing and benchmarking 8 | matplotlib 9 | PyQt5 10 | pytest 11 | tabulate 12 | ruff 13 | pyright 14 | torch 15 | transformers # Tokenizers for examples/models -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report issue 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | Include code snippet: 17 | ```python 18 | 19 | ``` 20 | 21 | **Expected behavior** 22 | A clear and concise description of what you expected to happen. 23 | 24 | **Desktop (please complete the following information):** 25 | - OS: [e.g. Ubuntu-Linux] 26 | - Version [e.g. 0.1] 27 | 28 | **Additional context** 29 | Add any other context about the problem here. 
30 | -------------------------------------------------------------------------------- /python/dsc/nn/functional.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | 7 | 8 | from ..tensor import Tensor, tanh, power, max, sum, exp 9 | from ..profiler import trace 10 | import math 11 | 12 | 13 | @trace('gelu') 14 | def gelu(x: Tensor) -> Tensor: 15 | return 0.5 * x * (1.0 + tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * power(x, 3)))) 16 | 17 | 18 | @trace('softmax') 19 | def softmax(x: Tensor, axis: int = -1) -> Tensor: 20 | e = exp((x - max(x, axis=axis, keepdims=True))) 21 | sum_e = sum(e, axis=axis, keepdims=True) 22 | return e / sum_e 23 | 24 | 25 | @trace('silu') 26 | def silu(x: Tensor) -> Tensor: 27 | return x * (1 / (1 + exp(-x))) -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pyright] 2 | include = ["python/dsc"] 3 | exclude = [ 4 | "**/node_modules", 5 | "**/__pycache__", 6 | "benchmarks", 7 | "python/tests" 8 | ] 9 | venvPath = "." 10 | venv = "venv" 11 | 12 | reportMissingImports = "error" 13 | 14 | pythonVersion = "3.10" 15 | 16 | [tool.ruff] 17 | # Exclude a variety of commonly ignored directories. 18 | exclude = [ 19 | ".eggs", 20 | ".git", 21 | ".pyenv", 22 | ".pytest_cache", 23 | ".ruff_cache", 24 | ".vscode", 25 | ".idea", 26 | "__pypackages__", 27 | "node_modules", 28 | "site-packages", 29 | "venv", 30 | "benchmarks/*" 31 | ] 32 | 33 | # Same as Black. 34 | line-length = 88 35 | indent-width = 4 36 | 37 | target-version = "py310" 38 | 39 | [tool.ruff.lint] 40 | ignore = ["F401"] 41 | 42 | # Allow fix for all enabled rules (when `--fix`) is provided. 43 | fixable = ["ALL"] 44 | 45 | [tool.ruff.format] 46 | quote-style = "single" 47 | 48 | indent-style = "space" 49 | 50 | # Like Black, respect magic trailing commas. 51 | skip-magic-trailing-comma = false 52 | 53 | # Like Black, automatically detect the appropriate line ending. 54 | line-ending = "auto" -------------------------------------------------------------------------------- /python/dsc/device.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 
6 | 7 | from enum import Enum 8 | from typing import Union 9 | 10 | 11 | DeviceType = Union['Device', str] 12 | 13 | class Device(Enum): 14 | DEFAULT = -1 15 | CPU = 0 16 | GPU = 1 17 | 18 | def __repr__(self) -> str: 19 | return DEVICE_LOOKUP[self] 20 | 21 | def __str__(self) -> str: 22 | return repr(self) 23 | 24 | 25 | def _get_device(dev: DeviceType) -> Device: 26 | if isinstance(dev, Device): 27 | return dev 28 | else: 29 | if dev in DEVICE_REVERSE_LOOKUP: 30 | return DEVICE_REVERSE_LOOKUP[dev] 31 | else: 32 | raise RuntimeError(f'string "{dev}" is not a valid Device') 33 | 34 | 35 | DEVICE_VALUE_LOOKUP = {val.value: val for val in Device.__members__.values()} 36 | 37 | DEVICE_LOOKUP = { 38 | Device.DEFAULT: 'default', 39 | Device.CPU: 'cpu', 40 | Device.GPU: 'gpu', 41 | } 42 | 43 | DEVICE_REVERSE_LOOKUP = {val: key for key, val in DEVICE_LOOKUP.items()} -------------------------------------------------------------------------------- /python/dsc/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | 7 | from dsc.context import init, print_mem_usage, set_default_device 8 | from dsc.tensor import ( 9 | Tensor, 10 | from_numpy, 11 | frombuffer, 12 | reshape, 13 | concat, 14 | split, 15 | transpose, 16 | tril, 17 | arange, 18 | repeat, 19 | randn, 20 | cos, 21 | sin, 22 | tanh, 23 | exp, 24 | sqrt, 25 | rsqrt, 26 | add, 27 | sub, 28 | mul, 29 | true_div, 30 | sum, 31 | mean, 32 | var, 33 | matmul, 34 | outer, 35 | max, 36 | min, 37 | power, 38 | equal, 39 | not_equal, 40 | less, 41 | less_equal, 42 | greater, 43 | greater_equal, 44 | tensor, 45 | ones, 46 | ones_like, 47 | zeros, 48 | zeros_like, 49 | full, 50 | full_like, 51 | empty, 52 | empty_like, 53 | kth, 54 | multinomial, 55 | where, 56 | ) 57 | from dsc.dtype import Dtype, bool_, i32, bf16, f32, f64 58 | from dsc.profiler import trace 59 | from dsc.device import Device 60 | import dsc.gpu as gpu 61 | import dsc.nn as nn -------------------------------------------------------------------------------- /python/dsc/gpu/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 
6 | 7 | from .._bindings import ( 8 | _dsc_get_gpu_platform, 9 | _dsc_gpu_available, 10 | _dsc_gpu_devices, 11 | _dsc_gpu_set_device, 12 | _dsc_gpu_dev_capability, 13 | _dsc_gpu_dev_mem, 14 | _dsc_gpu_sync, 15 | _dsc_gpu_has_bf16, 16 | _DSC_PLATFORM_CUDA, 17 | _DSC_PLATFORM_ROCM, 18 | ) 19 | 20 | 21 | def is_available() -> bool: 22 | return _dsc_gpu_available(None) 23 | 24 | def is_cuda() -> bool: 25 | return _dsc_get_gpu_platform(None) == _DSC_PLATFORM_CUDA 26 | 27 | def is_rocm() -> bool: 28 | return _dsc_get_gpu_platform(None) == _DSC_PLATFORM_ROCM 29 | 30 | def has_bf16() -> bool: 31 | return _dsc_gpu_has_bf16(None) 32 | 33 | def device_count() -> int: 34 | return _dsc_gpu_devices(None) 35 | 36 | def set_device(device: int): 37 | _dsc_gpu_set_device(None, device) 38 | 39 | def get_device_capability(device: int) -> int: 40 | return _dsc_gpu_dev_capability(None, device) 41 | 42 | def get_device_mem(device: int) -> int: 43 | return _dsc_gpu_dev_mem(None, device) 44 | 45 | def synchronize(): 46 | _dsc_gpu_sync(None) -------------------------------------------------------------------------------- /python/tests/utils_cpu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | 7 | 8 | import dsc 9 | import numpy as np 10 | from typing import List 11 | import os 12 | 13 | 14 | DEVICE = os.getenv('DEVICE', 'cpu') 15 | 16 | def all_close(actual: dsc.Tensor, target: np.ndarray, eps=1e-5): 17 | actual_np = actual.numpy() 18 | diffs = ~np.isclose(actual_np, target, atol=eps, rtol=eps, equal_nan=True) 19 | close = len(actual_np[diffs]) == 0 20 | return close 21 | 22 | 23 | def random_nd(shape: List[int], dtype: np.dtype = np.float64): 24 | if dtype == np.bool: 25 | return np.random.randint(0, 2, size=tuple(shape)).astype(dtype) 26 | elif dtype == np.int32: 27 | # Return a positive integer tensor if the dtype is int32 so that we don't have issues 28 | # with power 29 | return np.random.randint(0, 10, size=tuple(shape)).astype(dtype) 30 | else: 31 | return np.random.randn(*tuple(shape)).astype(dtype) 32 | 33 | 34 | DTYPES = [np.bool, np.int32, np.float32, np.float64] 35 | DSC_DTYPES = { 36 | np.bool: dsc.bool_, 37 | np.int32: dsc.i32, 38 | np.float32: dsc.f32, 39 | np.float64: dsc.f64, 40 | } 41 | 42 | def is_float(dtype) -> bool: 43 | return dtype == np.float32 or dtype == np.float64 44 | 45 | def is_bool(dtype) -> bool: 46 | return dtype == np.bool 47 | 48 | def is_integer(dtype) -> bool: 49 | return dtype == np.int32 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2024-2025, Christian Gilli 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. 
Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /python/dsc/context.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | 7 | from ._bindings import _dsc_ctx_init, _dsc_print_mem_usage, _dsc_set_default_device 8 | from .device import _get_device, DeviceType 9 | import psutil 10 | 11 | _ctx_instance = None 12 | 13 | 14 | class _DscContext: 15 | def __init__(self, main_mem: int): 16 | self._ctx = _dsc_ctx_init(main_mem) 17 | 18 | # TODO: (3) 19 | # def __del__(self): 20 | # _dsc_ctx_free(self._ctx) 21 | 22 | 23 | def _get_ctx(): 24 | global _ctx_instance 25 | if _ctx_instance is None: 26 | # Workaround: instead of throwing an error if the context is not initialized 27 | # we can simply initialize one with a fixed amount of memory that is a small % 28 | # of the total available memory. 29 | total_mem = psutil.virtual_memory().total 30 | mem = int(total_mem * 0.1) 31 | print( 32 | f'DSC has not been explicitly initialized. Using {round(mem / (1024. * 1024.))}MB.' 33 | f' If you require more memory please call dsc.init() once before executing your code.' 34 | ) 35 | _ctx_instance = _DscContext(mem) 36 | return _ctx_instance._ctx 37 | 38 | 39 | def init(mem_size: int): 40 | global _ctx_instance 41 | if _ctx_instance is None: 42 | _ctx_instance = _DscContext(mem_size) 43 | else: 44 | raise RuntimeWarning('Context already initialized') 45 | 46 | def print_mem_usage(): 47 | _dsc_print_mem_usage(_get_ctx()) 48 | 49 | def set_default_device(device: DeviceType): 50 | _dsc_set_default_device(_get_ctx(), _get_device(device)) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 
6 | 7 | from setuptools import setup, find_packages 8 | from setuptools.command.install import install 9 | import subprocess 10 | import os 11 | from pathlib import Path 12 | 13 | 14 | def _compile_cpp(): 15 | subprocess.check_call( 16 | ['make', 'shared', 'DSC_FAST=1'], cwd=os.path.dirname(os.path.abspath(__file__)) 17 | ) 18 | 19 | 20 | class BuildCmd(install): 21 | def run(self): 22 | _compile_cpp() 23 | install.run(self) 24 | 25 | 26 | if __name__ == '__main__': 27 | with open(Path(__file__).parent / 'README.md', 'r', encoding='utf-8') as f: 28 | long_description = f.read() 29 | 30 | packages = find_packages('python') 31 | package_dir = {'': 'python'} 32 | package_data = {'dsc': ['*.so']} 33 | setup( 34 | name='dsc', 35 | version='0.1', 36 | author='Christian Gilli', 37 | author_email='christian.gilli11@gmail.com', 38 | license='BSD-3-Clause', 39 | description='DSPCraft tensor processing library.', 40 | long_description=long_description, 41 | long_description_content_type='text/markdown', 42 | url='https://github.com/dspcraft/dsc', 43 | packages=packages, 44 | package_dir=package_dir, 45 | install_requires=[ 46 | 'numpy', 47 | 'psutil', 48 | ], 49 | extras_require={'dev': ['matplotlib', 'pytest', 'tabulate', 'pyright', 'ruff']}, 50 | cmdclass={ 51 | 'install': BuildCmd, 52 | }, 53 | include_package_data=True, 54 | package_data=package_data, 55 | python_requires='>=3.10', 56 | ) 57 | -------------------------------------------------------------------------------- /dsc/include/cpu/dsc_blas.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 
6 | 7 | #pragma once 8 | 9 | #include "dsc.h" 10 | 11 | enum dsc_blas_trans : bool { 12 | NO_TRANS, 13 | TRANS 14 | }; 15 | 16 | struct dsc_blas_ctx; 17 | 18 | // ============================================================ 19 | // Setup / Teardown 20 | // 21 | 22 | extern dsc_blas_ctx *dsc_blas_init(); 23 | 24 | extern void dsc_blas_destroy(dsc_blas_ctx *ctx); 25 | 26 | // ============================================================ 27 | // GEMM-related functions 28 | // 29 | 30 | extern void dsc_dgemm(dsc_blas_ctx *ctx, dsc_blas_trans trans_b, 31 | int m, int n, int k, 32 | const f64 *DSC_RESTRICT a, int stride_a, 33 | const f64 *DSC_RESTRICT b, int stride_b, 34 | f64 *DSC_RESTRICT c, int stride_c); 35 | 36 | extern void dsc_sgemm(dsc_blas_ctx *ctx, dsc_blas_trans trans_b, 37 | int m, int n, int k, 38 | const f32 *DSC_RESTRICT a, int stride_a, 39 | const f32 *DSC_RESTRICT b, int stride_b, 40 | f32 *DSC_RESTRICT c, int stride_c); 41 | 42 | // ============================================================ 43 | // GEVM-related functions 44 | // 45 | 46 | extern void dsc_dgevm_trans(dsc_blas_ctx *ctx, 47 | int n, int k, 48 | const f64 *DSC_RESTRICT a, 49 | const f64 *DSC_RESTRICT b, int stride_b, 50 | f64 *DSC_RESTRICT c); 51 | 52 | extern void dsc_sgevm_trans(dsc_blas_ctx *ctx, 53 | int n, int k, 54 | const f32 *DSC_RESTRICT a, 55 | const f32 *DSC_RESTRICT b, int stride_b, 56 | f32 *DSC_RESTRICT c); -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | AccessModifierOffset: -4 3 | AlignAfterOpenBracket: Align 4 | AlignConsecutiveAssignments: None 5 | AlignOperands: Align 6 | AllowAllArgumentsOnNextLine: false 7 | AllowAllConstructorInitializersOnNextLine: false 8 | AllowAllParametersOfDeclarationOnNextLine: false 9 | AllowShortBlocksOnASingleLine: Always 10 | AllowShortCaseLabelsOnASingleLine: false 11 | AllowShortFunctionsOnASingleLine: All 12 | AllowShortIfStatementsOnASingleLine: Always 13 | AllowShortLambdasOnASingleLine: All 14 | AllowShortLoopsOnASingleLine: true 15 | AlwaysBreakAfterReturnType: None 16 | AlwaysBreakTemplateDeclarations: Yes 17 | BreakBeforeBraces: Custom 18 | BraceWrapping: 19 | AfterCaseLabel: false 20 | AfterClass: false 21 | AfterControlStatement: Never 22 | AfterEnum: false 23 | AfterFunction: false 24 | AfterNamespace: false 25 | AfterUnion: false 26 | BeforeCatch: false 27 | BeforeElse: false 28 | IndentBraces: false 29 | SplitEmptyFunction: false 30 | SplitEmptyRecord: true 31 | BreakBeforeBinaryOperators: None 32 | BreakBeforeTernaryOperators: true 33 | BreakConstructorInitializers: BeforeColon 34 | BreakInheritanceList: BeforeColon 35 | ColumnLimit: 0 36 | CompactNamespaces: false 37 | ContinuationIndentWidth: 8 38 | IndentCaseLabels: true 39 | IndentPPDirectives: None 40 | IndentWidth: 4 41 | KeepEmptyLinesAtTheStartOfBlocks: true 42 | MaxEmptyLinesToKeep: 2 43 | NamespaceIndentation: None 44 | ObjCSpaceAfterProperty: false 45 | ObjCSpaceBeforeProtocolList: true 46 | PointerAlignment: Right 47 | ReflowComments: false 48 | SpaceAfterCStyleCast: true 49 | SpaceAfterLogicalNot: false 50 | SpaceAfterTemplateKeyword: false 51 | SpaceBeforeAssignmentOperators: true 52 | SpaceBeforeCpp11BracedList: false 53 | SpaceBeforeCtorInitializerColon: true 54 | SpaceBeforeInheritanceColon: true 55 | SpaceBeforeParens: ControlStatements 56 | SpaceBeforeRangeBasedForLoopColon: false 57 | SpaceInEmptyParentheses: false 58 | 
SpacesBeforeTrailingComments: 0 59 | SpacesInAngles: false 60 | SpacesInCStyleCastParentheses: false 61 | SpacesInContainerLiterals: false 62 | SpacesInParentheses: false 63 | SpacesInSquareBrackets: false 64 | TabWidth: 4 65 | UseTab: Never 66 | -------------------------------------------------------------------------------- /python/dsc/dtype.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | 7 | from enum import Enum 8 | import numpy as np 9 | from ctypes import POINTER, c_float, c_double, c_bool, c_int32, c_uint16 10 | from typing import Union 11 | 12 | 13 | ScalarType = Union[bool, int, float] 14 | 15 | 16 | class Dtype(Enum): 17 | BOOL = 0 18 | I32 = 1 19 | BF16 = 2 20 | F32 = 3 21 | F64 = 4 22 | 23 | def __repr__(self) -> str: 24 | return TYPENAME_LOOKUP[self] 25 | 26 | def __str__(self) -> str: 27 | return repr(self) 28 | 29 | @staticmethod 30 | def from_string(val: str) -> 'Dtype': 31 | return TYPENAME_REVERSE_LOOKUP[val.lower()] 32 | 33 | bool_ = Dtype.BOOL 34 | i32 = Dtype.I32 35 | bf16 = Dtype.BF16 36 | f32 = Dtype.F32 37 | f64 = Dtype.F64 38 | 39 | DTYPE_VALUE_LOOKUP = {val.value: val for val in Dtype.__members__.values()} 40 | 41 | TYPENAME_LOOKUP = { 42 | Dtype.BOOL: 'bool', 43 | Dtype.I32: 'i32', 44 | Dtype.BF16: 'bf16', 45 | Dtype.F32: 'f32', 46 | Dtype.F64: 'f64', 47 | } 48 | 49 | TYPENAME_REVERSE_LOOKUP = {v: k for k, v in TYPENAME_LOOKUP.items()} 50 | 51 | DTYPE_TO_CTYPE = { 52 | Dtype.BOOL: POINTER(c_bool), 53 | Dtype.I32: POINTER(c_int32), 54 | Dtype.BF16: POINTER(c_uint16), 55 | Dtype.F32: POINTER(c_float), 56 | Dtype.F64: POINTER(c_double), 57 | } 58 | 59 | DTYPE_SIZE = { 60 | Dtype.BOOL: 1, 61 | Dtype.I32: 4, 62 | Dtype.BF16: 2, 63 | Dtype.F32: 4, 64 | Dtype.F64: 8, 65 | } 66 | 67 | NP_TO_DTYPE = { 68 | np.dtype(np.bool): Dtype.BOOL, 69 | np.dtype(np.int32): Dtype.I32, 70 | np.dtype(np.float16): Dtype.BF16, # TODO: this is wrong! NumPy doesn't support BF16 71 | np.dtype(np.float32): Dtype.F32, 72 | np.dtype(np.float64): Dtype.F64, 73 | } 74 | 75 | DTYPE_TO_NP = {val: key for key, val in NP_TO_DTYPE.items()} 76 | 77 | DTYPE_CONVERSION_TABLES = [ 78 | [Dtype.BOOL, Dtype.I32, Dtype.F32, Dtype.F32, Dtype.F64], 79 | [Dtype.BOOL, Dtype.I32, Dtype.F32, Dtype.F32, Dtype.F64], 80 | [Dtype.F32, Dtype.F32, Dtype.F32, Dtype.F32, Dtype.F64], 81 | [Dtype.F32, Dtype.F32, Dtype.F32, Dtype.F32, Dtype.F64], 82 | [Dtype.F64, Dtype.F64, Dtype.F64, Dtype.F64, Dtype.F64], 83 | ] 84 | -------------------------------------------------------------------------------- /python/dsc/nn/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | # 7 | # This code is licensed under the terms of the 3-clause BSD license 8 | # (https://opensource.org/license/bsd-3-clause). 
9 | 10 | from typing import Dict, Optional 11 | from ..tensor import Tensor, frombuffer 12 | from ..dtype import Dtype 13 | import struct, pathlib, os, hashlib, urllib.request, json, ctypes 14 | from tqdm import tqdm 15 | 16 | 17 | def _fetch(url: str, invalidate_cache: bool = False) -> pathlib.Path: 18 | cache_dir = pathlib.Path.home() / '.cache' / 'dsc' / 'blob' 19 | if invalidate_cache and cache_dir.exists(): 20 | os.removedirs(cache_dir) 21 | 22 | cache_dir.mkdir(parents=True, exist_ok=True) 23 | fp = cache_dir / hashlib.md5(url.encode('utf-8')).hexdigest() 24 | if not fp.exists(): 25 | with urllib.request.urlopen(url, timeout=10) as r: 26 | assert r.status == 200 27 | pbar = tqdm(total=r.length, unit='B', unit_scale=True, desc=url) 28 | with open(fp, mode="w+b") as f: 29 | while chunk := r.read(8192): 30 | pbar.update(f.write(chunk)) 31 | return fp 32 | 33 | 34 | def safe_load(url: str, invalidate_cache: bool = False, 35 | trim_prefix: Optional[str] = None, 36 | use_dtype: Optional[Dtype] = None) -> Dict[str, Tensor]: 37 | fp = _fetch(url, invalidate_cache) 38 | b = fp.read_bytes() 39 | n = struct.unpack_from(' 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | 7 | from ._bindings import _dsc_tracing_enabled, _dsc_insert_trace, _dsc_dump_traces 8 | from .context import _get_ctx 9 | from time import perf_counter 10 | from http.server import SimpleHTTPRequestHandler 11 | import socketserver 12 | from functools import wraps 13 | import atexit 14 | import os 15 | 16 | 17 | def _is_tracing_enabled() -> bool: 18 | return bool(_dsc_tracing_enabled()) 19 | 20 | 21 | @atexit.register 22 | def _dump_traces(): 23 | if not _is_tracing_enabled(): 24 | return 25 | 26 | _dsc_dump_traces(_get_ctx()) 27 | trace_value = int(os.getenv('TRACE', '0')) 28 | if trace_value >= 2: 29 | _serve_traces() 30 | 31 | 32 | def trace(name: str): 33 | def _decorator(func): 34 | if not _is_tracing_enabled(): 35 | return func 36 | 37 | # Encode name and cat once 38 | name_ = name.encode('ascii') 39 | @wraps(func) 40 | def _wrapper(*args, **kwargs): 41 | start_us = int(perf_counter() * 1e6) 42 | res = func(*args, **kwargs) 43 | end_us = int(perf_counter() * 1e6) 44 | _dsc_insert_trace(_get_ctx(), name_, start_us, end_us - start_us) 45 | return res 46 | return _wrapper 47 | return _decorator 48 | 49 | 50 | class _PerfettoServer(SimpleHTTPRequestHandler): 51 | def log_message(self, format, *args): 52 | # Suppress the output of the HTTP server 53 | pass 54 | 55 | def end_headers(self): 56 | self.send_header('Access-Control-Allow-Origin', '*') 57 | return super().end_headers() 58 | 59 | def do_GET(self): 60 | self.server.last_request = self.path # pyright: ignore[reportAttributeAccessIssue] 61 | return super().do_GET() 62 | 63 | def do_POST(self): 64 | self.send_error(404, 'File not found') 65 | 66 | 67 | def _serve_traces(): 68 | # Taken from https://github.com/jax-ml/jax 69 | port = 9001 70 | socketserver.TCPServer.allow_reuse_address = True 71 | with socketserver.TCPServer(('127.0.0.1', port), _PerfettoServer) as httpd: 72 | url = f'https://ui.perfetto.dev/#!/?url=http://127.0.0.1:{port}/traces.json' 73 | print(f'Open URL in browser: {url}') 74 | 75 | while httpd.__dict__.get('last_request') != '/traces.json': 76 | httpd.handle_request() 77 | -------------------------------------------------------------------------------- /dsc/src/cpu/dsc_device_cpu.cpp: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #include "cpu/dsc_blas.h" 8 | #include "cpu/dsc_tracing.h" 9 | #include "dsc_device.h" 10 | #include 11 | 12 | #if defined(_WIN32) 13 | # include 14 | 15 | # define dsc_aligned_alloc(ALIGN, SIZE) _aligned_malloc(SIZE, ALIGN) 16 | # define dsc_aligned_free(PTR) _aligned_free(PTR) 17 | #else 18 | # include 19 | 20 | # define dsc_aligned_alloc(ALIGN, SIZE) aligned_alloc(ALIGN, SIZE) 21 | # define dsc_aligned_free(PTR) free(PTR) 22 | #endif 23 | 24 | #define DSC_DEVICE_CPU_ALIGN ((usize) 4096) 25 | 26 | static void cpu_memcpy(void *dst, const void *src, const usize nb, dsc_memcpy_dir) { 27 | memcpy(dst, src, nb); 28 | } 29 | 30 | static void cpu_memset(void *dst, const int c, const usize nb) { 31 | memset(dst, c, nb); 32 | } 33 | 34 | static void cpu_dispose(dsc_device *dev) { 35 | dsc_aligned_free(dev->device_mem); 36 | dsc_blas_ctx *blas_ctx = (dsc_blas_ctx *) dev->extra_info; 37 | dsc_blas_destroy(blas_ctx); 38 | 39 | dsc_cpu_tracing_dispose(dev->trace_ctx); 40 | 41 | DSC_LOG_INFO("%s device disposed", DSC_DEVICE_NAMES[dev->type]); 42 | } 43 | 44 | dsc_device *dsc_cpu_device(const usize mem_size) { 45 | static dsc_device dev = { 46 | .used_nodes = {}, 47 | .free_nodes = {}, 48 | .head = {}, 49 | .device_mem = {}, 50 | .alignment = 64, // I don't know about this... 51 | .extra_info = dsc_blas_init(), 52 | .trace_ctx = dsc_cpu_tracing_init(), 53 | .mem_size = DSC_ALIGN(mem_size, DSC_DEVICE_CPU_ALIGN), 54 | .used_mem = 0, 55 | .type = CPU, 56 | .memcpy = cpu_memcpy, 57 | .memset = cpu_memset, 58 | .dispose = cpu_dispose, 59 | .next_trace = dsc_cpu_next_trace, 60 | .dump_trace = dsc_cpu_tracing_dump, 61 | .dump_json_metadata = dsc_cpu_dump_json_metadata 62 | }; 63 | 64 | dev.device_mem = dsc_aligned_alloc(DSC_DEVICE_CPU_ALIGN, dev.mem_size); 65 | DSC_ASSERT(dev.device_mem != nullptr); 66 | 67 | dev.free_nodes[0].size = dev.mem_size; 68 | dev.free_nodes[0].data = dev.device_mem; 69 | dev.free_nodes[0].next = nullptr; 70 | 71 | dev.head = &dev.free_nodes[0]; 72 | 73 | DSC_LOG_INFO("%s device initialized with a buffer of %ldMB", 74 | DSC_DEVICE_NAMES[dev.type], 75 | (usize) DSC_B_TO_MB(dev.mem_size)); 76 | 77 | return &dev; 78 | } -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | on: 3 | pull_request: 4 | branches: 5 | - main 6 | workflow_dispatch: 7 | 8 | 9 | jobs: 10 | test: 11 | strategy: 12 | matrix: 13 | include: 14 | - host: ubuntu-latest 15 | device: cpu 16 | - host: self-hosted-amd-gpu 17 | device: gpu 18 | - host: self-hosted-nvidia-gpu 19 | device: gpu 20 | runs-on: ${{ matrix.host }} 21 | 22 | steps: 23 | - name: Set env 24 | run: | 25 | echo "DEVICE=${{ matrix.device }}" >> $GITHUB_ENV 26 | - name: Checkout Code 27 | uses: actions/checkout@v4 28 | - name: Set up Python 3.10 29 | # Workaround until this gets moved to a proper server 30 | if: matrix.host != 'self-hosted-nvidia-gpu' 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version: '3.10' 34 | - name: Install dependencies 35 | if: matrix.host != 'self-hosted-nvidia-gpu' 36 | run: | 37 | sudo apt-get update 38 | sudo apt-get install -y build-essential 39 | - name: 
Install DSC with requirements 40 | if: matrix.host == 'ubuntu-latest' 41 | run: | 42 | pip install -e . 43 | pip install -r requirements.txt 44 | - name: Install DSC with requirements (NVIDIA) 45 | if: matrix.host == 'self-hosted-nvidia-gpu' 46 | run: | 47 | python3 -m venv venv 48 | # Make sure venv stays active across tasks 49 | echo "${{ github.workspace }}/venv/bin" >> $GITHUB_PATH 50 | source venv/bin/activate 51 | pip install -e . 52 | pip install -r requirements.txt 53 | - name: Install DSC with requirements (AMD) 54 | if: matrix.host == 'self-hosted-amd-gpu' 55 | run: | 56 | # Don't install torch stable, we need the nightly build with ROCm 6.4 57 | sed -i 's/^torch/#torch/' requirements.txt 58 | pip install -e . 59 | pip install -r requirements.txt 60 | pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.4 61 | - name: Compile DSC C++ (CPU-only) 62 | if: matrix.device == 'cpu' 63 | run: make shared DSC_FAST=1 64 | - name: Compile DSC C++ (GPU) 65 | if: matrix.device == 'gpu' 66 | run: make shared DSC_FAST=1 DSC_GPU=1 67 | - name: Run common ops tests on ${{ matrix.device }} 68 | run: | 69 | cd python/tests/ 70 | pytest -s test_ops_common.py --no-header --no-summary -q 71 | - name: Run ops tests on CPU 72 | if: matrix.device == 'cpu' 73 | run: | 74 | cd python/tests/ 75 | pytest -s test_ops_cpu.py --no-header --no-summary -q 76 | - name: Run GPU ops tests 77 | if: matrix.device == 'gpu' 78 | run: | 79 | cd python/tests/ 80 | pytest -s test_ops_gpu.py --no-header --no-summary -q 81 | - name: Run indexing tests on ${{ matrix.device }} 82 | run: | 83 | cd python/tests/ 84 | pytest -s test_indexing.py --no-header --no-summary -q -------------------------------------------------------------------------------- /dsc/include/dsc_dtype.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 
6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #if defined(__HIPCC__) && defined(DSC_BF16) 15 | # include 16 | #endif 17 | 18 | #if defined(__NVCC__) && defined(DSC_BF16) 19 | # include 20 | #endif 21 | 22 | 23 | #define DSC_DTYPES ((int) 5) 24 | #define DSC_DEFAULT_TYPE F32 25 | 26 | 27 | using i8 = int8_t; 28 | using i16 = int16_t; 29 | using i32 = int32_t; 30 | using i64 = int64_t; 31 | using u8 = uint8_t; 32 | using u16 = uint16_t; 33 | using u32 = uint32_t; 34 | using u64 = uint64_t; 35 | using size = ptrdiff_t; 36 | using usize = size_t; 37 | using byte = char; 38 | using f32 = float; 39 | using f64 = double; 40 | #if defined(__HIPCC__) && defined(DSC_BF16) 41 | using bf16 = __hip_bfloat16; 42 | #elif defined(__NVCC__) && defined(DSC_BF16) 43 | using bf16 = __nv_bfloat16; 44 | #else 45 | using bf16 = u16; 46 | #endif 47 | 48 | enum dsc_dtype : u8 { 49 | BOOL, 50 | I32, 51 | BF16, 52 | F32, 53 | F64, 54 | }; 55 | 56 | static constexpr usize DSC_DTYPE_SIZE[DSC_DTYPES] = { 57 | sizeof(bool), 58 | sizeof(i32), 59 | sizeof(bf16), 60 | sizeof(f32), 61 | sizeof(f64), 62 | }; 63 | 64 | static constexpr const char *DSC_DTYPE_NAMES[DSC_DTYPES] = { 65 | "bool", 66 | "i32", 67 | "bf16", 68 | "f32", 69 | "f64", 70 | }; 71 | 72 | 73 | // Conversion utility 74 | template 75 | struct dsc_type_mapping; 76 | 77 | template<> 78 | struct dsc_type_mapping { 79 | static constexpr dsc_dtype value = BOOL; 80 | }; 81 | 82 | template<> 83 | struct dsc_type_mapping { 84 | static constexpr dsc_dtype value = I32; 85 | }; 86 | 87 | template<> 88 | struct dsc_type_mapping { 89 | static constexpr dsc_dtype value = BF16; 90 | }; 91 | 92 | template<> 93 | struct dsc_type_mapping { 94 | static constexpr dsc_dtype value = F32; 95 | }; 96 | 97 | template<> 98 | struct dsc_type_mapping { 99 | static constexpr dsc_dtype value = F64; 100 | }; 101 | 102 | template 103 | consteval bool dsc_is_type() { 104 | return std::is_same_v; 105 | } 106 | 107 | template 108 | consteval bool dsc_is_real() { 109 | return dsc_is_type() || dsc_is_type() || dsc_is_type(); 110 | } 111 | 112 | template 113 | consteval T dsc_inf() { 114 | constexpr T sign = positive ? 1 : -1; 115 | 116 | if constexpr (dsc_is_type()) { 117 | return sign * std::numeric_limits::infinity(); 118 | } else if constexpr (dsc_is_type()) { 119 | return sign * std::numeric_limits::infinity(); 120 | } else { 121 | static_assert("T is not supported"); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /dsc/src/gpu/dsc_device_gpu.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #include "gpu/dsc_gpu.h" 8 | #include "gpu/dsc_tracing.h" 9 | #include "dsc_device.h" 10 | 11 | // As per https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses 12 | // "Any address of a variable residing in global memory or returned by one of the memory allocation routines 13 | // from the driver or runtime API is always aligned to at least 256 bytes." 
14 | #define DSC_DEVICE_GPU_ALIGN ((usize) 256) 15 | #define DSC_MEMCPY_DIRECTIONS ((int) 4) 16 | 17 | static constexpr gpu_memcpy_kind DSC_GPU_MEMCPY_DIRECTIONS[DSC_MEMCPY_DIRECTIONS] = { 18 | gpu_memcpy_default, 19 | gpu_memcpy_device_2_host, 20 | gpu_memcpy_host_2_device, 21 | gpu_memcpy_device_2_device, 22 | }; 23 | 24 | static DSC_GPU_KERNEL void k_init_random(gpu_rand_state *state) { 25 | DSC_GPU_TID(); 26 | gpu_init_rand(clock64(), tid, 0, &state[tid]); 27 | } 28 | 29 | static void gpu_memcpy_wrapper(void *dst, const void *src, const usize nb, const dsc_memcpy_dir dir) { 30 | DSC_GPU_CHECK(gpu_memcpy(dst, src, nb, DSC_GPU_MEMCPY_DIRECTIONS[dir])); 31 | } 32 | 33 | static void gpu_memset_wrapper(void *dst, const int c, const usize nb) { 34 | DSC_GPU_CHECK(gpu_memset(dst, c, nb)); 35 | } 36 | 37 | static void gpu_dispose(dsc_device *dev) { 38 | DSC_GPU_CHECK(gpu_free(dev->device_mem)); 39 | 40 | const dsc_gpu_dev_info *info = (dsc_gpu_dev_info *) dev->extra_info; 41 | DSC_GPU_BLAS_CHECK(gpu_blas_destroy(info->blas_handle)); 42 | 43 | DSC_GPU_CHECK(gpu_free(info->rand_state)); 44 | 45 | dsc_gpu_tracing_dispose(dev->trace_ctx); 46 | 47 | DSC_LOG_INFO("%s:%s:%d device %s disposed", 48 | DSC_DEVICE_NAMES[dev->type], 49 | DSC_GPU_PLATFORM_NAMES[DSC_GPU_PLATFORM], 50 | info->dev_idx, 51 | info->name); 52 | } 53 | 54 | dsc_device *dsc_gpu_device(usize mem_size, const int dev_idx) { 55 | static dsc_gpu_dev_info extra = { 56 | .name = {}, 57 | .rand_state = {}, 58 | .blas_handle = {}, 59 | .dev_idx = dev_idx, 60 | .platform = DSC_GPU_PLATFORM, 61 | }; 62 | DSC_GPU_BLAS_CHECK(gpu_blas_create(&extra.blas_handle)); 63 | 64 | // Allocate 90% of the device memory at most (is this too much?) 65 | const usize max_mem = (usize) (0.9 * (f64) dsc_gpu_dev_mem(dev_idx)); 66 | mem_size = mem_size < max_mem ? 
mem_size : DSC_ALIGN(max_mem - (DSC_DEVICE_GPU_ALIGN - 1), DSC_DEVICE_GPU_ALIGN); 67 | static dsc_device dev = { 68 | .used_nodes = {}, 69 | .free_nodes = {}, 70 | .head = {}, 71 | .device_mem = {}, 72 | .alignment = DSC_DEVICE_GPU_ALIGN, 73 | .extra_info = &extra, 74 | .trace_ctx = dsc_gpu_tracing_init(), 75 | .mem_size = DSC_ALIGN(mem_size, DSC_DEVICE_GPU_ALIGN), 76 | .used_mem = 0, 77 | .type = GPU, 78 | .memcpy = gpu_memcpy_wrapper, 79 | .memset = gpu_memset_wrapper, 80 | .dispose = gpu_dispose, 81 | .next_trace = dsc_gpu_next_trace, 82 | .dump_trace = dsc_gpu_tracing_dump, 83 | .dump_json_metadata = dsc_gpu_dump_json_metadata, 84 | }; 85 | 86 | DSC_GPU_CHECK(gpu_set_device(dev_idx)); 87 | 88 | dsc_gpu_dev_name(dev_idx, extra.name); 89 | 90 | DSC_GPU_CHECK(gpu_malloc(&extra.rand_state, DSC_GPU_DEFAULT_THREADS * sizeof(gpu_rand_state))); 91 | 92 | k_init_random<<<1, DSC_GPU_DEFAULT_THREADS>>>(extra.rand_state); 93 | 94 | dsc_gpu_sync(); 95 | 96 | DSC_GPU_CHECK(gpu_malloc(&dev.device_mem, dev.mem_size)); 97 | 98 | dev.free_nodes[0].size = dev.mem_size; 99 | dev.free_nodes[0].data = dev.device_mem; 100 | dev.free_nodes[0].next = nullptr; 101 | 102 | dev.head = &dev.free_nodes[0]; 103 | 104 | DSC_LOG_INFO("%s:%s:%d device %s initialized with a buffer of %ldMB (total: %ldMB)", 105 | DSC_DEVICE_NAMES[dev.type], 106 | DSC_GPU_PLATFORM_NAMES[DSC_GPU_PLATFORM], 107 | dev_idx, 108 | extra.name, 109 | (usize) DSC_B_TO_MB(dev.mem_size), 110 | (usize) DSC_B_TO_MB(dsc_gpu_dev_mem(dev_idx))); 111 | 112 | return &dev; 113 | } -------------------------------------------------------------------------------- /dsc/include/gpu/platform/dsc_cuda_platform.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 
6 | 7 | #pragma once 8 | 9 | #include 10 | 11 | #pragma GCC diagnostic push 12 | #pragma GCC diagnostic ignored "-Wshadow" 13 | #pragma GCC diagnostic ignored "-Wdouble-promotion" 14 | #include 15 | #pragma GCC diagnostic pop 16 | 17 | #define DSC_GPU_PLATFORM CUDA 18 | 19 | #define DSC_GPU_CHECK(err) \ 20 | do { \ 21 | if (err != cudaSuccess) { \ 22 | DSC_LOG_FATAL("CUDA error: %s", cudaGetErrorName(err)); \ 23 | } \ 24 | } while (0) 25 | 26 | #define DSC_GPU_BLAS_CHECK(err) \ 27 | do { \ 28 | if (err != CUBLAS_STATUS_SUCCESS) { \ 29 | DSC_LOG_FATAL("cuBLAS error: %s", cublasGetStatusString(err)); \ 30 | } \ 31 | } while (0) 32 | 33 | 34 | // ============================================================ 35 | // Runtime API 36 | // 37 | 38 | #define gpu_get_device_count cudaGetDeviceCount 39 | #define gpu_get_device_properties cudaGetDeviceProperties 40 | #define gpu_device_sync cudaDeviceSynchronize 41 | 42 | #define gpu_malloc cudaMalloc 43 | #define gpu_free cudaFree 44 | #define gpu_memcpy cudaMemcpy 45 | #define gpu_memset cudaMemset 46 | #define gpu_set_device cudaSetDevice 47 | 48 | #define gpu_memcpy_default cudaMemcpyDefault 49 | #define gpu_memcpy_device_2_host cudaMemcpyDeviceToHost 50 | #define gpu_memcpy_host_2_device cudaMemcpyHostToDevice 51 | #define gpu_memcpy_device_2_device cudaMemcpyDeviceToDevice 52 | 53 | using gpu_memcpy_kind = cudaMemcpyKind; 54 | using gpu_device_props = cudaDeviceProp; 55 | 56 | // ============================================================ 57 | // Rand API 58 | // 59 | 60 | #define gpu_init_rand curand_init 61 | #define gpu_sample_normalf curand_normal 62 | #define gpu_sample_normal curand_normal_double 63 | 64 | using gpu_rand_state = curandState; 65 | 66 | // ============================================================ 67 | // BLAS API 68 | // 69 | 70 | #define gpu_blas_create cublasCreate 71 | #define gpu_blas_destroy cublasDestroy 72 | #define gpu_blas_sgemm cublasSgemm 73 | #define gpu_blas_dgemm cublasDgemm 74 | #define gpu_blas_op cublasOperation_t 75 | #define gpu_blas_dtype cudaDataType 76 | #define GPU_BLAS_OP_T CUBLAS_OP_T 77 | #define GPU_BLAS_OP_N CUBLAS_OP_N 78 | #define GPU_GEMM_DTYPE_BF16 CUDA_R_16BF 79 | #define GPU_GEMM_DTYPE_F32 CUDA_R_32F 80 | 81 | using gpu_blas_handle = cublasHandle_t; 82 | 83 | 84 | static DSC_INLINE cublasStatus_t gpu_blas_bfgemm(const gpu_blas_handle handle, const gpu_blas_op a_op, const gpu_blas_op b_op, 85 | const int m, const int n, const int k, const void *DSC_RESTRICT alpha, 86 | const void *DSC_RESTRICT xa, const gpu_blas_dtype a_dtype, const int stride_a, 87 | const void *DSC_RESTRICT xb, const gpu_blas_dtype b_dtype, const int stride_b, 88 | const void *DSC_RESTRICT beta, void *out, const gpu_blas_dtype out_dtype, 89 | const int stride_out, const gpu_blas_dtype compute_dtype) { 90 | return cublasGemmEx(handle, a_op, b_op, m, n, k, 91 | alpha, xa, a_dtype, stride_a, 92 | xb, b_dtype, stride_b, beta, 93 | out, out_dtype, stride_out, 94 | compute_dtype, CUBLAS_GEMM_DEFAULT); 95 | } 96 | 97 | // ============================================================ 98 | // Event API 99 | // 100 | 101 | #define gpu_event_create cudaEventCreate 102 | #define gpu_event_destroy cudaEventDestroy 103 | #define gpu_event_record cudaEventRecord 104 | #define gpu_event_synchronize cudaEventSynchronize 105 | #define gpu_event_elapsed cudaEventElapsedTime 106 | 107 | using gpu_event = cudaEvent_t; 108 | -------------------------------------------------------------------------------- 
/dsc/include/gpu/platform/dsc_hip_platform.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | #include 10 | #pragma GCC diagnostic push 11 | #pragma GCC diagnostic ignored "-Wdouble-promotion" 12 | #include 13 | #pragma GCC diagnostic pop 14 | 15 | #include 16 | 17 | 18 | #define DSC_GPU_PLATFORM ROCM 19 | 20 | #define DSC_GPU_CHECK(err) \ 21 | do { \ 22 | if (err != hipSuccess) { \ 23 | DSC_LOG_FATAL("HIP error: %s", hipGetErrorString(err)); \ 24 | } \ 25 | } while (0) 26 | 27 | #define DSC_GPU_BLAS_CHECK(err) \ 28 | do { \ 29 | if (err != rocblas_status_success) { \ 30 | DSC_LOG_FATAL("rocBLAS error: %s", rocblas_status_to_string(err)); \ 31 | } \ 32 | } while (0) 33 | 34 | // ============================================================ 35 | // Runtime API 36 | // 37 | 38 | #define gpu_get_device_count hipGetDeviceCount 39 | #define gpu_get_device_properties hipGetDeviceProperties 40 | #define gpu_device_sync hipDeviceSynchronize 41 | 42 | #define gpu_malloc hipMalloc 43 | #define gpu_free hipFree 44 | #define gpu_memcpy hipMemcpy 45 | #define gpu_memset hipMemset 46 | #define gpu_set_device hipSetDevice 47 | 48 | #define gpu_memcpy_default hipMemcpyDefault 49 | #define gpu_memcpy_device_2_host hipMemcpyDeviceToHost 50 | #define gpu_memcpy_host_2_device hipMemcpyHostToDevice 51 | #define gpu_memcpy_device_2_device hipMemcpyDeviceToDevice 52 | 53 | using gpu_memcpy_kind = hipMemcpyKind; 54 | using gpu_device_props = hipDeviceProp_t; 55 | 56 | // ============================================================ 57 | // Rand API 58 | // 59 | 60 | #define gpu_init_rand rocrand_init 61 | #define gpu_sample_normalf rocrand_normal 62 | #define gpu_sample_normal rocrand_normal_double 63 | // Default for cuRAND 64 | using gpu_rand_state = rocrand_state_xorwow; 65 | 66 | // ============================================================ 67 | // BLAS API 68 | // 69 | 70 | #define gpu_blas_create rocblas_create_handle 71 | #define gpu_blas_destroy rocblas_destroy_handle 72 | #define gpu_blas_sgemm rocblas_sgemm 73 | #define gpu_blas_dgemm rocblas_dgemm 74 | #define gpu_blas_op rocblas_operation 75 | #define gpu_blas_dtype rocblas_datatype 76 | #define GPU_BLAS_OP_T rocblas_operation_transpose 77 | #define GPU_BLAS_OP_N rocblas_operation_none 78 | #define GPU_GEMM_DTYPE_BF16 rocblas_datatype_bf16_r 79 | #define GPU_GEMM_DTYPE_F32 rocblas_datatype_f32_r 80 | 81 | using gpu_blas_handle = rocblas_handle; 82 | 83 | static DSC_INLINE rocblas_status gpu_blas_bfgemm(const gpu_blas_handle handle, const gpu_blas_op a_op, const gpu_blas_op b_op, 84 | const int m, const int n, const int k, const void *DSC_RESTRICT alpha, 85 | const void *DSC_RESTRICT xa, const gpu_blas_dtype a_dtype, const int stride_a, 86 | const void *DSC_RESTRICT xb, const gpu_blas_dtype b_dtype, const int stride_b, 87 | const void *DSC_RESTRICT beta, void *out, const gpu_blas_dtype out_dtype, 88 | const int stride_out, const gpu_blas_dtype compute_dtype) { 89 | return rocblas_gemm_ex(handle, a_op, b_op, m, n, k, 90 | alpha, xa, a_dtype, stride_a, 91 | xb, b_dtype, stride_b, beta, 92 | out, out_dtype, stride_out, 93 | out, out_dtype, stride_out, 94 | compute_dtype, rocblas_gemm_algo_standard, 0, 0); 95 | } 96 | 97 | // ============================================================ 98 
| // Event API 99 | // 100 | 101 | #define gpu_event_create hipEventCreate 102 | #define gpu_event_destroy hipEventDestroy 103 | #define gpu_event_record hipEventRecord 104 | #define gpu_event_synchronize hipEventSynchronize 105 | #define gpu_event_elapsed hipEventElapsedTime 106 | 107 | using gpu_event = hipEvent_t; 108 | -------------------------------------------------------------------------------- /.clang-tidy: -------------------------------------------------------------------------------- 1 | --- 2 | Checks: '-*, 3 | bugprone-argument-comment, 4 | bugprone-assert-side-effect, 5 | bugprone-bad-signal-to-kill-thread, 6 | bugprone-branch-clone, 7 | bugprone-copy-constructor-init, 8 | bugprone-dangling-handle, 9 | bugprone-dynamic-static-initializers, 10 | bugprone-fold-init-type, 11 | bugprone-forward-declaration-namespace, 12 | bugprone-forwarding-reference-overload, 13 | bugprone-inaccurate-erase, 14 | bugprone-incorrect-roundings, 15 | bugprone-integer-division, 16 | bugprone-lambda-function-name, 17 | bugprone-macro-repeated-side-effects, 18 | bugprone-misplaced-operator-in-strlen-in-alloc, 19 | bugprone-misplaced-pointer-arithmetic-in-alloc, 20 | bugprone-misplaced-widening-cast, 21 | bugprone-move-forwarding-reference, 22 | bugprone-multiple-statement-macro, 23 | bugprone-no-escape, 24 | bugprone-parent-virtual-call, 25 | bugprone-posix-return, 26 | bugprone-reserved-identifier, 27 | bugprone-sizeof-container, 28 | bugprone-sizeof-expression, 29 | bugprone-spuriously-wake-up-functions, 30 | bugprone-string-constructor, 31 | bugprone-string-integer-assignment, 32 | bugprone-string-literal-with-embedded-nul, 33 | bugprone-suspicious-enum-usage, 34 | bugprone-suspicious-include, 35 | bugprone-suspicious-memset-usage, 36 | bugprone-suspicious-missing-comma, 37 | bugprone-suspicious-semicolon, 38 | bugprone-suspicious-string-compare, 39 | bugprone-suspicious-memory-comparison, 40 | bugprone-suspicious-realloc-usage, 41 | bugprone-swapped-arguments, 42 | bugprone-terminating-continue, 43 | bugprone-throw-keyword-missing, 44 | bugprone-too-small-loop-variable, 45 | bugprone-undefined-memory-manipulation, 46 | bugprone-undelegated-constructor, 47 | bugprone-unhandled-self-assignment, 48 | bugprone-unused-raii, 49 | bugprone-unused-return-value, 50 | bugprone-use-after-move, 51 | bugprone-virtual-near-miss, 52 | cert-dcl21-cpp, 53 | cert-dcl58-cpp, 54 | cert-err52-cpp, 55 | cert-err60-cpp, 56 | cert-flp30-c, 57 | cert-str34-c, 58 | cppcoreguidelines-interfaces-global-init, 59 | cppcoreguidelines-narrowing-conversions, 60 | cppcoreguidelines-pro-type-member-init, 61 | cppcoreguidelines-pro-type-static-cast-downcast, 62 | cppcoreguidelines-slicing, 63 | google-default-arguments, 64 | google-runtime-operator, 65 | hicpp-exception-baseclass, 66 | hicpp-multiway-paths-covered, 67 | misc-misplaced-const, 68 | misc-new-delete-overloads, 69 | misc-non-copyable-objects, 70 | misc-throw-by-value-catch-by-reference, 71 | misc-unconventional-assign-operator, 72 | misc-uniqueptr-reset-release, 73 | modernize-avoid-bind, 74 | modernize-concat-nested-namespaces, 75 | modernize-deprecated-headers, 76 | modernize-deprecated-ios-base-aliases, 77 | modernize-make-shared, 78 | modernize-make-unique, 79 | modernize-pass-by-value, 80 | modernize-raw-string-literal, 81 | modernize-redundant-void-arg, 82 | modernize-replace-auto-ptr, 83 | modernize-replace-disallow-copy-and-assign-macro, 84 | modernize-replace-random-shuffle, 85 | modernize-return-braced-init-list, 86 | modernize-shrink-to-fit, 87 | 
modernize-unary-static-assert, 88 | modernize-use-bool-literals, 89 | modernize-use-emplace, 90 | modernize-use-equals-default, 91 | modernize-use-equals-delete, 92 | modernize-use-noexcept, 93 | modernize-use-nullptr, 94 | modernize-use-override, 95 | modernize-use-transparent-functors, 96 | modernize-use-uncaught-exceptions, 97 | mpi-buffer-deref, 98 | mpi-type-mismatch, 99 | openmp-use-default-none, 100 | performance-faster-string-find, 101 | performance-for-range-copy, 102 | performance-implicit-conversion-in-loop, 103 | performance-inefficient-algorithm, 104 | performance-inefficient-string-concatenation, 105 | performance-inefficient-vector-operation, 106 | performance-move-const-arg, 107 | performance-move-constructor-init, 108 | performance-no-automatic-move, 109 | performance-noexcept-move-constructor, 110 | performance-trivially-destructible, 111 | performance-type-promotion-in-math-fn, 112 | performance-unnecessary-copy-initialization, 113 | performance-unnecessary-value-param, 114 | portability-simd-intrinsics, 115 | readability-avoid-const-params-in-decls, 116 | readability-const-return-type, 117 | readability-container-size-empty, 118 | readability-convert-member-functions-to-static, 119 | readability-delete-null-pointer, 120 | readability-deleted-default, 121 | readability-inconsistent-declaration-parameter-name, 122 | readability-make-member-function-const, 123 | readability-misleading-indentation, 124 | readability-misplaced-array-index, 125 | readability-non-const-parameter, 126 | readability-redundant-control-flow, 127 | readability-redundant-declaration, 128 | readability-redundant-function-ptr-dereference, 129 | readability-redundant-smartptr-get, 130 | readability-redundant-string-cstr, 131 | readability-redundant-string-init, 132 | readability-simplify-subscript-expr, 133 | readability-static-accessed-through-instance, 134 | readability-static-definition-in-anonymous-namespace, 135 | readability-string-compare, 136 | readability-uniqueptr-delete-release, 137 | readability-use-anyofallof' -------------------------------------------------------------------------------- /IDEAS.md: -------------------------------------------------------------------------------- 1 | ## Code Gen 2 | I'm thinking about adding some table-driven code generation into dsc. 3 | Right now, if you want to add a new operation you have to go through the following steps: 4 | 5 | 1. Declare the API function in `dsc.h` 6 | 2. Implement the function (parameter validation + dispatching) in `dsc.cpp` 7 | 3. For each backend: 8 | 1. Declare the function that actually implements the core logic in `dsc_xxx.h` 9 | 2. Actually implement the core logic of the function in `dsc_xxx.cpp` 10 | 4. Implement the binding in `_bindings.py` 11 | 12 | The actual logic itself for the *majority* of operations is basically the same. 13 | Consider a generic binary operation, the flow is almost always: 14 | 1. Validation + dispatch 15 | ```c++ 16 | dsc_tensor *dsc_add(dsc_ctx *ctx, 17 | dsc_tensor *xa, 18 | dsc_tensor *xb, 19 | dsc_tensor *out) { 20 | validate_binary_params(); 21 | 22 | DSC_DISPATCH(xa->device, add, xa, xb, out); 23 | 24 | cleanup_binary(); 25 | 26 | return out; 27 | } 28 | ``` 29 | 2. Implementation 30 | ```c++ 31 | void dsc_cpu_add(dsc_device *, 32 | const dsc_tensor *xa, 33 | const dsc_tensor *xb, 34 | dsc_tensor *out) { 35 | binary_op(xa, xb, out, cpu_add_op()); 36 | } 37 | ``` 38 | The same goes for unary operations and for reductions. 
The only exceptions at the moment are probably 39 | the GEMM and the operations related to indexing and slicing. 40 | 41 | Also, most of the code in `dsc_ops.h` can be generated trivially. 42 | 43 | And then there are the aspects related to testing and benchmarking: using this kind of approach could also 44 | lead to better testing and benchmarking (i.e. when I specify a new operation I can also specify 'how it should be tested / benchmarked'). 45 | 46 | Another important point: with this approach it could be even easier to implement tracing. For example, 47 | if I want to use the current approach I can just define the parameters that are important and generate 48 | the macros and stuff. 49 | 50 | 51 | **Key things to keep in mind:** 52 | - The generated code must live alongside the handwritten code; I don't have a clear idea on how to do this, and the 53 | same goes for the bindings. For tests and benchmarks it's easier because I can actually put them in multiple files 54 | right away. 55 | - (this is related to the previous point) Limit file proliferation. I don't want the number of files to explode 56 | due to this feature. 57 | - **Generated code must be versioned** (i.e. pushed to GitHub) **and must be modifiable**, and the code generator 58 | must not erase these updates if it's re-run. 59 | 60 | 61 | ## Kernel Generation 62 | This is somewhat related to the previous point. The idea is that for 'high-level' kernels (i.e. kernels that are defined 63 | directly in Python) the overhead of going back and forth between Python and C++ can be significant. 64 | Take for example the softmax kernel: 65 | ```python 66 | @trace('softmax') 67 | def softmax(x: Tensor, axis: int = -1) -> Tensor: 68 | e = exp((x - max(x, axis=axis, keepdims=True))) 69 | sum_e = sum(e, axis=axis, keepdims=True) 70 | return e / sum_e 71 | ``` 72 | This is a correct softmax that uses native operations under the hood, which means each operation will produce a new tensor 73 | that Python must wrap and track and so on. Just by re-writing these same operations in C++ naively, without anything extra, 74 | we get a **~20% speedup**. The Python side will then be replaced by: 75 | ```python 76 | @trace('softmax') 77 | def softmax(x: Tensor, axis: int = -1) -> Tensor: 78 | return Tensor(_dsc_softmax(_get_ctx(), x.c_ptr, axis)) 79 | ``` 80 | 81 | The idea is to add a mechanism to do this sort of code generation automatically. There are a few things to keep in mind: 82 | - Kernels may depend on other 'high-level' kernels (i.e. LayerNorm depends on mean and var). 83 | - It should be possible to switch between the naive Python version and the generated version, e.g. when debugging. 84 | 85 | 86 | ## Memory Management 87 | It's clear that for model inference the general purpose allocator approach is not good enough. Right now, between allocations 88 | and de-allocations more than 6% of the total execution time is wasted managing memory. 89 | I don't know if using a proper arena for everything will work with the Python garbage collector but at least on the C++ side 90 | I should add that (opt-in?). 
91 | This would be even more important if I manage to implement native kernel generation because the softmax example above 92 | otherwise will turn out to look like this: 93 | ```c++ 94 | dsc_tensor *dsc_softmax(dsc_ctx *ctx, 95 | dsc_tensor *DSC_RESTRICT x, 96 | const int axis) { 97 | dsc_tensor *m = dsc_max(ctx, x, nullptr, axis); 98 | dsc_tensor *dif = dsc_sub(ctx, x, m); 99 | dsc_tensor *e = dsc_exp(ctx, dif); 100 | dsc_tensor *sum_e = dsc_sum(ctx, e, nullptr, axis); 101 | 102 | dsc_tensor *out = dsc_div(ctx, e, sum_e); 103 | 104 | dsc_tensor_free(ctx, m); 105 | dsc_tensor_free(ctx, dif); 106 | dsc_tensor_free(ctx, e); 107 | dsc_tensor_free(ctx, sum_e); 108 | return out; 109 | } 110 | ``` 111 | Which is very ugly and cumbersome. -------------------------------------------------------------------------------- /dsc/include/cpu/dsc_iter.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | #include "dsc.h" 10 | 11 | 12 | namespace internal::iter { 13 | // TODO: (4) 14 | template 15 | constexpr int compute_index(const int *DSC_RESTRICT idx, 16 | const int *DSC_RESTRICT stride) { 17 | // Note: computing the index on the fly is way easier than keeping track of the current index 18 | // and increasing/decreasing it after each step, but it requires some benchmarking! 19 | if constexpr (Cur == DSC_MAX_DIMS) { 20 | return 0; 21 | } else { 22 | return idx[Cur] * stride[Cur] + compute_index(idx, stride); 23 | } 24 | } 25 | } 26 | 27 | struct dsc_axis_iterator { 28 | dsc_axis_iterator(const dsc_tensor *x, 29 | const int axis, 30 | const int axis_n = -1) : 31 | x_(x), axis_(axis), 32 | axis_n_((axis_n < 0 || axis_n > x->shape[axis]) ? x->shape[axis] : axis_n) { 33 | } 34 | 35 | DSC_INLINE void next() { 36 | if (++idx_[axis_] < axis_n_) [[likely]] return; 37 | 38 | idx_[axis_] = 0; 39 | bool still_left = false; 40 | for (int i = DSC_MAX_DIMS - 1; i >= 0; --i) { 41 | if (i == axis_) continue; 42 | 43 | if (++idx_[i] < x_->shape[i]) [[likely]] { 44 | still_left = true; 45 | break; 46 | } 47 | idx_[i] = 0; 48 | // If this is the last dimension and we rolled then we're done 49 | end_ = i == 0; 50 | } 51 | // If we are iterating over axis 0 and we arrive here then we're done 52 | end_ |= axis_ == 0 && !still_left; 53 | } 54 | 55 | DSC_INLINE int index() const { 56 | return internal::iter::compute_index(idx_, x_->stride); 57 | } 58 | 59 | DSC_INLINE bool has_next() const { 60 | return !end_; 61 | } 62 | 63 | DSC_INLINE int pos(const int idx) const { 64 | return idx_[idx > 0 ? idx : DSC_MAX_DIMS + idx]; 65 | } 66 | 67 | private: 68 | int idx_[DSC_MAX_DIMS]{}; 69 | const dsc_tensor *DSC_RESTRICT x_; 70 | int axis_; 71 | int axis_n_; 72 | bool end_ = false; 73 | }; 74 | 75 | struct dsc_broadcast_iterator { 76 | dsc_broadcast_iterator(const dsc_tensor *x, const int *out_shape) : x_shape_(x->shape), 77 | x_stride_(x->stride), 78 | out_shape_(out_shape) { 79 | for (int i = 0; i < DSC_MAX_DIMS; ++i) { 80 | x_broadcast_stride_[i] = x_shape_[i] < out_shape_[i] ? 
0 : x_stride_[i]; 81 | } 82 | } 83 | 84 | // Simple strided iterator 85 | dsc_broadcast_iterator(const int *x_shape, const int *x_stride) : x_shape_(x_shape), 86 | x_stride_(x_stride), 87 | out_shape_(x_shape) { 88 | for (int i = 0; i < DSC_MAX_DIMS; ++i) { 89 | x_broadcast_stride_[i] = x_stride_[i]; 90 | } 91 | } 92 | 93 | DSC_INLINE void next() { 94 | for (int i = DSC_MAX_DIMS - 1; i >= 0; --i) { 95 | if (++x_idx_[i] < out_shape_[i]) [[likely]] { 96 | index_ += x_broadcast_stride_[i]; 97 | return; 98 | } 99 | // Rollover this dimension 100 | index_ -= (x_idx_[i] - 1) * x_broadcast_stride_[i]; 101 | x_idx_[i] = 0; 102 | } 103 | } 104 | 105 | DSC_INLINE int index() const { 106 | return index_; 107 | } 108 | 109 | private: 110 | int index_ = 0; 111 | const int *x_shape_, *x_stride_, *out_shape_; 112 | int x_broadcast_stride_[DSC_MAX_DIMS]{}, x_idx_[DSC_MAX_DIMS]{}; 113 | }; 114 | 115 | struct dsc_slice_iterator { 116 | dsc_slice_iterator(const dsc_tensor *x, const int n_slices, const dsc_slice *slices) : 117 | shape_(x->shape), stride_(x->stride), n_dim_(x->n_dim) { 118 | for (int i = 0; i < x->n_dim; ++i) { 119 | const int dim_idx = dsc_tensor_dim_idx(x, i); 120 | if (i < n_slices) { 121 | start_[dim_idx] = slices[i].start; 122 | stop_[dim_idx] = slices[i].stop; 123 | step_[dim_idx] = slices[i].step; 124 | } else { 125 | start_[dim_idx] = 0; 126 | stop_[dim_idx] = shape_[dim_idx]; 127 | step_[dim_idx] = 1; 128 | } 129 | 130 | idx_[dim_idx] = start_[dim_idx]; 131 | } 132 | } 133 | 134 | DSC_INLINE bool has_next() const { 135 | return !end_; 136 | } 137 | 138 | DSC_INLINE void next() { 139 | for (int i = DSC_MAX_DIMS - 1; i >= (DSC_MAX_DIMS - n_dim_); --i) { 140 | idx_[i] += step_[i]; 141 | if ((step_[i] > 0 && idx_[i] < stop_[i]) || 142 | (step_[i] < 0 && idx_[i] > stop_[i])) [[likely]] { 143 | return; 144 | } 145 | idx_[i] = start_[i]; 146 | } 147 | end_ = true; 148 | } 149 | 150 | DSC_INLINE int index() const { 151 | return internal::iter::compute_index<>(idx_, stride_); 152 | } 153 | 154 | private: 155 | const int *shape_; 156 | const int *stride_; 157 | int idx_[DSC_MAX_DIMS]{}; 158 | int start_[DSC_MAX_DIMS]{}; 159 | int stop_[DSC_MAX_DIMS]{}; 160 | int step_[DSC_MAX_DIMS]{}; 161 | const int n_dim_; 162 | bool end_ = false; 163 | }; -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++ 2 | NVCC = nvcc 3 | HIPCC = hipcc 4 | 5 | HIPCCFLAGS = -std=c++20 -I$(ROCM)/include -I./dsc/include/ --offload-arch=native -Wall -Wextra -Wformat \ 6 | -Wcast-qual -Wcast-align -Wstrict-aliasing -Wpointer-arith -Wunused -Wdouble-promotion \ 7 | -Wno-missing-braces -Wcast-align -fno-exceptions -fno-rtti 8 | NVCCFLAGS = -std=c++20 -I$(CUDA)/include -I./dsc/include/ -ccbin=$(CXX) -arch=native \ 9 | -forward-unknown-opts -Wall -Wextra -Wformat -Wnoexcept \ 10 | -Wcast-qual -Wcast-align -Wstrict-aliasing -Wpointer-arith -Wunused -Wdouble-promotion \ 11 | -Wlogical-op -Wcast-align -fno-exceptions -fno-rtti 12 | CXXFLAGS = -std=c++20 -I./dsc/include/ -Wall -Wextra -Wformat -Wnoexcept \ 13 | -Wcast-qual -Wcast-align -Wstrict-aliasing -Wpointer-arith -Wunused -Wdouble-promotion \ 14 | -Wlogical-op -Wcast-align -fno-exceptions -fno-rtti -pthread 15 | LDFLAGS = -lm 16 | 17 | UNAME_M := $(shell uname -m) 18 | UNAME_S := $(shell uname -s) 19 | 20 | ifdef DSC_GPU 21 | # Try to detect the GPU vendor based on the available compiler 22 | DSC_CUDA := $(shell which $(NVCC) 2>/dev/null) 23 | 
DSC_HIP := $(shell which $(HIPCC) 2>/dev/null) 24 | ifdef DSC_CUDA 25 | ifneq ($(wildcard /opt/cuda),) 26 | CUDA ?= /opt/cuda 27 | else 28 | CUDA ?= /usr/local/cuda 29 | endif 30 | else 31 | # Check for HIP only if CUDA is not defined 32 | ifdef DSC_HIP 33 | ifneq ($(wildcard /opt/rocm),) 34 | ROCM ?= /opt/rocm 35 | else 36 | ROCM ?= /usr/local/rocm 37 | endif 38 | endif 39 | endif 40 | endif 41 | 42 | # Make sure only one GPU platform is defined 43 | ifdef DSC_CUDA 44 | ifdef DSC_HIP 45 | $(error ERROR: both DSC_CUDA and DSC_HIP are defined - this is not supported) 46 | endif 47 | endif 48 | 49 | ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64)) 50 | # Use all available CPU extensions, x86 only 51 | CXXFLAGS += -march=native -mtune=native 52 | endif 53 | 54 | ifndef DSC_LOG_LEVEL 55 | ifdef DSC_FAST 56 | DSC_LOG_LEVEL := 1 57 | else 58 | DSC_LOG_LEVEL := 0 59 | endif 60 | endif 61 | 62 | CXXFLAGS += -DDSC_LOG_LEVEL=$(DSC_LOG_LEVEL) 63 | NVCCFLAGS += -DDSC_LOG_LEVEL=$(DSC_LOG_LEVEL) 64 | HIPCCFLAGS += -DDSC_LOG_LEVEL=$(DSC_LOG_LEVEL) 65 | 66 | ifdef DSC_FAST 67 | # -Ofast turns on all the unsafe math optimizations, including -ffinite-math-only this is an issue when testing 68 | # because Inf and NaN have different meaning but will be treated as equals when using -ffinite-math-only. 69 | # When inferencing assuming only finite numbers is correct but since it's doesn't actually hurt performance 70 | # let's keep this flag so we can run our tests without worrying about denormal numbers. 71 | CXXFLAGS += -Ofast -fno-finite-math-only -ffp-contract=fast -funroll-loops -flto=auto -fuse-linker-plugin 72 | NVCCFLAGS += -O3 73 | HIPCCFLAGS += -O3 74 | else 75 | CXXFLAGS += -O0 -fno-omit-frame-pointer -g 76 | NVCCFLAGS += -O0 -fno-omit-frame-pointer -g -G 77 | HIPCCFLAGS += -O0 -fno-omit-frame-pointer -g 78 | endif 79 | 80 | ifdef DSC_TRACING 81 | CXXFLAGS += -DDSC_TRACING=1 82 | NVCCFLAGS += -DDSC_TRACING=1 83 | HIPCCFLAGS += -DDSC_TRACING=1 84 | endif 85 | 86 | # If we are not compiling the shared object and are in debug mode then run in ASAN mode 87 | ifeq ($(MAKECMDGOALS),shared) 88 | CXXFLAGS += -fPIC 89 | NVCCFLAGS += -fPIC 90 | HIPCCFLAGS += -fPIC 91 | endif 92 | 93 | GPU_SRCS := $(wildcard dsc/src/gpu/*.cpp) 94 | GPU_OBJS := $(GPU_SRCS:.cpp=.o) 95 | 96 | # Enable CUDA support 97 | ifdef DSC_CUDA 98 | # BF16 is supported in compute capability >= 8.0 (Ampere) 99 | HAS_BF16_GPU := $(shell compute_major=$$(nvidia-smi --query-gpu=compute_cap --format=noheader | cut -d. -f1); \ 100 | if [ "$${compute_major}" -ge 8 ]; then echo 1; fi) 101 | ifeq ($(HAS_BF16_GPU), 1) 102 | NVCCFLAGS += -DDSC_BF16 103 | CXXFLAGS += -DDSC_BF16 104 | endif 105 | 106 | CXXFLAGS += -I$(CUDA)/include -DDSC_CUDA 107 | NVCCFLAGS += -x cu -DDSC_CUDA 108 | LDFLAGS += -L$(CUDA)/lib64 -lcudart -lcublas 109 | 110 | OBJS += $(GPU_OBJS) 111 | 112 | $(GPU_OBJS): %.o: %.cpp 113 | $(NVCC) $(NVCCFLAGS) -c $< -o $@ 114 | endif 115 | 116 | # Enable HIP support 117 | ifdef DSC_HIP 118 | GPU_TARGETS := $(shell ${ROCM_PATH}/bin/rocm_agent_enumerator) 119 | HAS_BF16_GPU := $(shell echo '${GPU_TARGETS}' | grep -q -E "gfx90a|gfx94[0-2]|gfx103[0-6]" && echo 1) 120 | ifeq ($(HAS_BF16_GPU), 1) 121 | HIPCCFLAGS += -DDSC_BF16 122 | CXXFLAGS += -DDSC_BF16 123 | endif 124 | 125 | # TODO: is -D__HIP_PLATFORM_AMD__ required? 
126 | CXXFLAGS += -I$(ROCM)/include -DDSC_HIP -D__HIP_PLATFORM_AMD__ 127 | HIPCCFLAGS += -DDSC_HIP 128 | LDFLAGS += -L$(ROCM)/lib -lamdhip64 -lrocrand -lrocblas 129 | 130 | OBJS += $(GPU_OBJS) 131 | 132 | $(GPU_OBJS): %.o: %.cpp 133 | $(HIPCC) $(HIPCCFLAGS) -c $< -o $@ 134 | endif 135 | 136 | 137 | $(info dsc build info: ) 138 | $(info OS: $(UNAME_S)) 139 | $(info ARCH: $(UNAME_M)) 140 | $(info CXX: $(shell $(CXX) --version | head -n 1)) 141 | $(info CXXFLAGS: $(CXXFLAGS)) 142 | 143 | ifdef DSC_CUDA 144 | $(info NVCC: $(shell $(NVCC) --version | head -n 4 | tail -n 1)) 145 | $(info NVCCFLAGS: $(NVCCFLAGS)) 146 | endif 147 | 148 | ifdef DSC_HIP 149 | $(info HIPCC: $(shell $(HIPCC) --version | head -n 1 | tail -n 1)) 150 | $(info HIPCCFLAGS: $(HIPCCFLAGS)) 151 | endif 152 | 153 | $(info LDFLAGS: $(LDFLAGS)) 154 | $(info ) 155 | 156 | SRCS = $(wildcard dsc/src/*.cpp) 157 | SRCS += $(wildcard dsc/src/cpu/*.cpp) 158 | OBJS += $(SRCS:.cpp=.o) 159 | 160 | SHARED_LIB = python/dsc/libdsc.so 161 | 162 | .PHONY: clean shared 163 | 164 | clean: 165 | rm -rf *.o *.so *.old $(OBJS) $(GPU_OBJS) $(SHARED_LIB) 166 | 167 | shared: $(OBJS) 168 | $(CXX) $(CXXFLAGS) -shared $(OBJS) -o $(SHARED_LIB) $(LDFLAGS) 169 | 170 | %.o: %.cpp 171 | $(CXX) $(CXXFLAGS) -c $< -o $@ 172 | -------------------------------------------------------------------------------- /dsc/include/cpu/dsc_ops.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | #include "dsc.h" 10 | #include 11 | 12 | struct cpu_cast_op { 13 | template 14 | DSC_INLINE DSC_STRICTLY_PURE Tout operator()(const Tin in) const { 15 | union { 16 | f32 f; 17 | u32 i; 18 | } u; 19 | 20 | // TODO: 64 bit conversion f64 -> bf16 21 | if constexpr (dsc_is_type()) { 22 | // Naive way of converting between BF16 and F32, if this has to be applied to a sequence of 23 | // elements it can be vectorized quite easily. 
24 | u.i = (u32) in << 16; 25 | return (Tout) u.f; 26 | } else if constexpr (dsc_is_type() && dsc_is_type()) { 27 | u.f = in; 28 | u.i >>= 16; 29 | return (Tout) u.i; 30 | } else { 31 | return (Tout) in; 32 | } 33 | } 34 | }; 35 | 36 | struct cpu_add_op { 37 | template 38 | DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 39 | if constexpr (dsc_is_type()) { 40 | return xa || xb; 41 | } else { 42 | return xa + xb; 43 | } 44 | } 45 | }; 46 | 47 | struct cpu_sub_op { 48 | template 49 | DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 50 | if constexpr (dsc_is_type()) { 51 | return xa ^ xb; 52 | } else { 53 | return xa - xb; 54 | } 55 | } 56 | }; 57 | 58 | struct cpu_mul_op { 59 | template 60 | DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 61 | if constexpr (dsc_is_type()) { 62 | return xa && xb; 63 | } else { 64 | return xa * xb; 65 | } 66 | } 67 | }; 68 | 69 | struct cpu_div_op { 70 | template 71 | DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 72 | return xa / xb; 73 | } 74 | }; 75 | 76 | struct cpu_pow_op { 77 | DSC_INLINE DSC_STRICTLY_PURE i32 operator()(const i32 base, const i32 exp) const { 78 | i32 acc = 1; 79 | for (int i = 0; i < exp; ++i) acc *= base; 80 | return acc; 81 | } 82 | 83 | DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 base, const f32 exp) const { 84 | return powf(base, exp); 85 | } 86 | 87 | DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 base, const f64 exp) const { 88 | return pow(base, exp); 89 | } 90 | }; 91 | 92 | struct cpu_cos_op { 93 | DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 94 | return cosf(x); 95 | } 96 | 97 | DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 98 | return cos(x); 99 | } 100 | }; 101 | 102 | struct cpu_sin_op { 103 | DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 104 | return sinf(x); 105 | } 106 | 107 | DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 108 | return sin(x); 109 | } 110 | }; 111 | 112 | struct cpu_tanh_op { 113 | DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 114 | return tanhf(x); 115 | } 116 | 117 | DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 118 | return tanh(x); 119 | } 120 | }; 121 | 122 | struct cpu_sqrt_op { 123 | DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 124 | return sqrtf(x); 125 | } 126 | 127 | DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 128 | return sqrt(x); 129 | } 130 | }; 131 | 132 | struct cpu_exp_op { 133 | DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 134 | return expf(x); 135 | } 136 | 137 | DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 138 | return exp(x); 139 | } 140 | }; 141 | 142 | struct cpu_max_op { 143 | template 144 | DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 145 | return DSC_MAX(xa, xb); 146 | } 147 | }; 148 | 149 | struct cpu_min_op { 150 | template 151 | DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 152 | return DSC_MIN(xa, xb); 153 | } 154 | }; 155 | 156 | struct cpu_eq_op { 157 | template 158 | DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 159 | return xa == xb; 160 | } 161 | }; 162 | 163 | struct cpu_ne_op { 164 | template 165 | DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 166 | return !cpu_eq_op()(xa, xb); 167 | } 168 | }; 169 | 170 | struct cpu_lt_op { 171 | template 
172 | DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 173 | return xa < xb; 174 | } 175 | }; 176 | struct cpu_le_op { 177 | template 178 | DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 179 | return xa <= xb; 180 | } 181 | }; 182 | 183 | struct cpu_gt_op { 184 | template 185 | DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 186 | return xa > xb; 187 | } 188 | }; 189 | 190 | struct cpu_ge_op { 191 | template 192 | DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 193 | return xa >= xb; 194 | } 195 | }; 196 | 197 | template 198 | consteval bool is_comparison_op() { 199 | return dsc_is_type() || 200 | dsc_is_type() || 201 | dsc_is_type() || 202 | dsc_is_type() || 203 | dsc_is_type() || 204 | dsc_is_type(); 205 | } 206 | 207 | template 208 | consteval bool is_bool_arith_op() { 209 | return dsc_is_type() || 210 | dsc_is_type() || 211 | dsc_is_type(); 212 | } -------------------------------------------------------------------------------- /dsc/include/gpu/dsc_tracing.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | #include "dsc.h" 10 | #include "dsc_tracing_common.h" 11 | 12 | #if defined(DSC_TRACING) 13 | 14 | #include "gpu/dsc_gpu.h" 15 | 16 | #undef DSC_INSERT_TYPED_TRACE 17 | #undef DSC_INSERT_NAMED_TRACE 18 | 19 | #define DSC_INSERT_TYPED_TRACE(DEV, T, type_, grid_dim_, block_dim_) \ 20 | dsc_gpu_trace_tracker trace__ { (DEV)->trace_ctx, __FUNCTION__, (type_), (grid_dim_), (block_dim_), &args__ } 21 | 22 | #define DSC_INSERT_NAMED_TRACE(DEV, T, type_, name_, grid_dim_, block_dim_) \ 23 | dsc_gpu_trace_tracker trace__ { (DEV)->trace_ctx, (name_), (type_), (grid_dim_), (block_dim_), &args__ } 24 | 25 | 26 | struct dsc_gpu_trace { 27 | dsc_trace_common base; 28 | 29 | gpu_event start_event, stop_event; 30 | dim3 grid_dim, block_dim; 31 | f32 elapsed_ms; 32 | 33 | DSC_INLINE bool to_eval() const { 34 | return this->elapsed_ms <= 0.f; 35 | } 36 | }; 37 | 38 | template 39 | struct dsc_gpu_trace_tracker { 40 | dsc_gpu_trace_tracker(dsc_trace_ctx *ctx, 41 | const char *name, 42 | const dsc_trace_type type, 43 | const dim3 grid_dim, 44 | const dim3 block_dim, 45 | const T *args) { 46 | using namespace internal::tracing; 47 | 48 | if (dsc_tracing_is_enabled()) { 49 | check_if_full(ctx); 50 | trace_ = next_empty_trace(ctx); 51 | fill_trace(&trace_->base, name, type, args); 52 | DSC_GPU_CHECK(gpu_event_create(&trace_->start_event)); 53 | DSC_GPU_CHECK(gpu_event_create(&trace_->stop_event)); 54 | trace_->grid_dim = grid_dim; 55 | trace_->block_dim = block_dim; 56 | trace_->elapsed_ms = 0.f; 57 | DSC_GPU_CHECK(gpu_event_record(trace_->start_event)); 58 | } 59 | } 60 | 61 | ~dsc_gpu_trace_tracker() { 62 | if (trace_) { 63 | DSC_GPU_CHECK(gpu_event_record(trace_->stop_event)); 64 | } 65 | } 66 | 67 | private: 68 | dsc_gpu_trace *trace_ = nullptr; 69 | }; 70 | 71 | static DSC_INLINE dsc_trace_ctx *dsc_gpu_tracing_init() { 72 | return internal::tracing::init(); 73 | } 74 | 75 | static DSC_INLINE void dsc_gpu_tracing_dispose(const dsc_trace_ctx *ctx) { 76 | internal::tracing::dispose(ctx); 77 | } 78 | 79 | static DSC_INLINE void dsc_gpu_tracing_dump(void *trace, FILE *json_file, 80 | const bool to_console, const bool to_json) 
{ 81 | dsc_gpu_trace *gpu_trace = (dsc_gpu_trace *) trace; 82 | 83 | if (gpu_trace->to_eval()) { 84 | // Make sure this is called once for each trace 85 | DSC_GPU_CHECK(gpu_event_synchronize(gpu_trace->stop_event)); 86 | DSC_GPU_CHECK(gpu_event_elapsed(&gpu_trace->elapsed_ms, gpu_trace->start_event,gpu_trace->stop_event)); 87 | } 88 | 89 | const dsc_trace_common *base = &gpu_trace->base; 90 | 91 | const f64 elapsed_ms = (f64) gpu_trace->elapsed_ms; 92 | const u64 elapsed_us = (u64) (elapsed_ms * 1e3); 93 | const f64 bandwidth = (f64) base->rw_bytes / (elapsed_ms * 1e-3 * DSC_GB(1)); 94 | 95 | if (to_console) { 96 | // So that we can align this 97 | char formatted_kernel_name[256]; 98 | snprintf(formatted_kernel_name, 256, 99 | "%s<(%d,%d,%d), (%d,%d,%d)>", 100 | base->name, 101 | gpu_trace->grid_dim.x, 102 | gpu_trace->grid_dim.y, 103 | gpu_trace->grid_dim.z, 104 | gpu_trace->block_dim.x, 105 | gpu_trace->block_dim.y, 106 | gpu_trace->block_dim.z); 107 | 108 | // Console dumping 109 | printf("*** [%ld] \033[38;5;208m%-12s\033[0m %-40s %.2fms (%6ldus)\t|\t%10.2fGB/s (%ldB)\n", 110 | base->ingestion_time_us, 111 | "GPU", 112 | formatted_kernel_name, 113 | elapsed_ms, 114 | elapsed_us, 115 | bandwidth, 116 | base->rw_bytes); 117 | } 118 | 119 | if (to_json) { 120 | fprintf(json_file, R"({"name":"%s","cat":"%s","ph":"X","ts":%ld,"dur":%ld,"pid":0,"tid":0)", 121 | base->name, 122 | DSC_TRACE_CATEGORY[base->type], 123 | base->ingestion_time_us, 124 | elapsed_us); 125 | fprintf(json_file, R"(,"args":{"bandwidth":"%.2fGB/s")", bandwidth); 126 | internal::tracing::dump_trace_base(json_file, base); 127 | fprintf(json_file, R"==(,"launch_config":{"grid":"(%d,%d,%d)","block":"(%d,%d,%d)"}})==", 128 | gpu_trace->grid_dim.x, 129 | gpu_trace->grid_dim.y, 130 | gpu_trace->grid_dim.z, 131 | gpu_trace->block_dim.x, 132 | gpu_trace->block_dim.y, 133 | gpu_trace->block_dim.z); 134 | fprintf(json_file, R"(})" ",\n"); 135 | } 136 | 137 | if (gpu_trace->to_eval()) { 138 | DSC_GPU_CHECK(gpu_event_destroy(gpu_trace->start_event)); 139 | DSC_GPU_CHECK(gpu_event_destroy(gpu_trace->stop_event)); 140 | } 141 | } 142 | 143 | static DSC_INLINE void dsc_gpu_next_trace(dsc_trace_ctx *ctx) { 144 | internal::tracing::advance_current_trace(ctx); 145 | } 146 | 147 | static DSC_INLINE void dsc_gpu_dump_json_metadata(FILE *json_file, void *extra_info) { 148 | const dsc_gpu_dev_info *dev_info = (dsc_gpu_dev_info *) extra_info; 149 | fprintf(json_file, R"({"name":"process_name","ph":"M","pid":%d,"tid":0,"args":{"name":"%s"},"process_sort_index":100})" ",\n", 150 | dev_info->dev_idx, 151 | dev_info->name); 152 | fprintf(json_file, R"({"name":"thread_name","ph":"M","pid":%d,"tid":0,"args":{"name":"Stream"},"thread_sort_index":101})" ",\n", 153 | dev_info->dev_idx); 154 | } 155 | 156 | #else 157 | 158 | static DSC_INLINE dsc_trace_ctx *dsc_gpu_tracing_init() { return nullptr; } 159 | static DSC_INLINE void dsc_gpu_tracing_dispose(const dsc_trace_ctx *) {} 160 | static DSC_INLINE void dsc_gpu_tracing_dump(void *, FILE *, bool, bool) {} 161 | static DSC_INLINE void dsc_gpu_next_trace(dsc_trace_ctx *) {} 162 | static DSC_INLINE void dsc_gpu_dump_json_metadata(FILE *, void *) {} 163 | 164 | #endif // DSC_TRACING -------------------------------------------------------------------------------- /dsc/include/cpu/dsc_cpu.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 
3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | #include "dsc.h" 10 | 11 | 12 | struct dsc_device; 13 | 14 | 15 | // ============================================================ 16 | // CPU-specific operations 17 | // 18 | 19 | extern void dsc_cpu_cast(dsc_device *dev, 20 | const dsc_tensor *DSC_RESTRICT x, 21 | dsc_tensor *DSC_RESTRICT out); 22 | 23 | extern void dsc_cpu_arange(dsc_device *dev, 24 | dsc_tensor *DSC_RESTRICT x, 25 | f64 start, f64 step); 26 | 27 | extern void dsc_cpu_repeat(dsc_device *dev, 28 | const dsc_tensor *DSC_RESTRICT x, 29 | dsc_tensor *DSC_RESTRICT out, 30 | int repeats, int axis_idx); 31 | 32 | extern void dsc_cpu_randn(dsc_device *dev, dsc_tensor *DSC_RESTRICT x); 33 | 34 | extern void dsc_cpu_kth(dsc_device *dev, 35 | const dsc_tensor *DSC_RESTRICT x, 36 | dsc_tensor *DSC_RESTRICT out, 37 | int k); 38 | 39 | extern void dsc_cpu_multinomial(dsc_device *dev, 40 | const dsc_tensor *DSC_RESTRICT x, 41 | dsc_tensor *DSC_RESTRICT out, 42 | int num_samples); 43 | 44 | extern void dsc_cpu_concat(dsc_device *dev, 45 | dsc_tensor **to_concat, 46 | int tensors, 47 | dsc_tensor *DSC_RESTRICT out, 48 | int axis_idx); 49 | 50 | extern void dsc_cpu_transpose(dsc_device *dev, 51 | const dsc_tensor *DSC_RESTRICT x, 52 | dsc_tensor *DSC_RESTRICT out, 53 | const int *new_shape, 54 | const int *new_stride); 55 | 56 | extern void dsc_cpu_tril(dsc_device *dev, 57 | const dsc_tensor *DSC_RESTRICT x, 58 | int diagonal, 59 | dsc_tensor *DSC_RESTRICT out); 60 | 61 | // ============================================================ 62 | // Indexing and Slicing 63 | // 64 | 65 | extern void dsc_cpu_get_slice(dsc_device *dev, 66 | const dsc_tensor *DSC_RESTRICT x, 67 | dsc_tensor *DSC_RESTRICT out, 68 | int n_slices, const dsc_slice *slices, 69 | bool whole); 70 | 71 | extern void dsc_cpu_set_slice(dsc_device *dev, 72 | dsc_tensor *DSC_RESTRICT xa, 73 | bool xa_scalar, 74 | const dsc_tensor *DSC_RESTRICT xb, 75 | bool xb_scalar, 76 | int n_slices, 77 | const dsc_slice *slices, 78 | bool whole); 79 | 80 | // ============================================================ 81 | // Binary Operations 82 | 83 | extern void dsc_cpu_add(dsc_device *dev, 84 | const dsc_tensor *xa, 85 | const dsc_tensor *xb, 86 | dsc_tensor *out); 87 | 88 | extern void dsc_cpu_sub(dsc_device *dev, 89 | const dsc_tensor *xa, 90 | const dsc_tensor *xb, 91 | dsc_tensor *out); 92 | 93 | extern void dsc_cpu_mul(dsc_device *dev, 94 | const dsc_tensor *xa, 95 | const dsc_tensor *xb, 96 | dsc_tensor *out); 97 | 98 | extern void dsc_cpu_div(dsc_device *dev, 99 | const dsc_tensor *xa, 100 | const dsc_tensor *xb, 101 | dsc_tensor *out); 102 | 103 | extern void dsc_cpu_pow(dsc_device *dev, 104 | const dsc_tensor *xa, 105 | const dsc_tensor *xb, 106 | dsc_tensor *out); 107 | 108 | extern void dsc_cpu_matmul(dsc_device *devdev, 109 | const dsc_tensor *DSC_RESTRICT xa, 110 | const dsc_tensor *DSC_RESTRICT xb, 111 | bool trans_b, 112 | dsc_tensor *DSC_RESTRICT out); 113 | 114 | extern void dsc_cpu_compare(dsc_device *dev, 115 | const dsc_tensor *xa, 116 | const dsc_tensor *xb, 117 | dsc_comparison_op comp, 118 | dsc_tensor *out); 119 | 120 | extern void dsc_cpu_masked_fill(dsc_device *dev, 121 | dsc_tensor *x, 122 | const dsc_tensor *mask, 123 | f64 value); 124 | 125 | extern void dsc_cpu_outer(dsc_device *dev, 126 | const dsc_tensor *DSC_RESTRICT xa, 127 | const dsc_tensor *DSC_RESTRICT xb, 128 | dsc_tensor *DSC_RESTRICT 
out); 129 | 130 | extern void dsc_cpu_where(dsc_device *dev, 131 | const dsc_tensor *DSC_RESTRICT condition, 132 | const dsc_tensor *DSC_RESTRICT input, 133 | const dsc_tensor *DSC_RESTRICT other, 134 | dsc_tensor *DSC_RESTRICT out); 135 | 136 | // ============================================================ 137 | // Unary Operations 138 | 139 | extern void dsc_cpu_cos(dsc_device *dev, 140 | const dsc_tensor *DSC_RESTRICT x, 141 | dsc_tensor *DSC_RESTRICT out); 142 | 143 | extern void dsc_cpu_sin(dsc_device *dev, 144 | const dsc_tensor *DSC_RESTRICT x, 145 | dsc_tensor *DSC_RESTRICT out); 146 | 147 | extern void dsc_cpu_tanh(dsc_device *devdev, 148 | const dsc_tensor *DSC_RESTRICT x, 149 | dsc_tensor *DSC_RESTRICT out); 150 | 151 | extern void dsc_cpu_exp(dsc_device *dev, 152 | const dsc_tensor *DSC_RESTRICT x, 153 | dsc_tensor *DSC_RESTRICT out); 154 | 155 | extern void dsc_cpu_sqrt(dsc_device *dev, 156 | const dsc_tensor *DSC_RESTRICT x, 157 | dsc_tensor *DSC_RESTRICT out); 158 | 159 | // ============================================================ 160 | // Unary Operations Along Axis 161 | 162 | extern void dsc_cpu_sum(dsc_device *dev, 163 | const dsc_tensor *DSC_RESTRICT x, 164 | dsc_tensor *DSC_RESTRICT out, 165 | int axis_idx); 166 | 167 | extern void dsc_cpu_min(dsc_device *dev, 168 | const dsc_tensor *DSC_RESTRICT x, 169 | dsc_tensor *DSC_RESTRICT out, 170 | int axis_idx); 171 | 172 | extern void dsc_cpu_max(dsc_device *dev, 173 | const dsc_tensor *DSC_RESTRICT x, 174 | dsc_tensor *DSC_RESTRICT out, 175 | int axis_idx); -------------------------------------------------------------------------------- /python/tests/test_ops_cpu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 
6 | 7 | import pytest 8 | from random import randint, random 9 | from utils_cpu import * 10 | 11 | 12 | @pytest.fixture(scope='session', autouse=True) 13 | def session_fixture(): 14 | # This is invoked once before starting the test session 15 | dsc.init(int(2**30)) 16 | dsc.set_default_device('cpu') 17 | yield 18 | 19 | 20 | @pytest.fixture(autouse=True) 21 | def teardown_fixture(): 22 | # This is invoked automatically after each test 23 | yield 24 | 25 | 26 | class TestOps: 27 | def test_binary(self): 28 | ops = { 29 | 'add': (np.add, dsc.add), 30 | 'sub': (np.subtract, dsc.sub), 31 | 'mul': (np.multiply, dsc.mul), 32 | 'div': (np.true_divide, dsc.true_div), 33 | 'power': (np.power, dsc.power), 34 | 'equal': (np.equal, dsc.equal), 35 | 'not_equal': (np.not_equal, dsc.not_equal), 36 | 'less': (np.less, dsc.less), 37 | 'less_equal': (np.less_equal, dsc.less_equal), 38 | 'greater': (np.greater, dsc.greater), 39 | 'greater_equal': (np.greater_equal, dsc.greater_equal), 40 | } 41 | for op_name in ops.keys(): 42 | np_op, dsc_op = ops[op_name] 43 | for dtype in DTYPES: 44 | if op_name == 'sub': 45 | np_op = np.bitwise_xor if is_bool(dtype) else np.subtract 46 | 47 | print(f'Testing operator {op_name} with {dtype.__name__}') 48 | shape = [randint(2, 10) for _ in range(4)] 49 | x = random_nd(shape, dtype=dtype) 50 | x_dsc = dsc.from_numpy(x) 51 | 52 | # Same shape 53 | y = random_nd(shape, dtype=dtype) 54 | y_dsc = dsc.from_numpy(y) 55 | 56 | res_np = np_op(x, y) 57 | res_dsc = dsc_op(x_dsc, y_dsc) 58 | r_res_np = np_op(y, x) 59 | r_res_dsc = dsc_op(y_dsc, x_dsc) 60 | assert all_close(res_dsc, res_np) 61 | assert all_close(r_res_dsc, r_res_np) 62 | 63 | # Broadcasting 64 | collapse_idx = randint(0, 3) 65 | shape[collapse_idx] = 1 66 | y_b = random_nd(shape, dtype=dtype) 67 | y_dsc_b = dsc.from_numpy(y_b) 68 | res_np_b = np_op(x, y_b) 69 | res_dsc_b = dsc_op(x_dsc, y_dsc_b) 70 | r_res_np_b = np_op(y_b, x) 71 | r_res_dsc_b = dsc_op(y_dsc_b, x_dsc) 72 | assert all_close(res_dsc_b, res_np_b) 73 | assert all_close(r_res_dsc_b, r_res_np_b) 74 | 75 | # Scalar 76 | if is_float(dtype): 77 | y_s = random() 78 | elif is_bool(dtype): 79 | y_s = bool(randint(0, 1)) 80 | else: 81 | y_s = randint(0, 10) 82 | 83 | res_np_s = np_op(x, y_s) 84 | res_dsc_s = dsc_op(x_dsc, y_s) 85 | r_res_np_s = np_op(y_s, x) 86 | r_res_dsc_s = dsc_op(y_s, x_dsc) 87 | 88 | assert all_close(res_dsc_s, res_np_s) 89 | assert all_close(r_res_dsc_s, r_res_np_s) 90 | 91 | def test_outer(self): 92 | for dtype in DTYPES: 93 | for _ in range(10): 94 | xa = random_nd([randint(2, 50)], dtype) 95 | xb = random_nd([randint(2, 50)], dtype) 96 | xa_dsc = dsc.from_numpy(xa) 97 | xb_dsc = dsc.from_numpy(xb) 98 | 99 | out = np.outer(xa, xb) 100 | out_dsc = dsc.outer(xa_dsc, xb_dsc) 101 | assert all_close(out_dsc, out) 102 | 103 | def test_matmul(self): 104 | def _mnk() -> tuple[int, int, int]: 105 | return randint(50, 100), randint(50, 100), randint(50, 100) 106 | 107 | def _test_matmul(shape_a: List[int], shape_b: List[int], dt: np.dtype): 108 | print(f'Testing {shape_a} @ {shape_b} with {dt.__name__}') 109 | xa = random_nd(shape_a, dtype=dt) 110 | xb = random_nd(shape_b, dtype=dt) 111 | xa_dsc = dsc.from_numpy(xa) 112 | xb_dsc = dsc.from_numpy(xb) 113 | 114 | res = xa @ xb 115 | res_dsc = xa_dsc @ xb_dsc 116 | assert all_close(res_dsc, res) 117 | 118 | for dtype in DSC_DTYPES: 119 | if is_bool(dtype): 120 | continue 121 | # 2D matrices 122 | for _ in range(5): 123 | m, n, k = _mnk() 124 | _test_matmul([m, k], [k, n], dtype) 125 | 126 | # Batched 
case 127 | for _ in range(5): 128 | batch_1, batch_2 = randint(2, 10), randint(2, 10) 129 | m, n, k = _mnk() 130 | _test_matmul([batch_1, batch_2, m, k], [batch_1, batch_2, k, n], dtype) 131 | 132 | # Batched case with broadcasting 133 | for batch_1 in range(1, 6): 134 | for batch_2 in range(1, 6): 135 | m, n, k = _mnk() 136 | _test_matmul([batch_1 if batch_1%2 == 0 else 1, 137 | batch_2 if batch_2%2 == 0 else 1, m, k], 138 | [batch_1 if batch_1%2 == 1 else 1, 139 | batch_2 if batch_2%2 == 1 else 1, k, n], 140 | dtype) 141 | 142 | def test_unary(self): 143 | ops = { 144 | 'sin': (np.sin, dsc.sin), 145 | 'cos': (np.cos, dsc.cos), 146 | 'tanh': (np.tanh, dsc.tanh), 147 | 'exp': (np.exp, dsc.exp), 148 | 'sqrt': (np.sqrt, dsc.sqrt), 149 | } 150 | for op_name in ops.keys(): 151 | np_op, dsc_op = ops[op_name] 152 | for dtype in DTYPES: 153 | print(f'Testing {op_name} with {dtype.__name__}') 154 | x = random_nd([randint(1, 10) for _ in range(4)], dtype=dtype) 155 | x_dsc = dsc.from_numpy(x) 156 | 157 | res_np = np_op(x) 158 | res_dsc = dsc_op(x_dsc) 159 | # There are precision issues when working with non-float types 160 | assert all_close(res_dsc, res_np, 1e-5 if is_float(dtype) else 1e-3) 161 | 162 | def test_unary_axis(self): 163 | ops = { 164 | 'sum': (np.sum, dsc.sum), 165 | 'mean': (np.mean, dsc.mean), 166 | 'var': (np.var, dsc.var), 167 | 'max': (np.max, dsc.max), 168 | 'min': (np.min, dsc.min), 169 | } 170 | for op_name in ops.keys(): 171 | np_op, dsc_op = ops[op_name] 172 | for dtype in DTYPES: 173 | for axis in range(-4, 4): 174 | print(f'Testing {op_name} with {dtype.__name__} along axis {axis}') 175 | x = random_nd( 176 | [randint(1, 10) for _ in range(4)], dtype=dtype 177 | ) 178 | x_dsc = dsc.from_numpy(x) 179 | 180 | res_np = np_op(x, axis=axis, keepdims=True) 181 | res_dsc = dsc_op(x_dsc, axis=axis, keepdims=True) 182 | assert all_close(res_dsc, res_np) 183 | 184 | res_np_2 = np_op(x, axis=axis, keepdims=False) 185 | res_dsc_2 = dsc_op(x_dsc, axis=axis, keepdims=False) 186 | assert all_close(res_dsc_2, res_np_2) 187 | -------------------------------------------------------------------------------- /python/dsc/nn/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | 7 | from ..tensor import Tensor, power, matmul, rsqrt 8 | from ..dtype import Dtype 9 | from ..device import Device 10 | from .._bindings import _dsc_new_tensor 11 | from ..context import _get_ctx 12 | from ..profiler import trace 13 | from typing import Iterator, Dict, Iterable, Any, Tuple, Callable, Optional, OrderedDict, Mapping, List 14 | from abc import ABC, abstractmethod 15 | from tqdm import tqdm 16 | from .utils import safe_load 17 | from . import functional 18 | 19 | 20 | class Parameter(Tensor): 21 | def __init__(self, shape: Tuple[int, ...], dtype: Dtype = Dtype.F32, device: Device = Device.DEFAULT, on_load: Optional[Callable[[Tensor], Tensor]] = None): 22 | # Parameters are lazy tensors (i.e. 
Tensors that don't have an underlying buffer) 23 | super().__init__(_dsc_new_tensor(_get_ctx(), shape, dtype, device, True)) 24 | self._on_load = on_load 25 | 26 | def load(self, x: Tensor): 27 | if self._on_load is not None: 28 | x = self._on_load(x) 29 | super().load(x) 30 | 31 | class Module(ABC): 32 | def __init__(self): 33 | super().__init__() 34 | self._parameters = {} 35 | self._modules = {} 36 | 37 | def register_parameter(self, name: str, param: Parameter): 38 | if name in self._parameters: 39 | raise RuntimeError(f'parameter "{name}" already registered') 40 | 41 | self._parameters[name] = param 42 | 43 | def register_module(self, name: str, module: 'Module'): 44 | if name in self._modules: 45 | raise RuntimeError(f'module "{name}" already registered') 46 | 47 | self._modules[name] = module 48 | 49 | def __setattr__(self, name: str, value: Any): 50 | if isinstance(value, Parameter): 51 | self.register_parameter(name, value) 52 | elif isinstance(value, Module): 53 | self.register_module(name, value) 54 | 55 | super().__setattr__(name, value) 56 | 57 | def parameters(self) -> Iterator[Parameter]: 58 | for param in self._parameters.values(): 59 | yield param 60 | for module in self._modules.values(): 61 | yield from module.parameters() 62 | 63 | def named_parameters(self, prefix: str = '') -> Iterator[Tuple[str, Parameter]]: 64 | for name, param in self._parameters.items(): 65 | yield prefix + ('.' if prefix else '') + name, param 66 | for module_name, module in self._modules.items(): 67 | submodule_prefix = prefix + ('.' if prefix else '') + module_name 68 | yield from module.named_parameters(submodule_prefix) 69 | 70 | def state_dict(self) -> OrderedDict[str, Parameter]: 71 | res = OrderedDict[str, Parameter]() 72 | for name, param in self.named_parameters(): 73 | res[name] = param 74 | return res 75 | 76 | def from_state(self, state_dict: Dict[str, Tensor], 77 | on_hook: Optional[List[Tuple[List[str], Callable[[Tensor], Tensor]]]] = None, 78 | tied: Optional[Dict[str, str]] = None): 79 | with tqdm(total=len(state_dict), desc='Loading model parameters') as pbar: 80 | for name, param in self.named_parameters(): 81 | real_name = name 82 | if tied is not None and name in tied: 83 | name = tied[name] 84 | 85 | if name not in state_dict: 86 | pbar.write(f'{name} not found in DSC model') 87 | pbar.update(1) 88 | continue 89 | 90 | tensor = state_dict[name] 91 | if on_hook: 92 | # on_hook defines transformations on tensors that are called before loading the tensors in DSC 93 | for keys, hook in on_hook: 94 | # If any of the keys starts with 'name' I'll apply the hook 95 | if any(name.endswith(key) for key in keys): 96 | tensor = hook(tensor) 97 | 98 | pbar.set_description(f'{real_name if real_name == name else f"{real_name} (tied to {name})"} {tensor.shape} {tensor.dtype}') 99 | param.load(tensor) 100 | pbar.update(1) 101 | 102 | @abstractmethod 103 | def forward(self, *args, **kwargs): 104 | pass 105 | 106 | def __call__(self, *args, **kwargs): 107 | return self.forward(*args, **kwargs) 108 | 109 | class ModuleList(Module): 110 | def __init__(self, modules: Iterable[Module]): 111 | super().__init__() 112 | for i, module in enumerate(modules): 113 | self.register_module(str(i), module) 114 | 115 | def __len__(self) -> int: 116 | return len(self._modules) 117 | 118 | def __iter__(self) -> Iterator[Module]: 119 | return iter(self._modules.values()) 120 | 121 | def forward(self): 122 | raise NotImplementedError('forward() is not supported in ModuleList') 123 | 124 | class ModuleDict(Module): 
125 | def __init__(self, modules: Mapping[str, Module]): 126 | super().__init__() 127 | for name, module in modules.items(): 128 | setattr(self, name, module) 129 | 130 | def __len__(self) -> int: 131 | return len(self._modules) 132 | 133 | def __iter__(self) -> Iterator[str]: 134 | return iter(self._modules) 135 | 136 | def forward(self): 137 | raise NotImplementedError('forward() is not supported in ModuleDict') 138 | 139 | class Linear(Module): 140 | def __init__(self, in_features: int, out_features: int, bias: bool = True, dtype: Dtype = Dtype.F32): 141 | super().__init__() 142 | self.weight = Parameter((out_features, in_features), dtype=dtype) 143 | self.bias = Parameter((out_features, ), dtype=dtype) if bias else None 144 | 145 | @trace('Linear') 146 | def forward(self, x: Tensor) -> Tensor: 147 | out = matmul(x, self.weight, trans_b=True) 148 | if self.bias: 149 | out += self.bias 150 | return out 151 | 152 | class LayerNorm(Module): 153 | def __init__(self, n_features: int, epsilon: float = 1e-5, dtype: Dtype = Dtype.F32): 154 | super().__init__() 155 | self.epsilon = epsilon 156 | self.weight = Parameter((n_features, ), dtype=dtype) 157 | self.bias = Parameter((n_features, ), dtype=dtype) 158 | 159 | @trace('LayerNorm') 160 | def forward(self, x: Tensor) -> Tensor: 161 | mean = x.mean(-1, keepdims=True) 162 | var = x.var(-1, keepdims=True) 163 | 164 | out = (x - mean) / (var + self.epsilon) ** 0.5 165 | out = out * self.weight + self.bias 166 | 167 | return out 168 | 169 | class RMSNorm(Module): 170 | def __init__(self, in_shape: int, epsilon: float = 1e-6, dtype: Dtype = Dtype.F32): 171 | super().__init__() 172 | self.epsilon = epsilon 173 | self.weight = Parameter((in_shape, ), dtype=dtype) 174 | 175 | @trace('RMSNorm') 176 | def forward(self, x: Tensor) -> Tensor: 177 | var = power(x, 2).mean(-1, keepdims=True) 178 | out = x * rsqrt(var + self.epsilon) 179 | return out * self.weight 180 | 181 | class Embedding(Module): 182 | def __init__(self, num_embeddings: int, embedding_size: int, dtype: Dtype = Dtype.F32): 183 | super().__init__() 184 | self.weight = Parameter((num_embeddings, embedding_size), dtype=dtype) 185 | 186 | @trace('Embedding') 187 | def forward(self, x: Tensor) -> Tensor: 188 | return self.weight[x] 189 | -------------------------------------------------------------------------------- /dsc/include/dsc_device.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 
6 | 7 | #pragma once 8 | 9 | #include "dsc.h" 10 | #include 11 | 12 | #define dsc_node_is_free(PTR) ((PTR)->data == nullptr && (PTR)->next == nullptr && (PTR)->size == 0) 13 | #define dsc_node_mark_free(PTR) \ 14 | do { \ 15 | (PTR)->data = nullptr; \ 16 | (PTR)->next = nullptr; \ 17 | (PTR)->size = 0; \ 18 | } while (0) 19 | 20 | struct dsc_data_buffer { 21 | void *data; 22 | usize size; 23 | int refs; 24 | }; 25 | 26 | struct dsc_free_node { 27 | void *data; 28 | dsc_free_node *next; 29 | usize size; 30 | }; 31 | 32 | enum dsc_memcpy_dir : u8 { 33 | UNUSED, 34 | FROM_DEVICE, 35 | TO_DEVICE, 36 | ON_DEVICE 37 | }; 38 | 39 | 40 | static constexpr dsc_memcpy_dir DSC_MEMCPY_DIRECTIONS_LOOKUP[DSC_MAX_DEVICES][DSC_MAX_DEVICES] = { 41 | {UNUSED, TO_DEVICE}, 42 | {FROM_DEVICE, ON_DEVICE}, 43 | }; 44 | 45 | 46 | struct dsc_device { 47 | dsc_data_buffer used_nodes[DSC_MAX_OBJS]; 48 | dsc_free_node free_nodes[DSC_MAX_OBJS]; 49 | dsc_free_node *head; 50 | void *device_mem; 51 | usize alignment; 52 | 53 | // Extra device-specific infos 54 | void *extra_info; 55 | 56 | dsc_trace_ctx *trace_ctx; 57 | 58 | usize mem_size, used_mem; 59 | dsc_device_type type; 60 | 61 | void (*memcpy) (void *dst, const void *src, usize nb, dsc_memcpy_dir dir); 62 | void (*memset) (void *dst, int c, usize nb); 63 | void (*dispose) (dsc_device *dev); 64 | 65 | // Iterator method to get to the next trace. Will update trace_ctx->current_trace 66 | void (*next_trace) (dsc_trace_ctx *ctx); 67 | void (*dump_trace) (void *trace, FILE *json_file, bool to_console, bool to_json); 68 | // Dump device-specific json metadata events (i.e. set processor name). This is purely cosmetic 69 | void (*dump_json_metadata) (FILE *json_file, void *extra_info); 70 | }; 71 | 72 | namespace internal::alloc { 73 | DSC_INLINE dsc_free_node *find_best(dsc_device *dev, 74 | const usize required_size, 75 | dsc_free_node **prev) { 76 | dsc_free_node *node = dev->head; 77 | dsc_free_node *best = node->size >= required_size ? 
node : nullptr; 78 | dsc_free_node *prev_node = nullptr; 79 | 80 | while (node->next != nullptr) { 81 | if (node->next->size >= required_size && 82 | (best == nullptr || best->size >= node->next->size)) { 83 | prev_node = node; 84 | best = node->next; 85 | } 86 | node = node->next; 87 | } 88 | 89 | *prev = prev_node; 90 | 91 | return best; 92 | } 93 | 94 | DSC_INLINE void node_insert(dsc_free_node **head, 95 | dsc_free_node *prev, 96 | dsc_free_node *to_insert) { 97 | if (prev == nullptr) { 98 | if (*head != nullptr) { 99 | to_insert->next = *head; 100 | } 101 | *head = to_insert; 102 | } else { 103 | if (prev->next == nullptr) { 104 | prev->next = to_insert; 105 | to_insert->next = nullptr; 106 | } else { 107 | to_insert->next = prev->next; 108 | prev->next = to_insert; 109 | } 110 | } 111 | } 112 | 113 | DSC_INLINE void node_remove(dsc_free_node **head, 114 | dsc_free_node *prev, 115 | dsc_free_node *to_remove) { 116 | if (prev == nullptr) { 117 | *head = to_remove->next; 118 | } else { 119 | prev->next = to_remove->next; 120 | } 121 | 122 | dsc_node_mark_free(to_remove); 123 | } 124 | 125 | DSC_INLINE dsc_free_node *next_free_node(dsc_device *dev) { 126 | for (int i = 0; i < DSC_MAX_OBJS; ++i) { 127 | if (dsc_free_node *bin = &dev->free_nodes[i]; dsc_node_is_free(bin)) { 128 | return bin; 129 | } 130 | } 131 | return nullptr; 132 | } 133 | } 134 | 135 | using namespace internal::alloc; 136 | 137 | static DSC_INLINE dsc_data_buffer *dsc_data_alloc(dsc_device *dev, usize nb) { 138 | DSC_ASSERT(dev != nullptr); 139 | DSC_ASSERT(nb > 0); 140 | 141 | nb = DSC_ALIGN(nb, dev->alignment); 142 | 143 | dsc_free_node *prev = nullptr; 144 | dsc_free_node *node = find_best(dev, nb, &prev); 145 | if (node == nullptr) { 146 | DSC_LOG_FATAL("error allocating %.2fKB on %s", DSC_B_TO_KB(nb), DSC_DEVICE_NAMES[dev->type]); 147 | } 148 | 149 | if (const usize left = node->size - nb; left >= dev->alignment) { 150 | dsc_free_node *new_node = next_free_node(dev); 151 | if (new_node == nullptr) { 152 | DSC_LOG_FATAL("memory reached critical fragmentation!"); 153 | } 154 | 155 | node->size = nb; 156 | new_node->size = left; 157 | // The data for the new bin starts after the previous one 158 | new_node->data = (byte *) node->data + node->size; 159 | node_insert(&dev->head, node, new_node); 160 | } 161 | 162 | dsc_data_buffer *data_buf = nullptr; 163 | for (int i = 0; i < DSC_MAX_OBJS; ++i) { 164 | if (dsc_data_buffer *buf = &dev->used_nodes[i]; buf->refs == 0) { 165 | data_buf = buf; 166 | break; 167 | } 168 | } 169 | if (!data_buf) { 170 | DSC_LOG_FATAL("can't allocate any more objects!"); 171 | } 172 | 173 | data_buf->data = node->data; 174 | data_buf->refs = 1; 175 | data_buf->size = node->size; 176 | dev->used_mem += nb; 177 | 178 | node_remove(&dev->head, prev, node); 179 | 180 | return data_buf; 181 | } 182 | 183 | static DSC_INLINE void dsc_data_free(dsc_device *dev, dsc_data_buffer *ptr) { 184 | DSC_ASSERT(dev != nullptr); 185 | DSC_ASSERT(ptr != nullptr); 186 | DSC_ASSERT(ptr->refs > 0); 187 | 188 | ptr->refs--; 189 | 190 | if (ptr->refs > 0) return; 191 | 192 | DSC_LOG_DEBUG("%p will be freed", ptr); 193 | 194 | const uintptr_t ptr_addr = (uintptr_t) ptr->data; 195 | dsc_free_node *new_node = next_free_node(dev); 196 | 197 | dsc_free_node *node = dev->head, *prev = nullptr; 198 | while (node != nullptr) { 199 | if (const uintptr_t node_addr = (uintptr_t) node->data; ptr_addr < node_addr) { 200 | new_node->size = ptr->size; 201 | new_node->next = nullptr; 202 | new_node->data = ptr->data; 203 | 
node_insert(&dev->head, prev, new_node); 204 | break; 205 | } 206 | 207 | prev = node; 208 | node = node->next; 209 | } 210 | 211 | dev->used_mem -= new_node->size; 212 | 213 | // Coalescence 214 | if (new_node->next != nullptr && 215 | (uintptr_t) ((byte *) new_node->data + new_node->size) == (uintptr_t) new_node->next->data) { 216 | new_node->size += new_node->next->size; 217 | node_remove(&dev->head, new_node, new_node->next); 218 | } 219 | 220 | if (prev != nullptr && prev->next != nullptr && 221 | (uintptr_t) ((byte *) prev->data + prev->size) == (uintptr_t) new_node->data) { 222 | prev->size += new_node->size; 223 | node_remove(&dev->head, prev, new_node); 224 | } 225 | } 226 | 227 | extern dsc_device *dsc_cpu_device(usize mem_size); 228 | 229 | #if defined(DSC_CUDA) || defined(DSC_HIP) 230 | extern dsc_device *dsc_gpu_device(usize mem_size, int dev_idx); 231 | #else 232 | static DSC_INLINE dsc_device *dsc_gpu_device(usize, int) { 233 | return nullptr; 234 | } 235 | #endif 236 | 237 | #undef dsc_node_is_free 238 | #undef dsc_node_mark_free -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | Logo 3 | 4 |

5 | DSC 6 |

7 | 8 | [![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) 9 | [![Unit Tests](https://github.com/nirw4nna/dsc/actions/workflows/tests.yml/badge.svg)](https://github.com/nirw4nna/dsc/actions/workflows/tests.yml) 10 | 11 |
12 | 13 | --- 14 | 15 | ## About 16 | DSC is a PyTorch-compatible tensor library and inference framework for machine learning models. 17 | It features a C-compatible low-level API that is wrapped in a modern Python API very similar to NumPy / PyTorch but 18 | with some nice usability improvements. 19 | 20 | 21 | Some key features of DSC include: 22 | - **Intuitive API**: DSC Python API closely resembles NumPy / PyTorch. 23 | 24 | 25 | - **Built-in neural networks support**: DSC comes with `nn.Module` built-in. Porting a model from PyTorch to DSC 26 | is trivial (check out the [examples](https://github.com/nirw4nna/dsc/tree/main/examples/models)). 27 | 28 | 29 | - **Multiple backends**: DSC supports both **CPU** and **CUDA** with other backends being worked on. 30 | Programs written using DSC can seamlessly switch between backends by simply adding a `dsc.set_default_device('...')` 31 | instruction, no changes needed. 32 | 33 | 34 | - **Minimal external dependencies**: DSC doesn't require external libraries to be efficient. 35 | On CPU the core operations are written from scratch in portable C++, this makes code written using DSC extremely portable. 36 | 37 | 38 | - **No runtime allocations**: DSC has its own custom memory allocator, memory is pre-allocated 39 | only once so no extra calls to `malloc()` or `free()` are required. It's also possible 40 | to switch to a linear allocator to remove the (minimal) overhead introduced by a general purpose allocator. 41 | 42 | 43 | --- 44 | 45 | 46 | ## Quick start 47 | Getting started with DSC is very simple. The only requirements are: 48 | - A compiler with good support for C++20 49 | - GNU Make for building 50 | 51 | On a Linux-based system these can be obtained with: 52 | ```shell 53 | sudo apt update 54 | sudo apt install build-essential 55 | ``` 56 | 57 | ### Installation 58 | The recommended way to install DSC is from source: 59 | ```shell 60 | git clone git@github.com:nirw4nna/dsc.git 61 | cd dsc/ 62 | python3 -m venv venv 63 | source venv/bin/activate 64 | python3 -m pip install -e . 65 | ``` 66 | 67 | To build the C++ library: 68 | ```shell 69 | make clean; make shared DSC_FAST=1 70 | ``` 71 | This will compile DSC without any debug information, you can specify different options 72 | to enable/disable specific features: 73 | 74 | | Option | Description | 75 | |---------------|------------------------------------------------------------------------------| 76 | | DSC_LOG_LEVEL | Configure the logging level (values: [0-3] with 0 meaning everything on) | 77 | | DSC_FAST | Turn off logging (level=2) and compile with the highest optimisation level | 78 | | DSC_GPU | Enable GPU support (**default=0**) | 79 | | DSC_MAX_OBJS | Max number of DSC tensors that can be used at the same time (**default=1K**) | 80 | | DSC_TRACING | Enable tracing (**default=0**) | 81 | 82 | To verify that everything worked out as expected try a simple operation: 83 | ```shell 84 | python3 -c "import dsc; x = dsc.arange(10); print(x)" 85 | ``` 86 | 87 | ### Environment Variables 88 | | Variable | Description | 89 | |-----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 90 | | DSC_NUM_THREADS | Use multiple threads (CPU). If you set it to -1 it will use half of your available cores (**default=1**) | 91 | | TRACE | Enable tracing DSC kernels. 
Values: [0-3] with 0 meaning no tracing, 1 dump only to the console, 2 dump only as Perfetto-compatible json, 3 dump as both console and Perfetto. This option requires DSC to be compiled with tracing support enabled (**default=0**) | 92 | 93 | ### Notes on GPU support 94 | DSC supports both AMD and NVIDIA GPUs. If compiled with `DSC_GPU=1` it will automatically detect the appropriate backend. 95 | You can see which backend has been selected by checking the output of the Makefile or, once the compilation is done, 96 | use the Python API: 97 | ```python 98 | import dsc 99 | 100 | if dsc.gpu.is_available(): # If a GPU backend has been detected you can check if it's ROCm or CUDA 101 | dsc.gpu.is_rocm() 102 | dsc.gpu.is_cuda() 103 | ``` 104 | 105 | ### CUDA backend 106 | This provides GPU acceleration on NVIDIA GPUs. To get started make sure to have the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) 107 | installed. 108 | 109 | To build the C++ library with CUDA enabled simply specify `DSC_GPU=1`. CUDA will be detected automatically if you installed it. 110 | 111 | **Note:** if you see errors when compiling with CUDA support make sure that the CUDA installation path specified in the Makefile 112 | is correct. If this is not the case you have to manually update the Makefile or set the `CUDA` environment variable before calling `make`. 113 | 114 | To verify that the CUDA backend is working try: 115 | ```shell 116 | python3 -c "import dsc; print(dsc.gpu.is_available() and dsc.gpu.is_cuda())" 117 | ``` 118 | 119 | ### HIP backend 120 | This provides GPU acceleration on AMD GPUs. To get started make sure to have the [ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html#rocm-install-quick) 121 | installed. 122 | 123 | To build the C++ library with ROCm enabled simply specify `DSC_GPU=1`. ROCm will be detected automatically if you installed it. 124 | 125 | **Note:** if you see errors when compiling with ROCm support make sure that the ROCm installation path specified in the Makefile 126 | is correct. If this is not the case you have to manually update the Makefile or set the `ROCM` environment variable before calling `make`. 127 | 128 | To verify that the ROCm backend is working try: 129 | ```shell 130 | python3 -c "import dsc; print(dsc.gpu.is_available() and dsc.gpu.is_rocm())" 131 | ``` 132 | 133 | ## Setting a default device 134 | The default device in DSC is the CPU. This means that, if you don't specify anything, all the operations will be 135 | performed on the CPU even when a GPU device is available. To set a different device as default you can use 136 | ```python 137 | dsc.set_default_device('gpu') 138 | ``` 139 | This will make the GPU the default device and DSC will perform all the operations there by default. 140 | 141 | ## Running tests 142 | DSC uses `pytest` to run unit tests against NumPy which is the reference for correctness. 143 | 144 | The tests are structured as follows: 145 | - `test_ops_common` and `test_indexing` are used to test operations both on CPU and GPU using NumPy as reference 146 | - `test_ops_cpu` are CPU-specific 147 | - `test_ops_gpu` are GPU-specific and they use PyTorch as reference 148 | 149 | The device on which tests are run can be configured by setting the environment variable `DSC_DEVICE` before calling pytest. 150 | 151 | **Note:** to use PyTorch with a ROCm-compatible GPU please refer to https://pytorch.org/get-started/locally/. 
152 | 153 | To run all the tests simple do: 154 | ```bash 155 | cd python/tests/ 156 | pytest -s .py --no-header --no-summary -q 157 | ``` 158 | **Note:** there are quite a few tests so to run them it's better to compile DSC with `DSC_FAST=1`. 159 | 160 | ## License 161 | BSD-3-Clause 162 | -------------------------------------------------------------------------------- /dsc/include/cpu/dsc_tracing.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | #include "dsc.h" 10 | #include "dsc_tracing_common.h" 11 | 12 | #if defined(DSC_TRACING) 13 | 14 | #include // getpid() 15 | #include // pthread_self() 16 | 17 | 18 | #undef DSC_INSERT_TYPED_TRACE 19 | #undef DSC_INSERT_NAMED_TRACE 20 | 21 | #define DSC_INSERT_TYPED_TRACE(DEV, T, type_) \ 22 | dsc_cpu_trace_tracker trace__ { (DEV)->trace_ctx, __FUNCTION__, (type_), &args__ } 23 | 24 | #define DSC_INSERT_NAMED_TRACE(DEV, T, type_, name_) \ 25 | dsc_cpu_trace_tracker trace__ { (DEV)->trace_ctx, (name_), (type_), &args__ } 26 | 27 | 28 | struct dsc_cpu_trace { 29 | dsc_trace_common base; 30 | 31 | u64 tid, start_us, stop_us; 32 | int pid; 33 | }; 34 | 35 | template 36 | struct dsc_cpu_trace_tracker { 37 | dsc_cpu_trace_tracker(dsc_trace_ctx *ctx, 38 | const char *name, 39 | const dsc_trace_type type, 40 | const T *args) { 41 | using namespace internal::tracing; 42 | 43 | // Memory allocations are CPU-only 44 | static const bool filter_alloc = dsc_get_env("TRACE_ALLOC", 0) == 0; 45 | if (dsc_tracing_is_enabled() && (!filter_alloc || (type != DSC_TENSOR_ALLOC && type != DSC_TENSOR_FREE))) { 46 | check_if_full(ctx); 47 | trace_ = next_empty_trace(ctx); 48 | fill_trace(&trace_->base, name, type, args); 49 | trace_->pid = getpid(); 50 | trace_->tid = pthread_self(); 51 | trace_->start_us = time_us(); 52 | } 53 | } 54 | 55 | ~dsc_cpu_trace_tracker() { 56 | using namespace internal::tracing; 57 | if (trace_) { 58 | trace_->stop_us= time_us(); 59 | } 60 | } 61 | 62 | private: 63 | dsc_cpu_trace *trace_ = nullptr; 64 | }; 65 | 66 | static DSC_INLINE dsc_trace_ctx *dsc_cpu_tracing_init() { 67 | return internal::tracing::init(); 68 | } 69 | 70 | static DSC_INLINE void dsc_cpu_tracing_dispose(const dsc_trace_ctx *ctx) { 71 | internal::tracing::dispose(ctx); 72 | } 73 | 74 | static DSC_INLINE void dsc_cpu_tracing_dump(void *trace, FILE *json_file, 75 | const bool to_console, const bool to_json) { 76 | static constexpr const char *COLOR_NONE = "\033[0m"; 77 | static constexpr const char *COLOR_CUSTOM = "\033[38;5;51m"; // cyan 78 | static constexpr const char *COLOR_COPY = "\033[38;5;201m"; // deep magenta 79 | 80 | const dsc_cpu_trace *cpu_trace = (dsc_cpu_trace *) trace; 81 | 82 | const dsc_trace_common *base = &cpu_trace->base; 83 | const u64 elapsed_us = cpu_trace->stop_us - cpu_trace->start_us; 84 | const f64 bandwidth = (f64) base->rw_bytes / ((f64) elapsed_us * 1e-6 * DSC_GB(1)); 85 | 86 | if (to_console) { 87 | char device_str[16]; 88 | if (base->type == DSC_COPY_OP) { 89 | snprintf(device_str, sizeof(device_str), "%s <- %s", 90 | DSC_DEVICE_NAMES[base->copy.x.device], 91 | DSC_DEVICE_NAMES[base->copy.data_device]); 92 | } else if (base->type == DSC_TO_OP) { 93 | snprintf(device_str, sizeof(device_str), "%s <- %s", 94 | DSC_DEVICE_NAMES[base->to.new_device], 95 | 
DSC_DEVICE_NAMES[base->to.x.device]); 96 | } else if (base->type == DSC_GET_IDX) { 97 | snprintf(device_str, sizeof(device_str), "%s <- %s", 98 | DSC_DEVICE_NAMES[base->get_idx.x.device], 99 | DSC_DEVICE_NAMES[base->get_idx.x.device]); 100 | } else if (base->type == DSC_GET_TENSOR) { 101 | snprintf(device_str, sizeof(device_str), "%s <- %s", 102 | DSC_DEVICE_NAMES[base->get_tensor.x.device], 103 | DSC_DEVICE_NAMES[base->get_tensor.x.device]); 104 | } else { 105 | snprintf(device_str, sizeof(device_str), "CPU"); 106 | } 107 | 108 | const char *ansi_color_1 = "", *ansi_color_2 = ""; 109 | switch (base->type) { 110 | case DSC_COPY_OP: 111 | case DSC_TO_OP: 112 | case DSC_GET_IDX: 113 | case DSC_GET_TENSOR: 114 | ansi_color_1 = COLOR_COPY; 115 | ansi_color_2 = COLOR_NONE; 116 | break; 117 | case DSC_TRACE_CUSTOM: 118 | ansi_color_1 = COLOR_CUSTOM; 119 | ansi_color_2 = COLOR_NONE; 120 | break; 121 | default: 122 | break; 123 | } 124 | 125 | printf("*** [%ld] %-12s %s%-40s%s %.2fms (%6ldus)\t|", 126 | base->ingestion_time_us, 127 | device_str, 128 | ansi_color_1, 129 | base->name, 130 | ansi_color_2, 131 | (f64) elapsed_us * 1e-3, 132 | elapsed_us); 133 | 134 | // Don't show bandwidth for custom traces 135 | if (base->type != DSC_TRACE_CUSTOM && base->type != DSC_TENSOR_ALLOC && base->type != DSC_TENSOR_FREE) { 136 | printf("\t%10.2fGB/s (%ldB)", 137 | bandwidth, 138 | base->rw_bytes); 139 | } 140 | 141 | printf("\n"); 142 | } 143 | 144 | if (to_json) { 145 | fprintf(json_file, R"({"name":"%s","cat":"%s","ph":"X","ts":%ld,"dur":%ld,"pid":%d,"tid":%ld)", 146 | base->name, 147 | DSC_TRACE_CATEGORY[base->type], 148 | base->ingestion_time_us, 149 | elapsed_us, 150 | cpu_trace->pid, 151 | cpu_trace->tid); 152 | 153 | fprintf(json_file, R"(,"args":{)"); 154 | 155 | if (base->type != DSC_TRACE_CUSTOM) fprintf(json_file, R"("bandwidth":"%.2fGB/s")", bandwidth); 156 | 157 | internal::tracing::dump_trace_base(json_file, base); 158 | fprintf(json_file, R"(}})" ",\n"); 159 | } 160 | } 161 | 162 | static DSC_INLINE void dsc_cpu_next_trace(dsc_trace_ctx *ctx) { 163 | internal::tracing::advance_current_trace(ctx); 164 | } 165 | 166 | static DSC_INLINE void dsc_cpu_dump_json_metadata(FILE *json_file, void *) { 167 | fprintf(json_file, R"({"name":"process_name","ph":"M","pid":%d,"tid":%ld,"args":{"name":"CPU"},"process_sort_index":0})" ",\n", getpid(), pthread_self()); 168 | fprintf(json_file, R"({"name":"thread_name","ph":"M","pid":%d,"tid":%ld,"args":{"name":"Main Thread"},"thread_sort_index":1})" ",\n", getpid(), pthread_self()); 169 | } 170 | 171 | static DSC_INLINE void dsc_cpu_insert_user_trace(dsc_trace_ctx *ctx, 172 | const char *name, 173 | const u64 start, 174 | const u64 duration) { 175 | if (!dsc_tracing_is_enabled()) return; 176 | 177 | dsc_cpu_trace *trace = internal::tracing::next_empty_trace(ctx); 178 | internal::tracing::fill_trace(&trace->base, name, DSC_TRACE_CUSTOM); 179 | // For user-generated traces the ingestion time must be inserted manually 180 | trace->base.ingestion_time_us = start; 181 | trace->start_us = start; 182 | trace->stop_us = start + duration; 183 | // NOTE: maybe it's a good idea to put these kind of events on a separate pid/tid? 
184 | trace->pid = getpid(); 185 | trace->tid = pthread_self(); 186 | } 187 | 188 | #else 189 | 190 | static DSC_INLINE dsc_trace_ctx *dsc_cpu_tracing_init() { return nullptr; } 191 | static DSC_INLINE void dsc_cpu_tracing_dispose(const dsc_trace_ctx *) {} 192 | static DSC_INLINE void dsc_cpu_tracing_dump(void *, FILE *, bool, bool) {} 193 | static DSC_INLINE void dsc_cpu_next_trace(dsc_trace_ctx *) {} 194 | static DSC_INLINE void dsc_cpu_dump_json_metadata(FILE *, void *) {} 195 | static DSC_INLINE void dsc_cpu_insert_user_trace(dsc_trace_ctx *, const char *, const u64, const u64) {} 196 | 197 | #endif // DSC_TRACING -------------------------------------------------------------------------------- /python/tests/test_indexing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 6 | 7 | import pytest 8 | from utils_cpu import * 9 | from random import randint, random 10 | from typing import List 11 | import math 12 | 13 | 14 | @pytest.fixture(scope='session', autouse=True) 15 | def session_fixture(): 16 | # This is invoked once before starting the test session 17 | dsc.init(int(2**30)) 18 | print(f'Running tests on {DEVICE}') 19 | dsc.set_default_device(DEVICE) 20 | yield 21 | 22 | 23 | @pytest.fixture(autouse=True) 24 | def teardown_fixture(): 25 | # This is invoked automatically after each test 26 | yield 27 | 28 | 29 | class TestIndexing: 30 | def test_get_idx(self): 31 | # The idea is to start with 1D tensors and then, for all dtypes, test with a growing number of indexes 32 | # from 1 up to the number of dimensions (to select a scalar value). Given the number of indexes we generate 33 | # a bunch of random pairs to try and cover most use cases. 
34 | for n_dim in range(4): 35 | for dtype in DTYPES: 36 | x = random_nd([10 for _ in range(n_dim + 1)], dtype=dtype) 37 | x_dsc = dsc.from_numpy(x) 38 | 39 | for indexes in range(n_dim + 1): 40 | for _ in range(10): 41 | idx = tuple(randint(-10, 9) for _ in range(indexes + 1)) 42 | res = x[idx] 43 | res_dsc = x_dsc[idx] 44 | if isinstance(res_dsc, dsc.Tensor): 45 | assert all_close(res_dsc, res) 46 | else: 47 | assert np.isclose(res, res_dsc) 48 | def test_get_tensor(self): 49 | for dtype in DTYPES: 50 | rows = randint(1, 100) 51 | cols = randint(1, 100) 52 | x = random_nd([rows, cols], dtype=dtype) 53 | x_dsc = dsc.from_numpy(x) 54 | 55 | indexes = np.array([randint(0, rows - 1) for _ in range(randint(1, rows))]).astype(np.int32) 56 | # Indexes are always on CPU 57 | indexes_dsc = dsc.from_numpy(indexes, device='cpu') 58 | 59 | res = x[indexes] 60 | res_dsc = x_dsc[indexes_dsc] 61 | assert all_close(res_dsc, res) 62 | 63 | 64 | @staticmethod 65 | def _validate_slice(sl: slice, max_dim: int) -> bool: 66 | s_start = sl.start 67 | s_stop = sl.stop 68 | s_step = sl.step 69 | san_start = s_start if s_start >= 0 else s_start + max_dim 70 | san_stop = s_stop if s_stop >= 0 else s_stop + max_dim 71 | # Some of these checks should probably be handles gracefully by DSC 72 | if s_step == 0 or san_start == san_stop: 73 | return False 74 | if (s_step > 0 and san_stop < san_start) or ( 75 | s_step < 0 and san_stop > san_start 76 | ): 77 | return False 78 | return True 79 | 80 | def test_get_slice(self): 81 | # Note: this should probably be more exhaustive 82 | x_1d = random_nd([10], np.float32) 83 | x_1d_dsc = dsc.from_numpy(x_1d) 84 | 85 | for start in range(-10, 10): 86 | for stop in range(-10, 10): 87 | for step in range(-10, 10): 88 | s = slice(start, stop, step) 89 | if not TestIndexing._validate_slice(s, 10): 90 | continue 91 | assert all_close(x_1d_dsc[s], x_1d[s]) 92 | 93 | x_2d = random_nd([5, 5], np.float32) 94 | x_2d_dsc = dsc.from_numpy(x_2d) 95 | for start in range(-5, 5): 96 | for stop in range(-5, 5): 97 | for step in range(-5, 5): 98 | s = slice(start, stop, step) 99 | if not TestIndexing._validate_slice(s, 5): 100 | continue 101 | assert all_close( 102 | x_2d_dsc[(slice(None, None, None), s)], 103 | x_2d[(slice(None, None, None), s)], 104 | ) 105 | 106 | for extra_dim in range(-5, 5): 107 | for start in range(-5, 5): 108 | for stop in range(-5, 5): 109 | for step in range(-5, 5): 110 | s = slice(start, stop, step) 111 | if not TestIndexing._validate_slice(s, 5): 112 | continue 113 | 114 | x_dsc_1 = x_2d_dsc[(extra_dim, s)] 115 | x_np_1 = x_2d[(extra_dim, s)] 116 | assert all_close(x_dsc_1, x_np_1) 117 | 118 | x_dsc_2 = x_2d_dsc[(s, extra_dim)] 119 | x_np_2 = x_2d[(s, extra_dim)] 120 | assert all_close(x_dsc_2, x_np_2) 121 | 122 | def test_set_idx(self): 123 | for n_dim in range(1, 5): 124 | for dtype in DTYPES: 125 | x = random_nd([10 for _ in range(n_dim)], dtype=dtype) 126 | x_dsc = dsc.from_numpy(x) 127 | 128 | for indexes in range(1, n_dim): 129 | for _ in range(10): 130 | idx = tuple(randint(-10, 9) for _ in range(indexes)) 131 | val = ( 132 | random() + 1 133 | if indexes == n_dim 134 | else random_nd( 135 | [10 for _ in range(n_dim - indexes)], dtype=dtype 136 | ) 137 | ) 138 | x[idx] = val 139 | x_dsc[idx] = val 140 | assert all_close(x_dsc, x) 141 | 142 | def test_set_slice(self): 143 | def _shape_from_slice(sl: slice, max_dim: int) -> List[int]: 144 | real_start = sl.start if sl.start >= 0 else sl.start + max_dim 145 | real_stop = sl.stop if sl.stop >= 0 else sl.stop + 
max_dim 146 | return [math.ceil(math.fabs(real_start - real_stop) / math.fabs(sl.step))] 147 | 148 | # This is not exhaustive, but it's good enough for now 149 | x_1d = random_nd([10], np.float32) 150 | x_1d_dsc = dsc.from_numpy(x_1d) 151 | 152 | x_1d[:] = np.ones(10, dtype=np.float32) 153 | x_1d_dsc[:] = np.ones(10, dtype=np.float32) 154 | assert all_close(x_1d_dsc, x_1d) 155 | 156 | for start in range(-10, 10): 157 | for stop in range(-10, 10): 158 | for step in range(-10, 10): 159 | s = slice(start, stop, step) 160 | if not TestIndexing._validate_slice(s, 10): 161 | continue 162 | x_1d[s] = 1516.0 163 | x_1d_dsc[s] = 1516.0 164 | assert all_close(x_1d_dsc, x_1d) 165 | 166 | val_shape = _shape_from_slice(s, 10) 167 | val = random_nd(val_shape, dtype=np.float32) 168 | x_1d[s] = val 169 | x_1d_dsc[s] = val 170 | assert all_close(x_1d_dsc, x_1d) 171 | 172 | x_2d = random_nd([5, 5], np.float32) 173 | x_2d_dsc = dsc.from_numpy(x_2d) 174 | 175 | for extra_dim in range(-5, 5): 176 | for start in range(-5, 5): 177 | for stop in range(-5, 5): 178 | for step in range(-5, 5): 179 | s = slice(start, stop, step) 180 | if not TestIndexing._validate_slice(s, 5): 181 | continue 182 | 183 | x_2d[(extra_dim, s)] = 12.0 184 | x_2d_dsc[(extra_dim, s)] = 12.0 185 | assert all_close(x_2d_dsc, x_2d) 186 | 187 | x_2d[(s, extra_dim)] = -1.55 188 | x_2d_dsc[(s, extra_dim)] = -1.55 189 | assert all_close(x_2d_dsc, x_2d) 190 | 191 | val_shape = _shape_from_slice(s, 5) 192 | val = random_nd(val_shape, np.float32) 193 | x_2d[(extra_dim, s)] = val 194 | x_2d_dsc[(extra_dim, s)] = val 195 | assert all_close(x_2d_dsc, x_2d) 196 | 197 | val = random_nd(val_shape, np.float32) 198 | x_2d[(s, extra_dim)] = val 199 | x_2d_dsc[(s, extra_dim)] = val 200 | assert all_close(x_2d_dsc, x_2d) -------------------------------------------------------------------------------- /python/tests/test_ops_common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 
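# Note: DTYPES, DSC_DTYPES, DEVICE, random_nd and all_close come from utils_cpu (imported below with *).
# DEVICE is presumably resolved from the DSC_DEVICE environment variable described in the README, so the
# NumPy-backed assertions in this file run unchanged on either the CPU or the GPU backend.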
6 | 7 | import pytest 8 | from random import randint, random 9 | from itertools import permutations 10 | from utils_cpu import * 11 | 12 | 13 | @pytest.fixture(scope='session', autouse=True) 14 | def session_fixture(): 15 | # This is invoked once before starting the test session 16 | dsc.init(int(2**30)) 17 | print(f'Running tests on {DEVICE}') 18 | dsc.set_default_device(DEVICE) 19 | yield 20 | 21 | 22 | @pytest.fixture(autouse=True) 23 | def teardown_fixture(): 24 | # This is invoked automatically after each test 25 | yield 26 | 27 | class TestInit: 28 | def test_arange(self): 29 | for _ in range(10): 30 | n = randint(1, 10_000) 31 | for dtype in DTYPES: 32 | if is_bool(dtype): 33 | continue 34 | print(f'Tensing arange with N={n} and dtype={dtype.__name__} ') 35 | res_np = np.arange(n, dtype=dtype) 36 | res_dsc = dsc.arange(n, dtype=DSC_DTYPES[dtype]) 37 | assert all_close(res_dsc, res_np) 38 | 39 | def test_random(self): 40 | for _ in range(10): 41 | shape = tuple([randint(1, 10) for _ in range(4)]) 42 | for dtype in DTYPES: 43 | if not is_float(dtype): 44 | continue 45 | print(f'Tensing randn with dtype={dtype.__name__} ') 46 | 47 | res_np = np.random.randn(*shape).astype(dtype) 48 | res_dsc = dsc.randn(*shape, dtype=DSC_DTYPES[dtype]) 49 | res_dsc_np = res_dsc.numpy() 50 | 51 | assert res_dsc_np.dtype == res_np.dtype 52 | assert res_dsc_np.shape == res_np.shape 53 | 54 | def test_creation(): 55 | for n_dim in range(4): 56 | for dtype in DTYPES: 57 | shape = tuple(randint(1, 20) for _ in range(n_dim + 1)) 58 | fill = random() 59 | 60 | x = np.full(shape, fill_value=fill, dtype=dtype) 61 | x_dsc = dsc.full(shape, fill_value=fill, dtype=DSC_DTYPES[dtype]) 62 | assert all_close(x_dsc, x) 63 | 64 | like = np.ones([randint(1, 10) for _ in range(n_dim + 1)]) 65 | 66 | x = np.full_like(like, fill_value=fill, dtype=dtype) 67 | x_dsc = dsc.full_like(like, fill_value=fill, dtype=DSC_DTYPES[dtype]) 68 | assert all_close(x_dsc, x) 69 | 70 | x = np.ones(shape, dtype=dtype) 71 | x_dsc = dsc.ones(shape, dtype=DSC_DTYPES[dtype]) 72 | assert all_close(x_dsc, x) 73 | 74 | x = np.ones_like(like, dtype=dtype) 75 | x_dsc = dsc.ones_like(like, dtype=DSC_DTYPES[dtype]) 76 | assert all_close(x_dsc, x) 77 | 78 | x = np.zeros(shape, dtype=dtype) 79 | x_dsc = dsc.zeros(shape, dtype=DSC_DTYPES[dtype]) 80 | assert all_close(x_dsc, x) 81 | 82 | x = np.zeros_like(like, dtype=dtype) 83 | x_dsc = dsc.zeros_like(like, dtype=DSC_DTYPES[dtype]) 84 | assert all_close(x_dsc, x) 85 | 86 | def test_reshape(): 87 | x = np.ones((10, 10)) 88 | x_dsc = dsc.from_numpy(x) 89 | assert all_close(x_dsc.reshape(4, 5, 5), x.reshape(4, 5, 5)) 90 | assert all_close(x_dsc.reshape([4, 5, 5]), x.reshape([4, 5, 5])) 91 | assert all_close(x_dsc.reshape((4, 5, 5)), x.reshape((4, 5, 5))) 92 | 93 | assert all_close(x_dsc.reshape(-1, 5), x.reshape(-1, 5)) 94 | assert all_close(x_dsc.reshape([-1, 5]), x.reshape([-1, 5])) 95 | assert all_close(x_dsc.reshape((-1, 5)), x.reshape((-1, 5))) 96 | 97 | def test_concat(): 98 | for n_dim in range(1, 5): 99 | for dtype in DTYPES: 100 | shape = [randint(2, 10) for _ in range(n_dim)] 101 | for axis_idx in range(n_dim): 102 | print( 103 | f'Testing concat with {n_dim}-dimensional tensors of type {dtype.__name__} on axis {axis_idx}' 104 | ) 105 | shape_x1 = list(shape) 106 | shape_x1[axis_idx] = randint(2, 10) 107 | shape_x2 = list(shape) 108 | shape_x2[axis_idx] = randint(2, 10) 109 | x1 = random_nd(shape_x1, dtype) 110 | x2 = random_nd(shape_x2, dtype) 111 | x1_dsc = dsc.from_numpy(x1) 112 | x2_dsc = 
dsc.from_numpy(x2) 113 | 114 | res_np = np.concat((x1, x2), axis_idx) 115 | res_dsc = dsc.concat((x1_dsc, x2_dsc), axis_idx) 116 | assert all_close(res_dsc, res_np) 117 | 118 | # Test flatten 119 | res_np_flat = np.concat((x1, x2), None) 120 | res_dsc_flat = dsc.concat((x1_dsc, x2_dsc), None) 121 | assert all_close(res_dsc_flat, res_np_flat) 122 | 123 | def test_split(): 124 | for n_dim in range(1, 5): 125 | for dtype in DTYPES: 126 | for axis_idx in range(n_dim): 127 | shape = [randint(2, 10) for _ in range(n_dim)] 128 | print(f'Testing split with {n_dim}-dimensional tensors of type {dtype.__name__} on axis {axis_idx}') 129 | ne = shape[axis_idx] 130 | multi = randint(1, 10) 131 | shape[axis_idx] *= multi 132 | x = random_nd(shape, dtype) 133 | x_dsc = dsc.from_numpy(x) 134 | 135 | res = np.split(x, multi, axis=axis_idx) 136 | res_dsc = dsc.split(x_dsc, ne, axis=axis_idx) 137 | assert len(res) == len(res_dsc) 138 | for r_np, r_dsc in zip(res, res_dsc): 139 | assert all_close(r_dsc, r_np) 140 | 141 | def test_repeat(): 142 | for n_dim in range(1, 5): 143 | for dtype in DTYPES: 144 | shape = [randint(2, 10) for _ in range(n_dim)] 145 | for axis_idx in range(n_dim): 146 | print(f'Testing repeat with {n_dim}-dimensional tensors of type {dtype.__name__} on axis {axis_idx}') 147 | x = random_nd(shape, dtype) 148 | x_dsc = dsc.from_numpy(x) 149 | repeats = randint(2, 5) 150 | res = np.repeat(x, repeats, axis=axis_idx) 151 | res_dsc = dsc.repeat(x_dsc, repeats, axis=axis_idx) 152 | assert all_close(res_dsc, res) 153 | 154 | def test_where(): 155 | for n_dim in range(1, 5): 156 | for dtype in DTYPES: 157 | print(f'Testing where with {n_dim}-dimensional condition tensor and values of type {dtype.__name__}') 158 | x = np.random.choice([True, False], size=tuple([randint(1, 10) for _ in range(n_dim)])) 159 | values = random_nd([2], dtype=dtype) 160 | this = values[0]; that = values[1] 161 | x_dsc = dsc.from_numpy(x) 162 | res = np.where(x, this, that) 163 | res_dsc = dsc.where(x_dsc, this, that) 164 | assert all_close(res_dsc, res) 165 | 166 | def test_transpose(): 167 | for n_dim in range(2, 5): 168 | for dtype in DTYPES: 169 | print( 170 | f'Testing transpose with {n_dim}-dimensional tensors of type {dtype.__name__}' 171 | ) 172 | shape = [randint(2, 10) for _ in range(n_dim)] 173 | x = random_nd(shape, dtype) 174 | x_dsc = dsc.from_numpy(x) 175 | # Simple transpose 176 | res_np_simple = np.transpose(x) 177 | res_dsc_simple = dsc.transpose(x_dsc) 178 | assert all_close(res_dsc_simple, res_np_simple) 179 | 180 | # Test with all the permutations of axes, both positive and negative 181 | for axes in permutations(range(-n_dim, 0), n_dim): 182 | res_np = np.transpose(x, axes) 183 | res_dsc = dsc.transpose(x_dsc, axes) 184 | assert all_close(res_dsc, res_np) 185 | 186 | for axes in permutations(range(0, n_dim), n_dim): 187 | res_np = np.transpose(x, axes) 188 | res_dsc = dsc.transpose(x_dsc, axes) 189 | assert all_close(res_dsc, res_np) 190 | 191 | def test_tril(): 192 | for n_dim in range(2, 5): 193 | for dtype in DTYPES: 194 | x = random_nd([randint(1, 10) for _ in range(n_dim)], dtype) 195 | x_dsc = dsc.from_numpy(x) 196 | for k in range(-1, 2): 197 | print(f'Testing tril with {n_dim}-dimensional tensors of type {dtype.__name__} k={k}') 198 | res = np.tril(x, k) 199 | res_dsc = dsc.tril(x_dsc, k) 200 | assert all_close(res_dsc, res) 201 | 202 | def test_masked_fill(): 203 | for n_dim in range(1, 5): 204 | for dtype in DTYPES: 205 | if not is_float(dtype): 206 | continue 207 | 208 | print(f'Testing 
masked_fill with {n_dim}-dimensional tensors of type {dtype.__name__}') 209 | x = random_nd([randint(1, 10) for _ in range(n_dim)], dtype) 210 | mask = random_nd(x.shape, np.bool) 211 | x_dsc = dsc.from_numpy(x) 212 | mask_dsc = dsc.from_numpy(mask) 213 | fill = random() 214 | 215 | x[mask] = fill 216 | res_dsc = x_dsc.masked_fill(mask_dsc, fill) 217 | assert all_close(res_dsc, x) 218 | -------------------------------------------------------------------------------- /dsc/include/gpu/dsc_gpu.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | #include "dsc.h" 10 | // #include "dsc_device.h" 11 | 12 | #if defined(DSC_CUDA) && defined(DSC_HIP) 13 | # error "DSC can't be compiled with both CUDA and HIP support" 14 | #endif 15 | 16 | 17 | #if defined(DSC_CUDA) || defined(DSC_HIP) 18 | 19 | #if defined(DSC_CUDA) 20 | # include "platform/dsc_cuda_platform.h" 21 | #endif 22 | 23 | #if defined(DSC_HIP) 24 | # include "platform/dsc_hip_platform.h" 25 | #endif 26 | 27 | #define DSC_GPU_KERNEL __global__ 28 | #define DSC_GPU_FUNC __device__ 29 | #define DSC_GPU_DEFAULT_THREADS ((uint) 256) 30 | #define DSC_GPU_MAX_BLOCKS ((uint) 256) 31 | 32 | #define DSC_GPU_BLOCKS(n) DSC_MIN(DSC_GPU_MAX_BLOCKS, DSC_CEIL(n, DSC_GPU_DEFAULT_THREADS)) 33 | #define DSC_GPU_TID() const int tid = (int) (threadIdx.x + blockIdx.x * blockDim.x) 34 | #define DSC_GPU_STRIDE() const int stride = (int) (blockDim.x * gridDim.x) 35 | 36 | struct dsc_device; 37 | 38 | struct dsc_gpu_dev_info { 39 | char name[256]; 40 | gpu_rand_state *rand_state; 41 | gpu_blas_handle blas_handle; 42 | int dev_idx; 43 | dsc_gpu_platform platform; 44 | }; 45 | 46 | // ============================================================ 47 | // Utilities 48 | // 49 | 50 | static DSC_INLINE int dsc_gpu_devices() { 51 | int devices; 52 | DSC_GPU_CHECK(gpu_get_device_count(&devices)); 53 | return devices; 54 | } 55 | 56 | static DSC_INLINE int dsc_gpu_dev_capability(const int dev) { 57 | gpu_device_props prop{}; 58 | DSC_GPU_CHECK(gpu_get_device_properties(&prop, dev)); 59 | return prop.major * 100 + prop.minor * 10; 60 | } 61 | 62 | static DSC_INLINE void dsc_gpu_dev_name(const int dev, char *dst) { 63 | gpu_device_props prop{}; 64 | DSC_GPU_CHECK(gpu_get_device_properties(&prop, dev)); 65 | strncpy(dst, prop.name, 256); 66 | } 67 | 68 | static DSC_INLINE usize dsc_gpu_dev_mem(const int dev) { 69 | gpu_device_props prop{}; 70 | DSC_GPU_CHECK(gpu_get_device_properties(&prop, dev)); 71 | return prop.totalGlobalMem; 72 | } 73 | 74 | static DSC_INLINE void dsc_gpu_sync() { 75 | DSC_GPU_CHECK(gpu_device_sync()); 76 | } 77 | 78 | static DSC_INLINE bool dsc_gpu_has_bf16() { 79 | #if defined(DSC_BF16) 80 | return true; 81 | #else 82 | return false; 83 | #endif 84 | } 85 | 86 | // ============================================================ 87 | // GPU-specific operations 88 | // 89 | 90 | extern void dsc_gpu_cast(dsc_device *dev, const dsc_tensor *DSC_RESTRICT x, 91 | dsc_tensor *DSC_RESTRICT out); 92 | 93 | extern void dsc_gpu_arange(dsc_device *dev, dsc_tensor *DSC_RESTRICT x, 94 | f64 start, f64 step); 95 | 96 | extern void dsc_gpu_repeat(dsc_device *dev, 97 | const dsc_tensor *DSC_RESTRICT x, 98 | dsc_tensor *DSC_RESTRICT out, 99 | int repeats, int axis_idx); 100 | 101 | extern void 
dsc_gpu_randn(dsc_device *dev, dsc_tensor *DSC_RESTRICT x); 102 | 103 | extern void dsc_gpu_concat(dsc_device *dev, 104 | dsc_tensor **to_concat, 105 | int tensors, 106 | dsc_tensor *DSC_RESTRICT out, 107 | int axis_idx); 108 | 109 | extern void dsc_gpu_transpose(dsc_device *dev, 110 | const dsc_tensor *DSC_RESTRICT x, 111 | dsc_tensor *DSC_RESTRICT out, 112 | const int *new_shape, 113 | const int *new_stride); 114 | 115 | extern void dsc_gpu_tril(dsc_device *dev, 116 | const dsc_tensor *DSC_RESTRICT x, 117 | int diagonal, 118 | dsc_tensor *DSC_RESTRICT out); 119 | 120 | // ============================================================ 121 | // Indexing and Slicing 122 | 123 | extern void dsc_gpu_get_slice(dsc_device *dev, 124 | const dsc_tensor *DSC_RESTRICT x, 125 | dsc_tensor *DSC_RESTRICT out, 126 | int n_slices, const dsc_slice *slices, 127 | bool whole); 128 | 129 | extern void dsc_gpu_set_slice(dsc_device *dev, 130 | dsc_tensor *DSC_RESTRICT xa, 131 | bool xa_scalar, 132 | const dsc_tensor *DSC_RESTRICT xb, 133 | bool xb_scalar, 134 | int n_slices, 135 | const dsc_slice *slices, 136 | bool whole); 137 | 138 | // ============================================================ 139 | // Binary Operations 140 | 141 | extern void dsc_gpu_add(dsc_device *dev, 142 | const dsc_tensor *xa, 143 | const dsc_tensor *xb, 144 | dsc_tensor *out); 145 | 146 | extern void dsc_gpu_sub(dsc_device *dev, 147 | const dsc_tensor *xa, 148 | const dsc_tensor *xb, 149 | dsc_tensor *out); 150 | 151 | extern void dsc_gpu_mul(dsc_device *dev, 152 | const dsc_tensor *xa, 153 | const dsc_tensor *xb, 154 | dsc_tensor *out); 155 | 156 | extern void dsc_gpu_div(dsc_device *dev, 157 | const dsc_tensor *xa, 158 | const dsc_tensor *xb, 159 | dsc_tensor *out); 160 | 161 | extern void dsc_gpu_pow(dsc_device *dev, 162 | const dsc_tensor *xa, 163 | const dsc_tensor *xb, 164 | dsc_tensor *out); 165 | 166 | extern void dsc_gpu_matmul(dsc_device *dev, 167 | const dsc_tensor *DSC_RESTRICT xa, 168 | const dsc_tensor *DSC_RESTRICT xb, 169 | bool trans_b, 170 | dsc_tensor *DSC_RESTRICT out); 171 | 172 | extern void dsc_gpu_compare(dsc_device *dev, 173 | const dsc_tensor *xa, 174 | const dsc_tensor *xb, 175 | dsc_comparison_op comp, 176 | dsc_tensor *out); 177 | 178 | extern void dsc_gpu_masked_fill(dsc_device *dev, 179 | dsc_tensor *DSC_RESTRICT x, 180 | const dsc_tensor *DSC_RESTRICT mask, 181 | f64 value); 182 | 183 | extern void dsc_gpu_outer(dsc_device *dev, 184 | const dsc_tensor *DSC_RESTRICT xa, 185 | const dsc_tensor *DSC_RESTRICT xb, 186 | dsc_tensor *DSC_RESTRICT out); 187 | 188 | extern void dsc_gpu_where(dsc_device *dev, 189 | const dsc_tensor *DSC_RESTRICT condition, 190 | const dsc_tensor *DSC_RESTRICT input, 191 | const dsc_tensor *DSC_RESTRICT other, 192 | dsc_tensor *DSC_RESTRICT out); 193 | 194 | // ============================================================ 195 | // Unary Operations 196 | 197 | extern void dsc_gpu_cos(dsc_device *dev, 198 | const dsc_tensor *DSC_RESTRICT x, 199 | dsc_tensor *DSC_RESTRICT out); 200 | 201 | extern void dsc_gpu_sin(dsc_device *dev, 202 | const dsc_tensor *DSC_RESTRICT x, 203 | dsc_tensor *DSC_RESTRICT out); 204 | 205 | extern void dsc_gpu_tanh(dsc_device *dev, 206 | const dsc_tensor *DSC_RESTRICT x, 207 | dsc_tensor *DSC_RESTRICT out); 208 | 209 | extern void dsc_gpu_exp(dsc_device *dev, 210 | const dsc_tensor *DSC_RESTRICT x, 211 | dsc_tensor *DSC_RESTRICT out); 212 | 213 | extern void dsc_gpu_sqrt(dsc_device *dev, 214 | const dsc_tensor *DSC_RESTRICT x, 215 | dsc_tensor 
*DSC_RESTRICT out); 216 | 217 | // ============================================================ 218 | // Unary Operations Along Axis 219 | 220 | extern void dsc_gpu_sum(dsc_device *dev, 221 | const dsc_tensor *DSC_RESTRICT x, 222 | dsc_tensor *DSC_RESTRICT out, 223 | int axis_idx); 224 | 225 | extern void dsc_gpu_min(dsc_device *dev, 226 | const dsc_tensor *DSC_RESTRICT x, 227 | dsc_tensor *DSC_RESTRICT out, 228 | int axis_idx); 229 | 230 | extern void dsc_gpu_max(dsc_device *dev, 231 | const dsc_tensor *DSC_RESTRICT x, 232 | dsc_tensor *DSC_RESTRICT out, 233 | int axis_idx); 234 | 235 | #else 236 | 237 | #define DSC_GPU_PLATFORM NONE 238 | 239 | static DSC_INLINE int dsc_gpu_devices() { 240 | return 0; 241 | } 242 | 243 | static DSC_INLINE int dsc_gpu_dev_capability(const int) { 244 | return 0; 245 | } 246 | 247 | static DSC_INLINE void dsc_gpu_dev_name(const int, char *) {} 248 | 249 | static DSC_INLINE usize dsc_gpu_dev_mem(const int) { 250 | return 0; 251 | } 252 | 253 | static DSC_INLINE void dsc_gpu_sync() {} 254 | 255 | static DSC_INLINE bool dsc_gpu_has_bf16() { 256 | return false; 257 | } 258 | 259 | #endif // DSC_CUDA || DSC_HIP -------------------------------------------------------------------------------- /dsc/include/gpu/dsc_ops.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | #include "gpu/dsc_gpu.h" 10 | 11 | #define atomic_cas_f32(PTR, VAL) \ 12 | do { \ 13 | uint *addr = (uint *) (PTR); \ 14 | uint old = *addr, assumed; \ 15 | do { \ 16 | assumed = old; \ 17 | const f32 assumed_val = __int_as_float(assumed); \ 18 | const f32 new_val = VAL; \ 19 | old = atomicCAS(addr, assumed, __float_as_int(new_val)); \ 20 | } while (old != assumed); \ 21 | } while (0) 22 | 23 | #define atomic_cas_f64(PTR, VAL) \ 24 | do { \ 25 | unsigned long long *addr = (unsigned long long *) (PTR); \ 26 | unsigned long long old = *addr, assumed; \ 27 | do { \ 28 | assumed = old; \ 29 | const f64 assumed_val = __longlong_as_double(assumed); \ 30 | const f64 new_val = VAL; \ 31 | old = atomicCAS(addr, assumed, __double_as_longlong(new_val)); \ 32 | } while (old != assumed); \ 33 | } while (0) 34 | 35 | 36 | struct gpu_cast_op { 37 | template 38 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE Tout operator()(const Tin in) const { 39 | #if defined(DSC_BF16) 40 | return (Tout) in; 41 | #else 42 | if constexpr (dsc_is_type()) { 43 | // If BF16 is not supported use the same logic as the CPU 44 | union { 45 | f32 f; 46 | u32 i; 47 | } u; 48 | u.i = (u32) in << 16; 49 | return (Tout) u.f; 50 | } else { 51 | return (Tout) in; 52 | } 53 | #endif 54 | } 55 | }; 56 | 57 | struct gpu_add_op { 58 | template 59 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 60 | if constexpr (dsc_is_type()) { 61 | return xa || xb; 62 | } else { 63 | return xa + xb; 64 | } 65 | } 66 | }; 67 | 68 | struct gpu_atomic_add_op { 69 | template 70 | DSC_GPU_FUNC DSC_INLINE void operator()(T *x, const T val) const { 71 | if constexpr (dsc_is_type()) { 72 | atomicOr(x, val); 73 | } else { 74 | atomicAdd(x, val); 75 | } 76 | } 77 | }; 78 | 79 | struct gpu_sub_op { 80 | template 81 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 82 | if constexpr (dsc_is_type()) { 83 | return xa ^ xb; 84 | } else { 
85 | return xa - xb; 86 | } 87 | } 88 | }; 89 | 90 | struct gpu_mul_op { 91 | template 92 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 93 | if constexpr (dsc_is_type()) { 94 | return xa && xb; 95 | } else { 96 | return xa * xb; 97 | } 98 | } 99 | }; 100 | 101 | struct gpu_div_op { 102 | template 103 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 104 | return xa / xb; 105 | } 106 | }; 107 | 108 | struct gpu_pow_op { 109 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE i32 operator()(const i32 base, const i32 exp) const { 110 | i32 acc = 1; 111 | for (int i = 0; i < exp; ++i) acc *= base; 112 | return acc; 113 | } 114 | 115 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bf16 operator()(const bf16 base, const bf16 exp) const { 116 | return gpu_pow_op()((f32) base, (f32) exp); 117 | } 118 | 119 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 base, const f32 exp) const { 120 | return powf(base, exp); 121 | } 122 | 123 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 base, const f64 exp) const { 124 | return pow(base, exp); 125 | } 126 | }; 127 | 128 | struct gpu_cos_op { 129 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bf16 operator()(const bf16 x) const { 130 | return gpu_cos_op()((f32) x); 131 | } 132 | 133 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 134 | return cosf(x); 135 | } 136 | 137 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 138 | return cos(x); 139 | } 140 | }; 141 | 142 | struct gpu_sin_op { 143 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bf16 operator()(const bf16 x) const { 144 | return gpu_sin_op()((f32) x); 145 | } 146 | 147 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 148 | return sinf(x); 149 | } 150 | 151 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 152 | return sin(x); 153 | } 154 | }; 155 | 156 | struct gpu_tanh_op { 157 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bf16 operator()(const bf16 x) const { 158 | return gpu_tanh_op()((f32) x); 159 | } 160 | 161 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 162 | return tanhf(x); 163 | } 164 | 165 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 166 | return tanh(x); 167 | } 168 | }; 169 | 170 | struct gpu_sqrt_op { 171 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bf16 operator()(const bf16 x) const { 172 | return gpu_sqrt_op()((f32) x); 173 | } 174 | 175 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 176 | return sqrtf(x); 177 | } 178 | 179 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 180 | return sqrt(x); 181 | } 182 | }; 183 | 184 | struct gpu_exp_op { 185 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bf16 operator()(const bf16 x) const { 186 | return gpu_exp_op()((f32) x); 187 | } 188 | 189 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f32 operator()(const f32 x) const { 190 | return expf(x); 191 | } 192 | 193 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE f64 operator()(const f64 x) const { 194 | return exp(x); 195 | } 196 | }; 197 | 198 | struct gpu_max_op { 199 | template 200 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 201 | return DSC_MAX(xa, xb); 202 | } 203 | }; 204 | 205 | struct gpu_atomic_max_op { 206 | DSC_GPU_FUNC DSC_INLINE void operator()(f32 *x, const f32 val) const { 207 | atomic_cas_f32(x, 
DSC_MAX(val, assumed_val)); 208 | } 209 | 210 | DSC_GPU_FUNC DSC_INLINE void operator()(f64 *x, const f64 val) const { 211 | atomic_cas_f64(x, DSC_MAX(val, assumed_val)); 212 | } 213 | }; 214 | 215 | struct gpu_min_op { 216 | template 217 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE T operator()(const T xa, const T xb) const { 218 | return DSC_MIN(xa, xb); 219 | } 220 | }; 221 | 222 | struct gpu_atomic_min_op { 223 | DSC_GPU_FUNC DSC_INLINE void operator()(f32 *x, const f32 val) const { 224 | atomic_cas_f32(x, DSC_MIN(val, assumed_val)); 225 | } 226 | 227 | DSC_GPU_FUNC DSC_INLINE void operator()(f64 *x, const f64 val) const { 228 | atomic_cas_f64(x, DSC_MIN(val, assumed_val)); 229 | } 230 | }; 231 | 232 | struct gpu_eq_op { 233 | template 234 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 235 | return xa == xb; 236 | } 237 | }; 238 | 239 | struct gpu_ne_op { 240 | template 241 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 242 | return !gpu_eq_op()(xa, xb); 243 | } 244 | }; 245 | 246 | struct gpu_lt_op { 247 | template 248 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 249 | return xa < xb; 250 | } 251 | }; 252 | struct gpu_le_op { 253 | template 254 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 255 | return xa <= xb; 256 | } 257 | }; 258 | 259 | struct gpu_gt_op { 260 | template 261 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 262 | return xa > xb; 263 | } 264 | }; 265 | 266 | struct gpu_ge_op { 267 | template 268 | DSC_GPU_FUNC DSC_INLINE DSC_STRICTLY_PURE bool operator()(const T xa, const T xb) const { 269 | return xa >= xb; 270 | } 271 | }; 272 | 273 | 274 | template 275 | consteval bool is_comparison_op() { 276 | return dsc_is_type() || 277 | dsc_is_type() || 278 | dsc_is_type() || 279 | dsc_is_type() || 280 | dsc_is_type() || 281 | dsc_is_type(); 282 | } 283 | 284 | template 285 | consteval bool is_bool_arith_op() { 286 | return dsc_is_type() || 287 | dsc_is_type() || 288 | dsc_is_type(); 289 | } -------------------------------------------------------------------------------- /examples/models/gpt2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 
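# Example invocation (a sketch: the path and prompt are illustrative, the flags mirror the argparse
# definitions at the bottom of this file):
#
#   python3 examples/models/gpt2.py "The meaning of life is" -n 50 --device gpu --dtype bf16
#
# -n sets the number of tokens to generate (default 100), --no-cache disables the KV cache and
# --device / --dtype choose where and in which precision inference runs.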
6 | 7 | 8 | import dsc 9 | import dsc.nn as nn 10 | import dsc.nn.functional as F 11 | from dataclasses import dataclass 12 | from transformers import GPT2Tokenizer 13 | from time import perf_counter 14 | import argparse 15 | 16 | 17 | @dataclass 18 | class GPT2Hparams: 19 | # default hyperparameters for GPT-2 small 20 | n_layers: int = 12 21 | n_heads: int = 12 22 | emb_size: int = 768 23 | block_size: int = 1024 24 | vocab_size: int = 50257 25 | 26 | 27 | class MultiHeadAttention(nn.Module): 28 | def __init__(self, hparams: GPT2Hparams, use_cache: bool = True, dtype: dsc.Dtype = dsc.f32): 29 | super().__init__() 30 | self.block_size = hparams.block_size 31 | self.emb_size = hparams.emb_size 32 | self.n_heads = hparams.n_heads 33 | # Stacked attention, contains the projections of both Q, K and V 34 | self.c_attn = nn.Linear(self.emb_size, 3 * self.emb_size, dtype=dtype) 35 | self.c_proj = nn.Linear(self.emb_size, self.emb_size, dtype=dtype) 36 | # Causal mask 37 | self.tril = dsc.tril(dsc.ones((self.block_size, self.block_size))) 38 | 39 | # KV cache 40 | self.use_cache = use_cache 41 | self.cache_k = None 42 | self.cache_v = None 43 | 44 | @dsc.trace('MultiHeadAttention') 45 | def forward(self, x: dsc.Tensor) -> dsc.Tensor: 46 | B, T, C = x.shape # (block size, context size, emb size) 47 | attn = self.c_attn(x) 48 | 49 | q, k, v = attn.split(self.emb_size, axis=2) # (B, T, C) 50 | q = q.reshape(B, T, self.n_heads, self.emb_size // self.n_heads).transpose((0, 2, 1, 3)) # (B, nh, T, hs) given EMB_SIZE is a multiple of N_HEADS 51 | k = k.reshape(B, T, self.n_heads, self.emb_size // self.n_heads).transpose((0, 2, 1, 3)) # (B, nh, T, hs) 52 | v = v.reshape(B, T, self.n_heads, self.emb_size // self.n_heads).transpose((0, 2, 1, 3)) # (B, nh, T, hs) 53 | 54 | if self.use_cache: 55 | if self.cache_k is not None: 56 | k = dsc.concat([self.cache_k, k], axis=2) 57 | 58 | if self.cache_v is not None: 59 | v = dsc.concat([self.cache_v, v], axis=2) 60 | 61 | self.cache_k = k 62 | self.cache_v = v 63 | 64 | seq_len = k.size(2) 65 | k_t = k.transpose((0, 1, 3, 2)) 66 | 67 | # Self Attention (B, nh, T, hs) @ (B, nh, hs, T) = (B, nh, T, T) 68 | q_k = q @ k_t 69 | attention = q_k * q.size(-1) ** -0.5 70 | 71 | if not self.use_cache or seq_len == T: 72 | # Masking is needed when we are not using the cache or when using the cache and we are processing the prompt 73 | attention = attention.masked_fill(self.tril[:T, :T] == 0, float('-inf')) 74 | 75 | attention = F.softmax(attention, axis=-1) 76 | out = attention @ v # (B, nh, T, T) @ (B, nh, T, hs) = (B, nh, T, hs) 77 | out = out.transpose((0, 2, 1, 3)).reshape(B, T, C) 78 | 79 | return self.c_proj(out) 80 | 81 | 82 | class TransformerBlock(nn.Module): 83 | def __init__(self, hparams: GPT2Hparams, use_cache: bool = True, dtype: dsc.Dtype = dsc.f32): 84 | super().__init__() 85 | self.ln_1 = nn.LayerNorm(hparams.emb_size, dtype=dtype) 86 | self.attn = MultiHeadAttention(hparams, use_cache, dtype=dtype) 87 | self.ln_2 = nn.LayerNorm(hparams.emb_size, dtype=dtype) 88 | self.mlp = nn.ModuleDict(dict( 89 | c_fc = nn.Linear(hparams.emb_size, hparams.emb_size * 4, dtype=dtype), 90 | c_proj = nn.Linear(hparams.emb_size * 4, hparams.emb_size, dtype=dtype), 91 | )) 92 | 93 | @dsc.trace('TransformerBlock') 94 | def forward(self, x: dsc.Tensor) -> dsc.Tensor: 95 | m = self.mlp 96 | x = x + self.attn(self.ln_1(x)) 97 | return x + m.c_proj(F.gelu(m.c_fc(self.ln_2(x)))) 98 | 99 | 100 | class GPT2(nn.Module): 101 | def __init__(self, hparams: GPT2Hparams, use_cache: bool = 
True, dtype: dsc.Dtype = dsc.f32): 102 | super().__init__() 103 | self.hparams = hparams 104 | self.wpe = nn.Embedding(hparams.block_size, hparams.emb_size, dtype=dtype) 105 | self.wte = nn.Embedding(hparams.vocab_size, hparams.emb_size, dtype=dtype) 106 | self.h = nn.ModuleList([TransformerBlock(hparams, use_cache, dtype=dtype) for _ in range(hparams.n_layers)]) 107 | self.ln_f = nn.LayerNorm(hparams.emb_size, dtype=dtype) 108 | self.lm_head = nn.Linear(hparams.emb_size, hparams.vocab_size, bias=False, dtype=dtype) 109 | self.use_cache = use_cache 110 | self.kv_pos = 0 111 | 112 | n_params = sum([p.ne for p in self.parameters()]) 113 | print(f'Model has {round(n_params / 1e6)}M parameters') 114 | 115 | @staticmethod 116 | def from_pretrained(hparams: GPT2Hparams = GPT2Hparams(), use_cache: bool = True, dtype: dsc.Dtype = dsc.f32) -> 'GPT2': 117 | # GPT2 uses Conv1D instead of a Linear layer which means we have to transpose the weights 118 | state_dict = nn.safe_load('https://huggingface.co/openai-community/gpt2/resolve/main/model.safetensors', 119 | use_dtype=dtype) 120 | for i in range(hparams.n_layers): 121 | # The causal mask doesn't need to be loaded, so I'll just remove it 122 | del state_dict[f'h.{i}.attn.bias'] 123 | 124 | to_transpose = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight'] 125 | my_model = GPT2(hparams, use_cache, dtype=dtype) 126 | my_model.from_state(state_dict, 127 | on_hook=[(to_transpose, lambda x: x.transpose())], 128 | tied={'lm_head.weight': 'wte.weight'}) # lm_head and wte weights are tied 129 | del state_dict 130 | dsc.print_mem_usage() 131 | return my_model 132 | 133 | @dsc.trace('GPT2') 134 | def forward(self, idx: dsc.Tensor) -> dsc.Tensor: 135 | B, T = idx.shape 136 | tok_emb = self.wte(idx) 137 | if self.use_cache: 138 | pos_emb = self.wpe(dsc.arange(T, device='cpu') + self.kv_pos) 139 | else: 140 | pos_emb = self.wpe(dsc.arange(T, device='cpu')) 141 | 142 | x = tok_emb + pos_emb 143 | for block in self.h: 144 | x = block(x) 145 | 146 | x = self.ln_f(x) 147 | logits = self.lm_head(x) 148 | self.kv_pos += T 149 | return logits 150 | 151 | def generate(self, idx: dsc.Tensor, tokenizer, max_new_tokens: int, temp: float = 1) -> dsc.Tensor: 152 | assert max_new_tokens < self.hparams.block_size 153 | # Include the input in the response 154 | generated = idx 155 | # The first time process the entire prompt then only the last token 156 | idx_next = idx 157 | sampling_start = None; sampling_stop = None 158 | generation_start = None; generation_stop = None 159 | for counter in range(max_new_tokens): 160 | if counter == 0: 161 | sampling_start = perf_counter() 162 | elif counter == 1: 163 | generation_start = perf_counter() 164 | 165 | if self.use_cache: 166 | logits = self(idx_next) 167 | else: 168 | logits = self(generated) 169 | # Apply temperature to the last row of each bach 170 | logits = logits[:, -1, :] * (1 / temp) 171 | 172 | kth_value = dsc.kth(logits.reshape(-1), 10) 173 | logits = logits.masked_fill(logits < kth_value, -float('Inf')) 174 | probs = F.softmax(logits, axis=-1) 175 | 176 | idx_next = dsc.multinomial(probs, num_samples=1) 177 | 178 | idx_next = idx_next.to('cpu') 179 | print(tokenizer.decode(idx_next[0]), end='', flush=True) 180 | generated = dsc.concat([generated, idx_next], axis=1) 181 | if counter == 0: 182 | sampling_stop = perf_counter() 183 | 184 | generation_stop = perf_counter() 185 | print('\n') 186 | 187 | # Report metrics 188 | prompt_processing_time_ms = (sampling_stop - sampling_start) * 1e3 
189 | generation_processing_time_ms = (generation_stop - generation_start) * 1e3 190 | total_processing_time_ms = (generation_stop - sampling_start) * 1e3 191 | print(f'prompt processing time\t= {round(prompt_processing_time_ms, 1)}ms') 192 | print(f'generation time\t\t= {round(generation_processing_time_ms, 1)} ms | {round(generation_processing_time_ms / max_new_tokens, 2)} ms/tok') 193 | print(f'total time\t\t= {round(total_processing_time_ms, 1)} ms | {round(max_new_tokens / (total_processing_time_ms / 1e3), 2)} tok/s') 194 | return generated 195 | 196 | 197 | if __name__ == '__main__': 198 | cli = argparse.ArgumentParser(description='GPT2 inference CLI') 199 | cli.add_argument('prompt', type=str, help='Model prompt') 200 | cli.add_argument('-n', type=int, default=100, help='Tokens to generate (default=100)') 201 | cli.add_argument('--no-cache', action='store_true', help='Disable KV cache') 202 | cli.add_argument('--device', choices=['cpu', 'gpu'], default='cpu', help='Device on which to run the model') 203 | cli.add_argument('--dtype', choices=['f32', 'bf16'], default='f32', help='Dtype to use for inference') 204 | 205 | args = cli.parse_args() 206 | 207 | dsc.set_default_device(args.device) 208 | use_kv_cache = not args.no_cache 209 | prompt = args.prompt 210 | max_tokens = args.n 211 | 212 | dtype = dsc.f32 213 | if args.dtype == 'bf16': 214 | dtype = dsc.bf16 215 | 216 | print(f'Running model on {args.device} using {dtype}') 217 | 218 | model = GPT2.from_pretrained(use_cache=use_kv_cache, dtype=dtype) 219 | 220 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 221 | print(prompt, end='', flush=True) 222 | 223 | idx = tokenizer.encode(prompt) 224 | model.generate(dsc.tensor(idx, dtype=dsc.i32, device='cpu').reshape(1, -1), tokenizer=tokenizer, max_new_tokens=max_tokens) 225 | -------------------------------------------------------------------------------- /python/tests/test_ops_gpu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 
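# These tests compare DSC's GPU backend against PyTorch tensors allocated with device='cuda' and are
# skipped automatically when dsc.gpu.is_available() is False (see session_fixture below). On AMD
# hardware this is assumed to work with the ROCm build of PyTorch, which exposes the same 'cuda'
# device string; see the README note on installing PyTorch for ROCm.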
6 | 7 | 8 | import torch 9 | import dsc 10 | from random import randint, random 11 | from typing import List 12 | import pytest 13 | 14 | 15 | DTYPES = [torch.bool, torch.int32, torch.float32, torch.float64] 16 | TORCH_TO_DSC_DTYPES = { 17 | torch.bool: dsc.bool_, 18 | torch.int32: dsc.i32, 19 | torch.float32: dsc.f32, 20 | torch.float64: dsc.f64, 21 | } 22 | 23 | if dsc.gpu.has_bf16(): 24 | DTYPES = [torch.bool, torch.int32, torch.bfloat16, torch.float32, torch.float64] 25 | TORCH_TO_DSC_DTYPES = { 26 | torch.bool: dsc.bool_, 27 | torch.int32: dsc.i32, 28 | torch.bfloat16: dsc.bf16, 29 | torch.float32: dsc.f32, 30 | torch.float64: dsc.f64, 31 | } 32 | 33 | DSC_TO_TORCH_DTYPES = {v: k for k, v in TORCH_TO_DSC_DTYPES.items()} 34 | 35 | 36 | def is_float(dtype: torch.dtype) -> bool: 37 | return dtype == torch.bfloat16 or dtype == torch.float32 or dtype == torch.float64 38 | 39 | 40 | def is_bool(dtype: torch.dtype) -> bool: 41 | return dtype == torch.bool 42 | 43 | 44 | def is_integer(dtype: torch.dtype) -> bool: 45 | return dtype == torch.int32 46 | 47 | 48 | @pytest.fixture(scope='session', autouse=True) 49 | def session_fixture(): 50 | if not dsc.gpu.is_available(): 51 | pytest.skip('GPU not available - skipping all GPU tests', allow_module_level=True) 52 | dsc.init(int(2**30)) 53 | # This is invoked once before starting the test session 54 | dsc.set_default_device('gpu') 55 | yield 56 | 57 | 58 | @pytest.fixture(autouse=True) 59 | def teardown_fixture(): 60 | # This is invoked automatically after each test 61 | yield 62 | 63 | 64 | def random_nd(shape: List[int], dtype: torch.dtype = torch.float64) -> (torch.Tensor, dsc.Tensor): 65 | torch_tensor = None 66 | if dtype == torch.bool: 67 | torch_tensor = torch.randint(0, 2, size=tuple(shape), device='cuda').to(dtype) 68 | elif dtype == torch.int32: 69 | # Return a positive integer tensor if the dtype is int32 so that we don't have issues 70 | # with power 71 | torch_tensor = torch.randint(0, 10, size=tuple(shape), device='cuda').to(dtype) 72 | else: 73 | torch_tensor = torch.randn(*tuple(shape), device='cuda').to(dtype) 74 | 75 | return (torch_tensor, 76 | dsc.frombuffer(torch_tensor.shape, TORCH_TO_DSC_DTYPES[torch_tensor.dtype], 77 | torch_tensor.data_ptr(), device='gpu', data_device='gpu')) 78 | 79 | 80 | def all_close(actual: dsc.Tensor, target: torch.Tensor, atol: float = 1e-4, rtol: float = 1e-4) -> bool: 81 | torch.cuda.synchronize() 82 | dsc.gpu.synchronize() 83 | actual_dtype = DSC_TO_TORCH_DTYPES[actual.dtype] 84 | actual_torch = torch.as_tensor(actual, device='cuda').view(actual_dtype) 85 | if is_float(actual_dtype) and not is_float(target.dtype): 86 | target = target.to(dtype=actual_dtype) 87 | return torch.allclose(actual_torch, target, atol=atol, rtol=rtol, equal_nan=True) 88 | 89 | 90 | class TestOps: 91 | def test_binary(self): 92 | ops = { 93 | 'add': (torch.add, dsc.add), 94 | 'sub': (torch.subtract, dsc.sub), 95 | 'mul': (torch.multiply, dsc.mul), 96 | 'div': (torch.true_divide, dsc.true_div), 97 | 'pow': (torch.pow, dsc.power), 98 | 'equal': (torch.eq, dsc.equal), 99 | 'not_equal': (torch.ne, dsc.not_equal), 100 | 'less': (torch.lt, dsc.less), 101 | 'less_equal': (torch.le, dsc.less_equal), 102 | 'greater': (torch.gt, dsc.greater), 103 | 'greater_equal': (torch.ge, dsc.greater_equal), 104 | } 105 | for op_name in ops.keys(): 106 | torch_op, dsc_op = ops[op_name] 107 | for dtype in DTYPES: 108 | if op_name == 'sub': 109 | torch_op = torch.bitwise_xor if is_bool(dtype) else torch.subtract 110 | if op_name == 'pow' and 
is_bool(dtype): 111 | # Pow on CUDA is not implemented in torch for bool 112 | continue 113 | 114 | atol = 1e-4; rtol = 1e-4 115 | if dtype == torch.bfloat16: 116 | atol = 1e-1; rtol = 1e-2 117 | print(f'Testing operator {op_name} with {dtype}') 118 | shape = [randint(2, 10) for _ in range(4)] 119 | x, x_dsc = random_nd(shape, dtype=dtype) 120 | 121 | # Same shape 122 | y, y_dsc = random_nd(shape, dtype=dtype) 123 | 124 | res_torch = torch_op(x, y) 125 | res_dsc = dsc_op(x_dsc, y_dsc) 126 | r_res_torch = torch_op(y, x) 127 | r_res_dsc = dsc_op(y_dsc, x_dsc) 128 | assert all_close(res_dsc, res_torch, atol, rtol), f'Error testing ({x.shape} {op_name} {y.shape}) dtype={dtype}' 129 | assert all_close(r_res_dsc, r_res_torch, atol, rtol), f'Error testing ({y.shape} {op_name} {x.shape}) dtype={dtype}' 130 | 131 | # Broadcasting 132 | collapse_idx = randint(0, 3) 133 | shape[collapse_idx] = 1 134 | y_b, y_dsc_b = random_nd(shape, dtype=dtype) 135 | 136 | res_torch_b = torch_op(x, y_b) 137 | res_dsc_b = dsc_op(x_dsc, y_dsc_b) 138 | r_res_torch_b = torch_op(y_b, x) 139 | r_res_dsc_b = dsc_op(y_dsc_b, x_dsc) 140 | assert all_close(res_dsc_b, res_torch_b, atol, rtol), f'Error testing ({x.shape} {op_name} {y_b.shape}) dtype={dtype}' 141 | assert all_close(r_res_dsc_b, r_res_torch_b, atol, rtol), f'Error testing ({y_b.shape} {op_name} {x.shape}) dtype={dtype}' 142 | 143 | # Scalar 144 | if is_float(dtype): 145 | y_s = random() 146 | 147 | elif is_bool(dtype): 148 | y_s = bool(randint(0, 1)) 149 | else: 150 | y_s = randint(0, 10) 151 | 152 | res_torch_s = torch_op(x, y_s) 153 | res_dsc_s = dsc_op(x_dsc, y_s) 154 | if 'equal' in op_name or op_name == 'less' or op_name == 'greater': 155 | # For comparison ops torch requires the first argument to be a tensor 156 | continue 157 | r_res_torch_s = torch_op(y_s, x) 158 | r_res_dsc_s = dsc_op(y_s, x_dsc) 159 | assert all_close(res_dsc_s, res_torch_s, atol, rtol), f'Error testing ({x.shape} {op_name} {y_s}) dtype={dtype}' 160 | assert all_close(r_res_dsc_s, r_res_torch_s, atol, rtol), f'Error testing ({y_s} {op_name} {x.shape}) dtype={dtype}' 161 | 162 | def test_unary(self): 163 | ops = { 164 | 'sin': (torch.sin, dsc.sin), 165 | 'cos': (torch.cos, dsc.cos), 166 | 'tanh': (torch.tanh, dsc.tanh), 167 | 'exp': (torch.exp, dsc.exp), 168 | 'sqrt': (torch.sqrt, dsc.sqrt), 169 | } 170 | for op_name in ops.keys(): 171 | torch_op, dsc_op = ops[op_name] 172 | for dtype in DTYPES: 173 | print(f'Testing {op_name} with {dtype}') 174 | x, x_dsc = random_nd([randint(1, 10) for _ in range(4)], dtype=dtype) 175 | 176 | res_torch = torch_op(x) 177 | res_dsc = dsc_op(x_dsc) 178 | 179 | assert all_close(res_dsc, res_torch), f'Error testing {op_name} shape={x.shape} dtype={dtype}' 180 | 181 | def test_unary_axis(self): 182 | ops = { 183 | 'sum': (torch.sum, dsc.sum), 184 | 'mean': (torch.mean, dsc.mean), 185 | 'var': (torch.var, dsc.var), 186 | 'max': (torch.amax, dsc.max), 187 | 'min': (torch.amin, dsc.min), 188 | } 189 | for op_name in ops.keys(): 190 | torch_op, dsc_op = ops[op_name] 191 | for dtype in DTYPES: 192 | for axis in range(-4, 4): 193 | rtol = 1e-4; atol = 1e-4 194 | params_torch = { 195 | 'dim': axis, 196 | 'keepdim': True 197 | } 198 | if op_name == 'mean' or op_name == 'var': 199 | if not is_float(dtype) or dtype == torch.bfloat16: 200 | continue 201 | atol = 1e-3; rtol = 1e-2 202 | if op_name == 'var': 203 | params_torch['correction'] = 0 204 | 205 | print(f'Testing {op_name} with {dtype} along axis {axis}') 206 | x, x_dsc = random_nd( 207 | [randint(1, 10) for _ 
in range(4)], dtype=dtype 208 | ) 209 | 210 | res_torch = torch_op(x, **params_torch) 211 | res_dsc = dsc_op(x_dsc, axis=axis, keepdims=True) 212 | assert all_close(res_dsc, res_torch, atol, rtol), f'Error testing {op_name} shape={x.shape} dtype={x.dtype} keepdims=True' 213 | 214 | params_torch['keepdim'] = False 215 | res_torch_2 = torch_op(x, **params_torch) 216 | res_dsc_2 = dsc_op(x_dsc, axis=axis, keepdims=False) 217 | assert all_close(res_dsc_2, res_torch_2, atol, rtol), f'Error testing {op_name} shape={x.shape} dtype={x.dtype} keepdims=False' 218 | 219 | def test_matmul(self): 220 | def _mnk() -> tuple[int, int, int]: 221 | return randint(50, 100), randint(50, 100), randint(50, 100) 222 | 223 | def _test_matmul(shape_a: List[int], shape_b: List[int], dt: torch.dtype): 224 | print(f'Testing {shape_a} @ {shape_b} on with {dt}') 225 | xa, xa_dsc = random_nd(shape_a, dtype=dt) 226 | xb, xb_dsc = random_nd(shape_b, dtype=dt) 227 | res = xa @ xb 228 | res_dsc = xa_dsc @ xb_dsc 229 | # TODO: it looks like BF16 has a lower precision, should check what torch actually does. For now fix the tolerance at 1% 230 | assert all_close(res_dsc, res, atol=1e-1, rtol=1e-2), f'Error testing {shape_a} @ {shape_b} with {dt}' 231 | 232 | for dtype in DTYPES: 233 | if not is_float(dtype): 234 | continue 235 | # 2D GEMM 236 | m, n, k = _mnk() 237 | _test_matmul([m, k], [k, n], dtype) 238 | # GEVM 239 | _test_matmul([1, k], [k, n], dtype) 240 | 241 | # Batched case 242 | for _ in range(5): 243 | batch_1, batch_2 = randint(2, 10), randint(2, 10) 244 | m, n, k = _mnk() 245 | _test_matmul([batch_1, batch_2, m, k], [batch_1, batch_2, k, n], dtype) 246 | 247 | # Batched case with broadcasting 248 | for batch_1 in range(1, 6): 249 | for batch_2 in range(1, 6): 250 | m, n, k = _mnk() 251 | _test_matmul([batch_1 if batch_1%2 == 0 else 1, 252 | batch_2 if batch_2%2 == 0 else 1, m, k], 253 | [batch_1 if batch_1%2 == 1 else 1, 254 | batch_2 if batch_2%2 == 1 else 1, k, n], 255 | dtype) 256 | def test_outer(self): 257 | for dtype in DTYPES: 258 | for _ in range(10): 259 | xa, xa_dsc = random_nd([randint(2, 50)], dtype) 260 | xb, xb_dsc = random_nd([randint(2, 50)], dtype) 261 | 262 | out = torch.outer(xa, xb) 263 | out_dsc = dsc.outer(xa_dsc, xb_dsc) 264 | assert all_close(out_dsc, out) 265 | -------------------------------------------------------------------------------- /examples/models/qwen2_5.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024-2025, Christian Gilli 2 | # All rights reserved. 3 | # 4 | # This code is licensed under the terms of the 3-clause BSD license 5 | # (https://opensource.org/license/bsd-3-clause). 
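# Qwen 2.5 uses grouped-query attention: with the default Config below (num_attention_heads=14,
# num_key_value_heads=2) each key/value head is shared by 7 query heads, so _repeat_kv expands the
# K/V tensors by n_rep = num_heads // num_kv_heads before the attention matmuls in Attention.forward.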
6 | 7 | 8 | import dsc 9 | import dsc.nn as nn 10 | import dsc.nn.functional as F 11 | from dataclasses import dataclass 12 | from time import perf_counter 13 | import argparse 14 | from transformers import AutoTokenizer 15 | from typing import Tuple, Optional, List 16 | import math 17 | import numpy as np 18 | 19 | 20 | CacheEntry = Tuple[dsc.Tensor, dsc.Tensor] 21 | Cache = List[CacheEntry] 22 | 23 | 24 | # Default config for Qwen 2.5 0.5B 25 | @dataclass 26 | class Config: 27 | vocab_size: int = 151936 28 | hidden_size: int = 896 29 | intermediate_size: int = 4864 30 | num_hidden_layers: int = 24 31 | num_attention_heads: int = 14 32 | num_key_value_heads: int = 2 33 | max_position_embeddings: int = 1024 34 | rms_norm_eps: float = 1e-6 35 | tie_word_embeddings: bool = True 36 | rope_theta: float = 1000000.0 37 | sliding_window: int = 4096 38 | max_window_layers: int = 28 39 | bos_token_id: int = 151643 40 | eos_token_id: int = 151645 41 | 42 | 43 | class MLP(nn.Module): 44 | def __init__(self, config: Config, dtype: dsc.Dtype = dsc.f32): 45 | super().__init__() 46 | self.hidden_size = config.hidden_size 47 | self.intermediate_size = config.intermediate_size 48 | self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False, dtype=dtype) 49 | self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False, dtype=dtype) 50 | self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False, dtype=dtype) 51 | 52 | @dsc.trace('MLP') 53 | def forward(self, x: dsc.Tensor) -> dsc.Tensor: 54 | return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x)) 55 | 56 | 57 | def _pre_compute_freqs(dim: int, theta: float, max_seq_len: int, dtype: dsc.Dtype = dsc.f32) -> Tuple[dsc.Tensor, dsc.Tensor]: 58 | freqs = 1.0 / (theta ** ((dsc.arange(start=0, stop=dim, step=2)[: (dim // 2)]).cast(dtype) / dim)) 59 | t = dsc.arange(stop=max_seq_len, dtype=dtype) 60 | freqs = dsc.outer(t, freqs) 61 | cos_cache_half = dsc.cos(freqs) 62 | sin_cache_half = dsc.sin(freqs) 63 | 64 | cos_cache = dsc.concat([cos_cache_half, cos_cache_half], axis=-1) 65 | sin_cache = dsc.concat([sin_cache_half, sin_cache_half], axis=-1) 66 | return cos_cache, sin_cache 67 | 68 | 69 | def _rotate_half(x: dsc.Tensor) -> dsc.Tensor: 70 | lim = x.size(-1) // 2 71 | x1 = x[:, :, :, :lim] 72 | x2 = x[:, :, :, lim:] 73 | return dsc.concat([-x2, x1], axis=-1) 74 | 75 | 76 | @dsc.trace('RoPE') 77 | def _apply_rope(q: dsc.Tensor, k: dsc.Tensor, 78 | freq_cos: dsc.Tensor, 79 | freq_sin: dsc.Tensor, 80 | position_ids: dsc.Tensor) -> Tuple[dsc.Tensor, dsc.Tensor]: 81 | cos = freq_cos[position_ids] 82 | sin = freq_sin[position_ids] 83 | 84 | batch_size, seq_len, head_size = cos.shape 85 | 86 | cos = cos.reshape(batch_size, 1, seq_len, head_size) 87 | sin = sin.reshape(batch_size, 1, seq_len, head_size) 88 | 89 | q_embed = (q * cos) + (_rotate_half(q) * sin) 90 | k_embed = (k * cos) + (_rotate_half(k) * sin) 91 | return q_embed, k_embed 92 | 93 | 94 | def _repeat_kv(x: dsc.Tensor, n_rep: int) -> dsc.Tensor: 95 | if n_rep == 1: 96 | return x 97 | return dsc.repeat(x, n_rep, axis=1) 98 | 99 | 100 | class Attention(nn.Module): 101 | def __init__(self, config: Config, dtype: dsc.Dtype = dsc.f32): 102 | super().__init__() 103 | self.head_size = config.hidden_size // config.num_attention_heads 104 | self.num_heads = config.num_attention_heads 105 | self.num_kv_heads = config.num_key_value_heads 106 | self.n_rep = self.num_heads // self.num_kv_heads 107 | self.sliding_window = config.sliding_window 108 | 
self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_size, dtype=dtype) 109 | self.k_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.head_size, dtype=dtype) 110 | self.v_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.head_size, dtype=dtype) 111 | self.o_proj = nn.Linear(self.num_heads * self.head_size, config.hidden_size, bias=False, dtype=dtype) 112 | 113 | @dsc.trace('Attention') 114 | def forward( 115 | self, x: dsc.Tensor, 116 | freq_cos_cache: dsc.Tensor, 117 | freq_sin_cache: dsc.Tensor, 118 | position_ids: dsc.Tensor, 119 | past_key_value: Optional[CacheEntry] = None 120 | ) -> Tuple[dsc.Tensor, CacheEntry]: 121 | 122 | block_size, seq_len, _ = x.shape 123 | q, k_cur, v_cur = self.q_proj(x), self.k_proj(x), self.v_proj(x) 124 | 125 | q = q.reshape(block_size, seq_len, self.num_heads, self.head_size).transpose((0, 2, 1, 3)) 126 | k_cur = k_cur.reshape(block_size, seq_len, self.num_kv_heads, self.head_size).transpose((0, 2, 1, 3)) 127 | v_cur = v_cur.reshape(block_size, seq_len, self.num_kv_heads, self.head_size).transpose((0, 2, 3, 1)) 128 | 129 | q, k_cur = _apply_rope(q, k_cur, freq_cos_cache, freq_sin_cache, position_ids) 130 | 131 | if past_key_value is not None: 132 | past_k, past_v = past_key_value 133 | k = dsc.concat([past_k, k_cur], axis=2) 134 | v = dsc.concat([past_v, v_cur], axis=3) 135 | else: 136 | k = k_cur 137 | v = v_cur 138 | 139 | present_key_value = (k, v) 140 | 141 | k = _repeat_kv(k, self.n_rep) 142 | v = _repeat_kv(v, self.n_rep) 143 | 144 | scores = dsc.matmul(q, k, trans_b=True) * (1.0 / math.sqrt(self.head_size)) 145 | 146 | q_len = q.size(2) 147 | k_len = k.size(2) 148 | 149 | # SWA 150 | k_pos_indices = dsc.arange(k_len).reshape(1, -1) 151 | q_pos_indices = dsc.arange(start=(k_len - q_len), stop=k_len).reshape(-1, 1) 152 | causal_mask = k_pos_indices <= q_pos_indices # shape (q_len, k_len) 153 | window_mask = (q_pos_indices - k_pos_indices) < self.sliding_window 154 | 155 | should_attend = causal_mask * window_mask # shape (q_len, k_len) 156 | 157 | additive_mask = dsc.where( 158 | should_attend, 159 | 0.0, 160 | float('-inf') 161 | ).reshape(1, 1, q_len, k_len).cast(scores.dtype) 162 | masked_scores = scores + additive_mask 163 | 164 | attn_weights = F.softmax(masked_scores, axis=-1) 165 | out = dsc.matmul(attn_weights, v, trans_b=True).transpose((0, 2, 1, 3)).reshape(block_size, seq_len, -1) 166 | 167 | return self.o_proj(out), present_key_value 168 | 169 | 170 | class DecoderLayer(nn.Module): 171 | def __init__(self, config: Config, dtype: dsc.Dtype = dsc.f32): 172 | super().__init__() 173 | self.self_attn = Attention(config, dtype=dtype) 174 | self.mlp = MLP(config, dtype=dtype) 175 | self.input_layernorm = nn.RMSNorm(config.hidden_size, epsilon=config.rms_norm_eps, dtype=dtype) 176 | self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, epsilon=config.rms_norm_eps, dtype=dtype) 177 | 178 | @dsc.trace('DecoderLayer') 179 | def forward( 180 | self, 181 | x: dsc.Tensor, 182 | freq_cos: dsc.Tensor, 183 | freq_sin: dsc.Tensor, 184 | position_ids: dsc.Tensor, 185 | past_key_value: Optional[CacheEntry] = None 186 | ) -> Tuple[dsc.Tensor, CacheEntry]: 187 | 188 | ln_out = self.input_layernorm(x) 189 | attn_out, present_kv = self.self_attn(ln_out, freq_cos, freq_sin, position_ids, past_key_value) 190 | h = x + attn_out 191 | return h + self.mlp(self.post_attention_layernorm(h)), present_kv 192 | 193 | 194 | class Qwen25Model(nn.Module): 195 | def __init__(self, config: Config, dtype: dsc.Dtype = dsc.f32): 
196 | super().__init__() 197 | self.config = config 198 | self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, dtype=dtype) 199 | self.layers = nn.ModuleList([DecoderLayer(config, dtype=dtype) for _ in range(config.num_hidden_layers)]) 200 | self.norm = nn.RMSNorm(config.hidden_size, config.rms_norm_eps, dtype=dtype) 201 | self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False, dtype=dtype) 202 | cos_cache, sin_cache = _pre_compute_freqs(config.hidden_size // config.num_attention_heads, config.rope_theta, config.max_position_embeddings, dtype=dtype) 203 | self.cos_cache = cos_cache 204 | self.sin_cache = sin_cache 205 | 206 | 207 | @staticmethod 208 | def from_pretrained(config: Config = Config(), dtype: dsc.Dtype = dsc.f32) -> 'Qwen25Model': 209 | state_dict = nn.safe_load('https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct/resolve/main/model.safetensors', 210 | trim_prefix='model.', 211 | use_dtype=dtype) 212 | model = Qwen25Model(config, dtype) 213 | model.from_state(state_dict, 214 | tied={'lm_head.weight': 'embed_tokens.weight'}) 215 | del state_dict 216 | dsc.print_mem_usage() 217 | return model 218 | 219 | @dsc.trace('Qwen2_5') 220 | def forward(self, 221 | x: dsc.Tensor, 222 | position_ids: dsc.Tensor, 223 | past_key_values: Optional[Cache] = None, 224 | use_cache: bool = True 225 | ) -> Tuple[dsc.Tensor, Optional[Cache]]: 226 | h = self.embed_tokens(x.to('cpu')) 227 | 228 | next_kv_caches = [] if use_cache else None 229 | for i, layer in enumerate(self.layers): 230 | layer_cache = past_key_values[i] if past_key_values is not None else None 231 | h, present_kv = layer( 232 | h, 233 | self.cos_cache, 234 | self.sin_cache, 235 | position_ids=position_ids, 236 | past_key_value=layer_cache 237 | ) 238 | if use_cache: 239 | next_kv_caches.append(present_kv) 240 | 241 | h = self.norm(h) 242 | return self.lm_head(h), next_kv_caches 243 | 244 | def generate(self, idx: dsc.Tensor, tokenizer, max_new_tokens: int, top_k: int = 10): 245 | prompt_processing_start = perf_counter() 246 | prompt_tokens = idx.reshape(1, -1) 247 | prompt_len = prompt_tokens.size(1) 248 | 249 | prompt_position_ids = dsc.arange(stop=prompt_len, device='cpu').reshape(1, -1) 250 | # Run forward without caching 251 | logits, past_key_values = self(prompt_tokens, position_ids=prompt_position_ids, past_key_values=None) 252 | next_token_logits = logits[:, -1, :] 253 | generated_tokens = [] 254 | current_len = prompt_len 255 | prompt_processing_ms = (perf_counter() - prompt_processing_start) * 1e3 256 | 257 | # Loop 258 | generation_start = perf_counter() 259 | for _ in range(max_new_tokens): 260 | k_th_value = dsc.kth(next_token_logits.reshape(-1), top_k) 261 | next_token_logits = next_token_logits.masked_fill(next_token_logits < k_th_value, float('-inf')) 262 | probs = F.softmax(next_token_logits, axis=-1) 263 | 264 | next_token_id = dsc.multinomial(probs, num_samples=1) 265 | tok_id_scalar = next_token_id[0, 0] 266 | if tok_id_scalar == self.config.eos_token_id: 267 | print('\n[EOS]', flush=True) 268 | break 269 | 270 | generated_tokens.append(tok_id_scalar) 271 | print(tokenizer.decode(tok_id_scalar, skip_special_tokens=True), end='', flush=True) 272 | 273 | input_ids = next_token_id 274 | position_ids = dsc.tensor([current_len], dtype=dsc.i32, device='cpu').reshape(1, -1) 275 | 276 | # Run forward with caching 277 | logits, next_past_key_values = self( 278 | input_ids, 279 | position_ids=position_ids, 280 | past_key_values=past_key_values 281 | ) 282 | past_key_values = 
next_past_key_values 283 | next_token_logits = logits[:, -1, :] # Note: this is probably useless 284 | current_len += 1 285 | 286 | generation_stop = perf_counter() 287 | total_processing_ms = (generation_stop - prompt_processing_start) * 1e3 288 | generation_processing_ms = (generation_stop - generation_start) * 1e3 289 | print() 290 | 291 | print(f'prompt processing time\t= {round(prompt_processing_ms, 1)}ms') 292 | print(f'generation time\t\t= {round(generation_processing_ms, 1)} ms | {round(generation_processing_ms / max_new_tokens, 2)} ms/tok') 293 | print(f'total time\t\t= {round(total_processing_ms, 1)} ms | {round(max_new_tokens / (total_processing_ms / 1e3), 2)} tok/s') 294 | return generated_tokens 295 | 296 | 297 | if __name__ == '__main__': 298 | cli = argparse.ArgumentParser(description='QWEN 2.5 inference CLI') 299 | cli.add_argument('prompt', type=str, help='Model prompt') 300 | cli.add_argument('-n', type=int, default=100, help='Tokens to generate (default=100)') 301 | cli.add_argument('-top-k', type=int, default=10, help='Top K sampling (default=10)') 302 | cli.add_argument('--device', choices=['cpu', 'gpu'], default='cpu', help='Device on which to run the model') 303 | cli.add_argument('--dtype', choices=['f32', 'bf16'], default='f32', help='Dtype to use for inference') 304 | 305 | args = cli.parse_args() 306 | 307 | dsc.set_default_device(args.device) 308 | prompt = args.prompt 309 | max_tokens = args.n 310 | top_k = args.top_k 311 | dtype = dsc.f32 312 | if args.dtype == 'bf16': 313 | dtype = dsc.bf16 314 | 315 | print(f'Running model on {args.device} using {dtype}') 316 | 317 | model = Qwen25Model.from_pretrained(dtype=dtype) 318 | tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-Coder-0.5B-Instruct') 319 | messages = [ 320 | {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."}, 321 | {"role": "user", "content": prompt} 322 | ] 323 | tokens = tokenizer.apply_chat_template( 324 | messages, 325 | tokenize=False, 326 | add_generation_prompt=True, 327 | ) 328 | model_inputs = tokenizer([tokens], return_tensors="np") 329 | 330 | model_input_ids = dsc.from_numpy(model_inputs.input_ids.astype(np.int32), device='cpu') 331 | 332 | model.generate(model_input_ids, tokenizer, max_new_tokens=max_tokens, top_k=top_k) 333 | -------------------------------------------------------------------------------- /dsc/include/dsc.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024-2025, Christian Gilli 2 | // All rights reserved. 3 | // 4 | // This code is licensed under the terms of the 3-clause BSD license 5 | // (https://opensource.org/license/bsd-3-clause). 6 | 7 | #pragma once 8 | 9 | // =============================================================== // 10 | // =========================== Notepad =========================== // 11 | // =============================================================== // 12 | // (1) It's probably a good idea to add a struct for the arguments // 13 | // of dsc_new_tensor, mixing new params with defaults can lead // 14 | // to nasty bugs // 15 | // (2) Create a macro to validate tensors? It's probably a good // 16 | // idea to always check by defaults both tensor != nullptr and // 17 | // tensor->buf != nullptr // 18 | // (3) Sometimes the context in Python is freed before all the // 19 | // associated tensors are freed. This will SEGFAULT! 
It makes // 20 | // sense to just not free the context in Python for now // 21 | // (4) Evaluate the iterator approach (check codegen with godbolt) // 22 | // (5) Scratch buffer to allocate temporary results on a device // 23 | // (6) Use the same approach to pass shape as `full` // 24 | // =============================================================== // 25 | 26 | #include // getenv, atoi 27 | #include 28 | #include "dsc_dtype.h" 29 | 30 | 31 | #if !defined(DSC_MAX_OBJS) 32 | # define DSC_MAX_OBJS ((int) 1'000) 33 | #endif 34 | 35 | #define DSC_MAX_DEVICES ((int) 2) 36 | #define DSC_DEFAULT_DEVICE CPU 37 | #define DSC_COMPARISON_OPS ((int) 6) 38 | #define DSC_TRACE_NAME_MAX ((int) 32) 39 | #define DSC_TRACE_CAT_MAX ((int) 16) 40 | 41 | #if !defined(DSC_MAX_TRACES_PER_CHUNK) 42 | # define DSC_MAX_TRACES_PER_CHUNK 1'000'000 43 | #endif 44 | 45 | #if !defined(DSC_MAX_CHUNKS) 46 | # define DSC_MAX_CHUNKS 100 47 | #endif 48 | 49 | 50 | static_assert(DSC_MAX_DEVICES == 2, "DSC_MAX_DEVICES != 2 - update the code"); 51 | static_assert(DSC_COMPARISON_OPS == 6, "DSC_COMPARISON_OPS != 6 - update the code"); 52 | 53 | #define DSC_ASSERT(x) \ 54 | do { \ 55 | if (!(x)) { \ 56 | fprintf(stderr, "DSC_ASSERT: %s:%d %s\n", __FILE__, __LINE__, #x); \ 57 | exit(EXIT_FAILURE); \ 58 | } \ 59 | } while(0) 60 | 61 | #define DSC_LOG_FATAL(format, ...) \ 62 | do { \ 63 | fprintf(stderr, "[FATAL] %s: " format "\n", __func__, ##__VA_ARGS__); \ 64 | exit(EXIT_FAILURE); \ 65 | } while (0) 66 | 67 | #if DSC_LOG_LEVEL >= 3 68 | # define DSC_LOG_DEBUG(format, ...) ((void) 0) 69 | # define DSC_LOG_INFO(format, ...) ((void) 0) 70 | # define DSC_LOG_ERR(format, ...) ((void) 0) 71 | #elif DSC_LOG_LEVEL >= 2 72 | # define DSC_LOG_DEBUG(format, ...) ((void) 0) 73 | # define DSC_LOG_INFO(format, ...) ((void) 0) 74 | # define DSC_LOG_ERR(format, ...) fprintf(stderr, "[ERROR] %s: " format"\n",__func__, ##__VA_ARGS__) 75 | #elif DSC_LOG_LEVEL >= 1 76 | # define DSC_LOG_DEBUG(format, ...) ((void) 0) 77 | # define DSC_LOG_INFO(format, ...) fprintf(stdout, "[INFO ] %s: " format"\n",__func__, ##__VA_ARGS__) 78 | # define DSC_LOG_ERR(format, ...) fprintf(stderr, "[ERROR] %s: " format"\n",__func__, ##__VA_ARGS__) 79 | #else 80 | # define DSC_LOG_DEBUG(format, ...) fprintf(stdout, "[DEBUG] %s: " format"\n",__func__, ##__VA_ARGS__) 81 | # define DSC_LOG_INFO(format, ...) fprintf(stdout, "[INFO ] %s: " format"\n",__func__, ##__VA_ARGS__) 82 | # define DSC_LOG_ERR(format, ...) fprintf(stderr, "[ERROR] %s: " format"\n",__func__, ##__VA_ARGS__) 83 | #endif 84 | 85 | #define DSC_INVALID_CASE(format, ...) \ 86 | default: \ 87 | DSC_LOG_FATAL(format, ##__VA_ARGS__) 88 | 89 | #define DSC_UNUSED(x) ((void) (x)) 90 | // Compute the next value of X aligned to Y 91 | #define DSC_ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1)) 92 | #define DSC_MAX(x, y) ((x) > (y) ? (x) : (y)) 93 | #define DSC_MIN(x, y) ((x) < (y) ? (x) : (y)) 94 | #define DSC_CEIL(x, y) (((x) + ((y) - 1)) / (y)) 95 | #define DSC_B_TO_KB(b) ((f64) (b) / 1024.) 96 | #define DSC_B_TO_MB(b) ((f64) (b) / (1024. 
* 1024.)) 97 | #define DSC_GB(gb) ((usize) ((gb) * 1024ULL * 1024ULL * 1024ULL)) 98 | #define DSC_MB(mb) ((usize) ((mb) * 1024ULL * 1024ULL)) 99 | #define DSC_KB(kb) ((usize) ((kb) * 1024ULL)) 100 | 101 | // A 'strictly pure' function is one whose return value depends only on its arguments: it must not read 102 | // global variables that may change between calls, and it must not read data through its pointer 103 | // parameters (only the pointer values themselves count as arguments). 104 | // A 'pure' function relaxes this: it may also read global state and the data reachable through its 105 | // pointer parameters, even if those values change between invocations, as long as it has no observable 106 | // side effects other than computing its return value. 107 | #if defined(__NVCC__) 108 | # define DSC_INLINE __forceinline__ 109 | # define DSC_STRICTLY_PURE __attribute__((const)) 110 | # define DSC_PURE __attribute__((pure)) 111 | #elif defined(__HIPCC__) 112 | # define DSC_INLINE __forceinline__ 113 | # define DSC_STRICTLY_PURE __attribute__((const)) 114 | # define DSC_PURE __attribute__((pure)) 115 | #elif defined(__GNUC__) 116 | # define DSC_INLINE inline __attribute__((always_inline)) 117 | # define DSC_STRICTLY_PURE __attribute__((const)) 118 | # define DSC_PURE __attribute__((pure)) 119 | #else 120 | # define DSC_INLINE inline 121 | # define DSC_STRICTLY_PURE 122 | # define DSC_PURE 123 | #endif 124 | 125 | #define DSC_RESTRICT __restrict 126 | 127 | #if !defined(DSC_MAX_DIMS) 128 | # define DSC_MAX_DIMS ((int) 4) 129 | #endif 130 | 131 | static_assert(DSC_MAX_DIMS == 4, "DSC_MAX_DIMS != 4 - update the code"); 132 | 133 | #define DSC_VALUE_NONE INT32_MAX 134 | #define DSC_DATA_ALIAS(T, X) T *X##_data = (T *) (X)->buf->data 135 | #define DSC_DATA(T, X) T *DSC_RESTRICT X##_data = (T *) (X)->buf->data 136 | 137 | #define dsc_tensor_dim_idx(X, dim) (((dim) < 0) ? (DSC_MAX_DIMS + (dim)) : (DSC_MAX_DIMS - (X)->n_dim + (dim))) 138 | // Note: dsc_tensor_get_dim() MUST NOT be used with the result of dsc_tensor_dim_idx()!
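// (dsc_tensor_dim_idx() already returns an absolute index into the DSC_MAX_DIMS-wide shape array: with n_dim == 2, both dim == 1 and dim == -1 map to index 3, so feeding that result back through dsc_tensor_get_dim() would remap it again, e.g. 4 - 2 + 3 = 5, past the end of shape.)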
139 | #define dsc_tensor_get_dim(X, dim) ((X)->shape[dsc_tensor_dim_idx((X), (dim))]) 140 | #define dsc_tensor_get_stride(X, dim) ((X)->stride[dsc_tensor_dim_idx((X), (dim))]) 141 | #define dsc_new_like(CTX, X) (dsc_new_tensor((CTX), (X)->n_dim, &dsc_tensor_get_dim(X, 0), (X)->dtype, (X)->device)) 142 | #define dsc_copy_of(CTX, X, dev) (dsc_new_tensor((CTX), (X)->n_dim, &dsc_tensor_get_dim(X, 0), (X)->dtype, (dev), nullptr, false, (X)->buf->data, (X)->device)) 143 | #define dsc_new_view(CTX, X) (dsc_new_tensor((CTX), (X)->n_dim, &dsc_tensor_get_dim(X, 0), (X)->dtype, (X)->device, (X)->buf)) 144 | #define dsc_for(idx, X) for (int idx = 0; idx < (X)->ne; ++idx) 145 | #define dsc_is_scalar(X) (X)->ne == 1 146 | 147 | #if defined(__cplusplus) 148 | extern "C" { 149 | #endif 150 | 151 | struct dsc_data_buffer; 152 | struct dsc_trace_ctx; 153 | struct dsc_device; 154 | 155 | enum dsc_device_type : i8 { 156 | DEFAULT = -1, 157 | CPU, 158 | GPU, 159 | }; 160 | 161 | enum dsc_gpu_platform : i8 { 162 | NONE = -1, 163 | CUDA, 164 | ROCM, 165 | }; 166 | 167 | static constexpr const char *DSC_DEVICE_NAMES[DSC_MAX_DEVICES] = { 168 | "CPU", 169 | "GPU", 170 | }; 171 | 172 | static constexpr const char *DSC_GPU_PLATFORM_NAMES[2] = { 173 | "CUDA", 174 | "ROCm", 175 | }; 176 | 177 | enum dsc_comparison_op : u8 { 178 | EQ, 179 | NE, 180 | LT, 181 | LE, 182 | GT, 183 | GE 184 | }; 185 | 186 | struct dsc_tensor { 187 | // The shape of this tensor, right-aligned. For example a 1D tensor T of 4 elements 188 | // will have dim = [1, 1, 1, 4]. 189 | int shape[DSC_MAX_DIMS]; 190 | // Stride for a given dimension expressed in number of elements. 191 | int stride[DSC_MAX_DIMS]; 192 | dsc_data_buffer *buf; 193 | int ne; 194 | int n_dim; 195 | dsc_dtype dtype; 196 | dsc_device_type device; 197 | }; 198 | 199 | struct dsc_ctx { 200 | dsc_device *devices[DSC_MAX_DEVICES]; 201 | dsc_tensor *tensors; 202 | dsc_device_type default_device; 203 | }; 204 | 205 | struct dsc_slice { 206 | union { 207 | int d[3]; 208 | struct { 209 | int start, stop, step; 210 | }; 211 | }; 212 | }; 213 | 214 | // ============================================================ 215 | // Helpers 216 | 217 | static DSC_INLINE int dsc_get_env(const char *env, int value = 0) { 218 | if (const char *str = std::getenv(env)) { 219 | value = std::atoi(str); 220 | } 221 | 222 | return value; 223 | } 224 | 225 | // ============================================================ 226 | // Initialization 227 | 228 | extern dsc_ctx *dsc_ctx_init(usize mem_size); 229 | 230 | // ============================================================ 231 | // Cleanup/Teardown 232 | 233 | extern void dsc_ctx_free(dsc_ctx *ctx); 234 | 235 | extern void dsc_tensor_free(dsc_ctx *ctx, dsc_tensor *x); 236 | 237 | // ============================================================ 238 | // Utilities 239 | 240 | extern usize dsc_used_mem(dsc_ctx *ctx); 241 | 242 | extern void dsc_print_mem_usage(dsc_ctx *ctx); 243 | 244 | extern void dsc_set_default_device(dsc_ctx *ctx, dsc_device_type device); 245 | 246 | // ============================================================ 247 | // GPU Utilities 248 | 249 | extern dsc_gpu_platform dsc_get_gpu_platform(dsc_ctx *ctx); 250 | 251 | extern void dsc_gpu_set_device(dsc_ctx *ctx, int device); 252 | 253 | extern bool dsc_gpu_available(dsc_ctx *); 254 | 255 | extern int dsc_gpu_devices(dsc_ctx *); 256 | 257 | extern int dsc_gpu_dev_capability(dsc_ctx *, int device); 258 | 259 | extern usize dsc_gpu_dev_mem(dsc_ctx *, int device); 260 | 261 | extern void 
dsc_gpu_sync(dsc_ctx *); 262 | 263 | extern bool dsc_gpu_has_bf16(dsc_ctx *); 264 | 265 | // ============================================================ 266 | // Tracing 267 | 268 | extern bool dsc_tracing_enabled(); 269 | 270 | extern void dsc_insert_trace(dsc_ctx *ctx, 271 | const char *name, 272 | u64 start, 273 | u64 duration); 274 | 275 | extern void dsc_dump_traces(dsc_ctx *ctx); 276 | 277 | // ============================================================ 278 | // Tensor Creation 279 | 280 | extern void dsc_tensor_set_buffer(dsc_ctx *, 281 | dsc_tensor *DSC_RESTRICT x, 282 | dsc_data_buffer *buf); 283 | 284 | // TODO: (1) (2) 285 | extern dsc_tensor *dsc_new_tensor(dsc_ctx *ctx, 286 | int n_dim, 287 | const int *shape, 288 | dsc_dtype dtype, 289 | dsc_device_type device = DEFAULT, 290 | dsc_data_buffer *buf = nullptr, 291 | bool lazy = false, 292 | const void *DSC_RESTRICT data = nullptr, 293 | dsc_device_type data_device = DEFAULT); 294 | 295 | extern dsc_tensor *dsc_view(dsc_ctx *ctx, 296 | const dsc_tensor *x); 297 | 298 | extern dsc_tensor *dsc_tensor_1d(dsc_ctx *ctx, 299 | dsc_dtype dtype, 300 | int dim1, 301 | dsc_device_type device = DEFAULT, 302 | const void *DSC_RESTRICT data = nullptr, 303 | dsc_device_type data_device = DEFAULT); 304 | 305 | extern dsc_tensor *dsc_tensor_2d(dsc_ctx *ctx, 306 | dsc_dtype dtype, 307 | int dim1, int dim2, 308 | dsc_device_type device = DEFAULT, 309 | const void *DSC_RESTRICT data = nullptr, 310 | dsc_device_type data_device = DEFAULT); 311 | 312 | extern dsc_tensor *dsc_tensor_3d(dsc_ctx *ctx, 313 | dsc_dtype dtype, 314 | int dim1, int dim2, 315 | int dim3, 316 | dsc_device_type device = DEFAULT, 317 | const void *DSC_RESTRICT data = nullptr, 318 | dsc_device_type data_device = DEFAULT); 319 | 320 | extern dsc_tensor *dsc_tensor_4d(dsc_ctx *ctx, 321 | dsc_dtype dtype, 322 | int dim1, int dim2, 323 | int dim3, int dim4, 324 | dsc_device_type device = DEFAULT, 325 | const void *DSC_RESTRICT data = nullptr, 326 | dsc_device_type data_device = DEFAULT); 327 | 328 | extern dsc_tensor *dsc_wrap_bool(dsc_ctx *ctx, 329 | bool val, 330 | dsc_device_type device = DEFAULT); 331 | 332 | extern dsc_tensor *dsc_wrap_i32(dsc_ctx *ctx, 333 | i32 val, 334 | dsc_device_type device = DEFAULT); 335 | 336 | extern dsc_tensor *dsc_wrap_f32(dsc_ctx *ctx, 337 | f32 val, 338 | dsc_device_type device = DEFAULT, 339 | bool as_bf16 = false); 340 | 341 | extern dsc_tensor *dsc_wrap_f64(dsc_ctx *ctx, 342 | f64 val, 343 | dsc_device_type device = DEFAULT); 344 | 345 | extern dsc_tensor *dsc_arange(dsc_ctx *ctx, 346 | f64 stop, 347 | f64 start = 0, 348 | f64 step = 1, 349 | dsc_dtype dtype = I32, 350 | dsc_device_type device = DEFAULT); 351 | 352 | extern dsc_tensor *dsc_repeat(dsc_ctx *ctx, 353 | const dsc_tensor *DSC_RESTRICT x, 354 | int repeats, 355 | int axis = -1); 356 | 357 | extern dsc_tensor *dsc_randn(dsc_ctx *ctx, 358 | int n_dim, 359 | const int *shape, 360 | dsc_dtype dtype = DSC_DEFAULT_TYPE, 361 | dsc_device_type device = DEFAULT); 362 | 363 | extern dsc_tensor *dsc_kth(dsc_ctx *ctx, 364 | const dsc_tensor *DSC_RESTRICT x, 365 | int k); 366 | 367 | extern dsc_tensor *dsc_multinomial(dsc_ctx *ctx, 368 | const dsc_tensor *DSC_RESTRICT x, 369 | int num_samples); 370 | 371 | extern dsc_tensor *dsc_cast(dsc_ctx *ctx, 372 | dsc_tensor *DSC_RESTRICT x, 373 | dsc_dtype new_dtype); 374 | 375 | // ============================================================ 376 | // Tensor Manipulation 377 | 378 | extern void dsc_copy(dsc_ctx *ctx, 379 | dsc_tensor *DSC_RESTRICT x, 380 
| void *DSC_RESTRICT data, 381 | usize nb, 382 | dsc_device_type data_device = DEFAULT); 383 | 384 | extern dsc_tensor *dsc_to(dsc_ctx *ctx, 385 | dsc_tensor *DSC_RESTRICT x, 386 | dsc_device_type new_device); 387 | 388 | extern dsc_tensor *dsc_reshape(dsc_ctx *ctx, 389 | const dsc_tensor *DSC_RESTRICT x, 390 | int dimensions...); 391 | 392 | extern dsc_tensor *dsc_concat(dsc_ctx *ctx, 393 | int axis, 394 | int tensors...); 395 | 396 | extern dsc_tensor *dsc_transpose(dsc_ctx *ctx, 397 | const dsc_tensor *DSC_RESTRICT x, 398 | int axes...); 399 | 400 | extern dsc_tensor *dsc_tril(dsc_ctx *ctx, 401 | const dsc_tensor *DSC_RESTRICT x, 402 | int diagonal = 0, 403 | dsc_tensor *DSC_RESTRICT out = nullptr); 404 | 405 | // ============================================================ 406 | // Indexing and Slicing 407 | // 408 | // All indexing and slicing operations will return a new tensor. 409 | // If the number of indexes passed to dsc_tensor_get_idx is equal to the number of 410 | // dimensions of x then a new tensor will be allocated with a single element, 411 | // the caller must take care of unwrapping it if needed. 412 | extern dsc_tensor *dsc_tensor_get_idx(dsc_ctx *ctx, 413 | const dsc_tensor *DSC_RESTRICT x, 414 | int indexes...); 415 | 416 | extern dsc_tensor *dsc_tensor_get_slice(dsc_ctx *ctx, 417 | const dsc_tensor *DSC_RESTRICT x, 418 | int slices...); 419 | 420 | extern dsc_tensor *dsc_tensor_get_tensor(dsc_ctx *ctx, 421 | const dsc_tensor *DSC_RESTRICT x, 422 | const dsc_tensor *DSC_RESTRICT indexes); 423 | 424 | extern void dsc_tensor_set_idx(dsc_ctx *ctx, 425 | dsc_tensor *DSC_RESTRICT xa, 426 | const dsc_tensor *DSC_RESTRICT xb, 427 | int indexes...); 428 | 429 | extern void dsc_tensor_set_slice(dsc_ctx *ctx, 430 | dsc_tensor *DSC_RESTRICT xa, 431 | const dsc_tensor *DSC_RESTRICT xb, 432 | int slices...); 433 | 434 | // ============================================================ 435 | // Binary Operations 436 | 437 | extern dsc_tensor *dsc_add(dsc_ctx *ctx, 438 | dsc_tensor *xa, 439 | dsc_tensor *xb, 440 | dsc_tensor *out = nullptr); 441 | 442 | extern dsc_tensor *dsc_sub(dsc_ctx *ctx, 443 | dsc_tensor *xa, 444 | dsc_tensor *xb, 445 | dsc_tensor *out = nullptr); 446 | 447 | extern dsc_tensor *dsc_mul(dsc_ctx *ctx, 448 | dsc_tensor *xa, 449 | dsc_tensor *xb, 450 | dsc_tensor *out = nullptr); 451 | 452 | extern dsc_tensor *dsc_div(dsc_ctx *ctx, 453 | dsc_tensor *xa, 454 | dsc_tensor *xb, 455 | dsc_tensor *out = nullptr); 456 | 457 | extern dsc_tensor *dsc_pow(dsc_ctx *ctx, 458 | dsc_tensor *xa, 459 | dsc_tensor *xb, 460 | dsc_tensor *out = nullptr); 461 | 462 | extern dsc_tensor *dsc_matmul(dsc_ctx *ctx, 463 | dsc_tensor *DSC_RESTRICT xa, 464 | dsc_tensor *DSC_RESTRICT xb, 465 | bool trans_b = false, 466 | dsc_tensor *DSC_RESTRICT out = nullptr); 467 | 468 | extern dsc_tensor *dsc_compare(dsc_ctx *ctx, 469 | const dsc_tensor *xa, 470 | const dsc_tensor *xb, 471 | dsc_comparison_op comp, 472 | dsc_tensor *out = nullptr); 473 | 474 | extern void dsc_masked_fill(dsc_ctx *ctx, 475 | dsc_tensor *DSC_RESTRICT x, 476 | const dsc_tensor *DSC_RESTRICT mask, 477 | f64 value); 478 | 479 | extern dsc_tensor *dsc_outer(dsc_ctx *ctx, 480 | dsc_tensor *DSC_RESTRICT xa, 481 | dsc_tensor *DSC_RESTRICT xb, 482 | dsc_tensor *DSC_RESTRICT out = nullptr); 483 | 484 | extern dsc_tensor *dsc_where(dsc_ctx *ctx, 485 | const dsc_tensor *DSC_RESTRICT condition, 486 | const dsc_tensor *DSC_RESTRICT input, 487 | const dsc_tensor *DSC_RESTRICT other, 488 | dsc_tensor *DSC_RESTRICT out = nullptr); 489 | 490 
| // ============================================================ 491 | // Unary Operations 492 | 493 | extern dsc_tensor *dsc_cos(dsc_ctx *ctx, 494 | dsc_tensor *DSC_RESTRICT x, 495 | dsc_tensor *DSC_RESTRICT out = nullptr); 496 | 497 | extern dsc_tensor *dsc_sin(dsc_ctx *ctx, 498 | dsc_tensor *DSC_RESTRICT x, 499 | dsc_tensor *DSC_RESTRICT out = nullptr); 500 | 501 | extern dsc_tensor *dsc_tanh(dsc_ctx *ctx, 502 | dsc_tensor *DSC_RESTRICT x, 503 | dsc_tensor *DSC_RESTRICT out = nullptr); 504 | 505 | extern dsc_tensor *dsc_exp(dsc_ctx *ctx, 506 | dsc_tensor *DSC_RESTRICT x, 507 | dsc_tensor *DSC_RESTRICT out = nullptr); 508 | 509 | extern dsc_tensor *dsc_sqrt(dsc_ctx *ctx, 510 | dsc_tensor *DSC_RESTRICT x, 511 | dsc_tensor *DSC_RESTRICT out = nullptr); 512 | 513 | // ============================================================ 514 | // Unary Operations Along Axis 515 | 516 | extern dsc_tensor *dsc_sum(dsc_ctx *ctx, 517 | dsc_tensor *DSC_RESTRICT x, 518 | dsc_tensor *DSC_RESTRICT out = nullptr, 519 | int axis = -1, 520 | bool keep_dims = true); 521 | 522 | extern dsc_tensor *dsc_max(dsc_ctx *ctx, 523 | dsc_tensor *DSC_RESTRICT x, 524 | dsc_tensor *DSC_RESTRICT out = nullptr, 525 | int axis = -1, 526 | bool keep_dims = true); 527 | 528 | extern dsc_tensor *dsc_min(dsc_ctx *ctx, 529 | dsc_tensor *DSC_RESTRICT x, 530 | dsc_tensor *DSC_RESTRICT out = nullptr, 531 | int axis = -1, 532 | bool keep_dims = true); 533 | 534 | #if defined(__cplusplus) 535 | } 536 | #endif 537 | --------------------------------------------------------------------------------
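The header above exposes a plain, context-driven C-style API. Below is a minimal usage sketch composed only from declarations visible in dsc.h (dsc_ctx_init, dsc_randn, dsc_matmul, dsc_print_mem_usage, dsc_tensor_free, dsc_ctx_free) and the DSC_MB macro; it is not part of the repository, and the context size and reliance on the declared dtype/device defaults are assumptions for illustration.

#include "dsc.h"

int main() {
    // Reserve a fixed arena for tensor allocations (512 MB chosen arbitrarily here).
    dsc_ctx *ctx = dsc_ctx_init(DSC_MB(512));

    // Two random matrices using the declared defaults (DSC_DEFAULT_TYPE, default device).
    const int shape_a[] = {64, 128};
    const int shape_b[] = {128, 32};
    dsc_tensor *a = dsc_randn(ctx, 2, shape_a);
    dsc_tensor *b = dsc_randn(ctx, 2, shape_b);

    // (64, 128) @ (128, 32) -> (64, 32); trans_b and out keep their default values.
    dsc_tensor *c = dsc_matmul(ctx, a, b);

    dsc_print_mem_usage(ctx);

    dsc_tensor_free(ctx, a);
    dsc_tensor_free(ctx, b);
    dsc_tensor_free(ctx, c);
    dsc_ctx_free(ctx);
    return 0;
}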