├── Obsidian_Memory_Transformers.pdf
├── LICENSE
├── include
    ├── cuda_ops.cuh
    └── core
    │   ├── ops
    │       ├── mma_ops.cuh
    │       └── fused_ops.cuh
    │   ├── ltm
    │       ├── compression_gate.cuh
    │       └── memory_bank.cuh
    │   ├── quantization
    │       └── quantizer.cuh
    │   ├── attention
    │       ├── memory_attention.cuh
    │       └── flash_attention.cuh
    │   ├── utils
    │       ├── cuda_utils.cuh
    │       └── tensor.cuh
    │   └── transformer
    │       └── titan_inspired_block.cuh
├── src
    ├── trainer
    │   └── CMakeLists.txt
    ├── cuda_ops.cu
    ├── inference
    │   └── CMakeLists.txt
    ├── models
    │   └── CMakeLists.txt
    └── core
    │   ├── parallel
    │       ├── mpi_utils.cpp
    │       ├── pipeline.cpp
    │       └── tensor_parallel.cpp
    │   ├── ops
    │       ├── mma_ops.cu
    │       └── fused_ops.cu
    │   └── quantization
    │       ├── calibrator.cu
    │       └── quantizer.cu
├── .gitignore
├── python_bindings
    ├── CMakeLists.txt
    ├── setup.py
    ├── ltm
    │   └── __init__.py
    └── src
    │   └── main.cpp
├── CMakeLists.txt
├── tests
    └── CMakeLists.txt
├── CONTRIBUTING.md
├── README.md
└── docs
    ├── design
        └── architecture.md
    ├── performance
        └── optimization.md
    └── usage
        └── guide.md


/Obsidian_Memory_Transformers.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sahibzada-allahyar/Obsidian-Memory-Transformer/HEAD/Obsidian_Memory_Transformers.pdf


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MITc
2 | 
3 | C()OPRvbPPR,IHUULFR.:PUARXPRLUGBU(TTMTAN")HTASSPACAG.NTSHRTHSAWHECICWFEOEE/:THSOTAIVIDAWITHUTARRATFYKNEXPREMPIIUINGNOTMITTTHEARRATIMRHANTABIITFITNFRAPARTIUARPURENNNNNEMENEVTSHATHAUHRORORHTHODERBEABFOAAIMAMAGERHERIABITHETHRINANAIOFONRATTORTOROTHERARIGFRMOUOFORNNOTHTHSFARRHUEORHEAINGSITHEAR


--------------------------------------------------------------------------------
/include/cuda_ops.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cuda_runtime.h>
 4 | 
 5 | namespace cuda_ops {
 6 | 
 7 | // CUDA kernel function to add two arrays
 8 | __global__ void addArrays(const float* a, const float* b, float* c, int size);
 9 | 
10 | // Host function to allocate memory and launch kernel
11 | void vectorAdd(const float* hostA, const float* hostB, float* hostC, int size);
12 | 
13 | } // namespace cuda_ops
14 | 


--------------------------------------------------------------------------------
/src/trainer/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Trainer library configuration
 2 | add_library(ltm_trainer OBJECT
 3 |     trainer.cpp
 4 |     optimizer.cpp
 5 |     scheduler.cpp
 6 |     data_loader.cpp
 7 |     checkpoint.cpp
 8 |     metrics.cpp
 9 |     distributed.cpp
10 |     profiler.cpp
11 | )
12 | 
13 | target_compile_features(ltm_trainer PUBLIC cxx_std_17)
14 | 
15 | set_target_properties(ltm_trainer PROPERTIES
16 |     CUDA_SEPARABLE_COMPILATION ON
17 |     POSITION_INDEPENDENT_CODE ON
18 | )
19 | 
20 | target_include_directories(ltm_trainer PUBLIC
21 |     ${CMAKE_SOURCE_DIR}/include
22 | )
23 | 
24 | target_link_libraries(ltm_trainer PUBLIC
25 |     ltm_core
26 |     ltm_attention
27 |     ltm_memory
28 |     ltm_ops
29 |     ltm_parallel
30 |     ltm_quantization
31 |     ltm_transformer
32 |     ${CUDA_LIBRARIES}
33 |     ${CUDA_CUBLAS_LIBRARIES}
34 |     ${MPI_CXX_LIBRARIES}
35 |     ${NCCL_LIBRARIES}
36 |     ${CUTLASS_LIBRARIES}
37 | )
38 | 
39 | # Trainer executable
40 | add_executable(ltm_train
41 |     main.cpp
42 | )
43 | 
44 | target_link_libraries(ltm_train PRIVATE
45 |     ltm_trainer
46 |     ltm
47 | )
48 | 
49 | # Install targets
50 | install(TARGETS ltm_trainer
51 |     LIBRARY DESTINATION lib
52 |     ARCHIVE DESTINATION lib
53 | )
54 | 
55 | install(TARGETS ltm_train
56 |     RUNTIME DESTINATION bin
57 | )
58 | 
59 | # Configuration files
60 | install(FILES
61 |     ${CMAKE_CURRENT_SOURCE_DIR}/config/default_config.yaml
62 |     ${CMAKE_CURRENT_SOURCE_DIR}/config/distributed_config.yaml
63 |     DESTINATION etc/ltm/trainer
64 | )
65 | 


--------------------------------------------------------------------------------
/src/cuda_ops.cu:
--------------------------------------------------------------------------------
 1 | #include "cuda_ops.cuh"
 2 | #include <cstdio>
 3 | 
 4 | namespace cuda_ops {
 5 | 
 6 | __global__ void addArrays(const float* a, const float* b, float* c, int size) {
 7 |     int idx = blockIdx.x * blockDim.x + threadIdx.x;
 8 |     if (idx < size) {
 9 |         c[idx] = a[idx] + b[idx];
10 |     }
11 | }
12 | 
13 | void vectorAdd(const float* hostA, const float* hostB, float* hostC, int size) {
14 |     // Declare device pointers
15 |     float *deviceA, *deviceB, *deviceC;
16 |     
17 |     // Allocate device memory
18 |     cudaMalloc(&deviceA, size * sizeof(float));
19 |     cudaMalloc(&deviceB, size * sizeof(float));
20 |     cudaMalloc(&deviceC, size * sizeof(float));
21 |     
22 |     // Copy inputs to device
23 |     cudaMemcpy(deviceA, hostA, size * sizeof(float), cudaMemcpyHostToDevice);
24 |     cudaMemcpy(deviceB, hostB, size * sizeof(float), cudaMemcpyHostToDevice);
25 |     
26 |     // Launch kernel
27 |     int threadsPerBlock = 256;
28 |     int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
29 |     addArrays<<<blocksPerGrid, threadsPerBlock>>>(deviceA, deviceB, deviceC, size);
30 |     
31 |     // Copy result back to host
32 |     cudaMemcpy(hostC, deviceC, size * sizeof(float), cudaMemcpyDeviceToHost);
33 |     
34 |     // Free device memory
35 |     cudaFree(deviceA);
36 |     cudaFree(deviceB);
37 |     cudaFree(deviceC);
38 |     
39 |     // Check for errors
40 |     cudaError_t error = cudaGetLastError();
41 |     if (error != cudaSuccess) {
42 |         printf("CUDA error: %s\n", cudaGetErrorString(error));
43 |     }
44 | }
45 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Build directories
  2 | build/
  3 | dist/
  4 | _build/
  5 | cmake-build-*/
  6 | 
  7 | # Python
  8 | __pycache__/
  9 | *.py[cod]
 10 | *$py.class
 11 | *.so
 12 | .Python
 13 | develop-eggs/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # C++
 29 | # Prerequisites
 30 | *.d
 31 | 
 32 | # Compiled Object files
 33 | *.slo
 34 | *.lo
 35 | *.o
 36 | *.obj
 37 | 
 38 | # Precompiled Headers
 39 | *.gch
 40 | *.pch
 41 | 
 42 | # Compiled Dynamic libraries
 43 | *.so
 44 | *.dylib
 45 | *.dll
 46 | 
 47 | # Compiled Static libraries
 48 | *.lai
 49 | *.la
 50 | *.a
 51 | *.lib
 52 | 
 53 | # Executables
 54 | *.exe
 55 | *.out
 56 | *.app
 57 | 
 58 | # CUDA
 59 | *.i
 60 | *.ii
 61 | *.gpu
 62 | *.ptx
 63 | *.cubin
 64 | *.fatbin
 65 | 
 66 | # CMake
 67 | CMakeCache.txt
 68 | CMakeFiles
 69 | CMakeScripts
 70 | Testing
 71 | Makefile
 72 | cmake_install.cmake
 73 | install_manifest.txt
 74 | compile_commands.json
 75 | CTestTestfile.cmake
 76 | 
 77 | # IDE
 78 | .idea/
 79 | .vscode/
 80 | *.swp
 81 | *.swo
 82 | *~
 83 | 
 84 | # Environment
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Logs and databases
 94 | *.log
 95 | *.sqlite
 96 | *.db
 97 | 
 98 | # Model checkpoints and data
 99 | checkpoints/
100 | data/
101 | *.bin
102 | *.pt
103 | *.pth
104 | *.ckpt
105 | *.h5
106 | *.hdf5
107 | *.npz
108 | *.npy
109 | 
110 | # Documentation
111 | docs/_build/
112 | site/
113 | 
114 | # OS generated files
115 | .DS_Store
116 | .DS_Store?
117 | ._*
118 | .Spotlight-V100
119 | .Trashes
120 | ehthumbs.db
121 | Thumbs.db
122 | 
123 | # Project specific
124 | third_party/
125 | *.config.json
126 | *.yaml
127 | !config/*.yaml
128 | *.cache
129 | wandb/
130 | runs/
131 | outputs/
132 | results/
133 | tmp/
134 | temp/
135 | 
136 | # Profiling and debugging
137 | *.nvvp
138 | *.nsys-rep
139 | *.qdrep
140 | *.sqlite
141 | core.*
142 | *.trace
143 | *.profile
144 | 
145 | # Dependencies
146 | node_modules/
147 | jspm_packages/
148 | bower_components/
149 | 
150 | # Coverage reports
151 | htmlcov/
152 | .tox/
153 | .coverage
154 | .coverage.*
155 | .cache
156 | nosetests.xml
157 | coverage.xml
158 | *.cover
159 | .hypothesis/
160 | .pytest_cache/
161 | 
162 | # Jupyter Notebook
163 | .ipynb_checkpoints
164 | *.ipynb
165 | 
166 | # Distribution / packaging
167 | .Python
168 | build/
169 | develop-eggs/
170 | dist/
171 | downloads/
172 | eggs/
173 | .eggs/
174 | lib/
175 | lib64/
176 | parts/
177 | sdist/
178 | var/
179 | wheels/
180 | *.egg-info/
181 | .installed.cfg
182 | *.egg
183 | MANIFEST
184 | 


--------------------------------------------------------------------------------
/python_bindings/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Python bindings configuration
 2 | pybind11_add_module(ltm_python
 3 |     src/main.cpp
 4 |     src/model.cpp
 5 |     src/trainer.cpp
 6 |     src/inference.cpp
 7 | )
 8 | 
 9 | target_compile_features(ltm_python PRIVATE cxx_std_17)
10 | 
11 | target_include_directories(ltm_python PRIVATE
12 |     ${CMAKE_CURRENT_SOURCE_DIR}/../include
13 |     ${pybind11_INCLUDE_DIRS}
14 |     ${PYTHON_INCLUDE_DIRS}
15 | )
16 | 
17 | # Link against our main library and its dependencies
18 | target_link_libraries(ltm_python PRIVATE
19 |     ltm
20 |     ${CUDA_LIBRARIES}
21 |     ${CUDA_CUBLAS_LIBRARIES}
22 |     ${MPI_CXX_LIBRARIES}
23 |     ${NCCL_LIBRARIES}
24 |     ${CUTLASS_LIBRARIES}
25 | )
26 | 
27 | # Set output directory for the Python module
28 | set_target_properties(ltm_python PROPERTIES
29 |     LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/ltm
30 |     OUTPUT_NAME "_ltm"  # The module will be imported as ltm._ltm
31 |     PREFIX "${PYTHON_MODULE_PREFIX}"
32 |     SUFFIX "${PYTHON_MODULE_EXTENSION}"
33 | )
34 | 
35 | # Install Python package
36 | install(TARGETS ltm_python
37 |     LIBRARY DESTINATION ${PYTHON_SITE_PACKAGES}/ltm
38 |     ARCHIVE DESTINATION ${PYTHON_SITE_PACKAGES}/ltm
39 | )
40 | 
41 | install(FILES
42 |     ltm/__init__.py
43 |     ltm/model.py
44 |     ltm/trainer.py
45 |     ltm/inference.py
46 |     DESTINATION ${PYTHON_SITE_PACKAGES}/ltm
47 | )
48 | 
49 | # Copy Python files to build directory
50 | add_custom_command(TARGET ltm_python POST_BUILD
51 |     COMMAND ${CMAKE_COMMAND} -E copy_if_different
52 |         ${CMAKE_CURRENT_SOURCE_DIR}/ltm/__init__.py
53 |         ${CMAKE_CURRENT_SOURCE_DIR}/ltm/model.py
54 |         ${CMAKE_CURRENT_SOURCE_DIR}/ltm/trainer.py
55 |         ${CMAKE_CURRENT_SOURCE_DIR}/ltm/inference.py
56 |         ${CMAKE_CURRENT_BINARY_DIR}/ltm/
57 | )
58 | 
59 | # Setup.py configuration
60 | configure_file(
61 |     ${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
62 |     ${CMAKE_CURRENT_SOURCE_DIR}/setup.py
63 |     @ONLY
64 | )
65 | 
66 | # Build wheel package
67 | add_custom_target(wheel
68 |     COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
69 |     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
70 |     DEPENDS ltm_python
71 | )
72 | 
73 | # Development mode installation
74 | add_custom_target(develop
75 |     COMMAND ${PYTHON_EXECUTABLE} setup.py develop
76 |     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
77 |     DEPENDS ltm_python
78 | )
79 | 
80 | # Clean Python build artifacts
81 | add_custom_target(clean_python
82 |     COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_CURRENT_SOURCE_DIR}/build
83 |     COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_CURRENT_SOURCE_DIR}/dist
84 |     COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_CURRENT_SOURCE_DIR}/ltm.egg-info
85 |     COMMAND ${CMAKE_COMMAND} -E remove_directory ${CMAKE_CURRENT_SOURCE_DIR}/ltm/__pycache__
86 | )
87 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | cmake_minimum_required(VERSION 3.18)
  2 | project(ltm LANGUAGES CXX CUDA)
  3 | 
  4 | # Set C++ standard
  5 | set(CMAKE_CXX_STANDARD 17)
  6 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
  7 | set(CMAKE_CUDA_STANDARD 17)
  8 | set(CMAKE_CUDA_STANDARD_REQUIRED ON)
  9 | 
 10 | # Set default build type to Release
 11 | if(NOT CMAKE_BUILD_TYPE)
 12 |     set(CMAKE_BUILD_TYPE Release)
 13 | endif()
 14 | 
 15 | # CUDA configuration
 16 | find_package(CUDA REQUIRED)
 17 | set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86)  # Support Volta, Turing, Ampere
 18 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -Xcompiler -Wextra")
 19 | 
 20 | # Dependencies
 21 | find_package(MPI REQUIRED)
 22 | find_package(NCCL REQUIRED)
 23 | find_package(pybind11 REQUIRED)
 24 | find_package(CUTLASS REQUIRED)
 25 | 
 26 | # Set include directories
 27 | include_directories(
 28 |     ${CMAKE_CURRENT_SOURCE_DIR}/include
 29 |     ${CUDA_INCLUDE_DIRS}
 30 |     ${MPI_INCLUDE_PATH}
 31 |     ${NCCL_INCLUDE_DIRS}
 32 |     ${pybind11_INCLUDE_DIRS}
 33 |     ${CUTLASS_INCLUDE_DIRS}
 34 | )
 35 | 
 36 | # Compiler flags
 37 | if(CMAKE_BUILD_TYPE STREQUAL "Debug")
 38 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0")
 39 |     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G -O0")
 40 | else()
 41 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
 42 |     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")
 43 | endif()
 44 | 
 45 | # Enable fast math for CUDA
 46 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math")
 47 | 
 48 | # Add subdirectories
 49 | add_subdirectory(src)
 50 | add_subdirectory(python_bindings)
 51 | add_subdirectory(tests)
 52 | 
 53 | # Main library target
 54 | add_library(ltm STATIC
 55 |     $<TARGET_OBJECTS:ltm_core>
 56 |     $<TARGET_OBJECTS:ltm_attention>
 57 |     $<TARGET_OBJECTS:ltm_memory>
 58 |     $<TARGET_OBJECTS:ltm_ops>
 59 |     $<TARGET_OBJECTS:ltm_parallel>
 60 |     $<TARGET_OBJECTS:ltm_quantization>
 61 |     $<TARGET_OBJECTS:ltm_transformer>
 62 | )
 63 | 
 64 | target_link_libraries(ltm PUBLIC
 65 |     ${CUDA_LIBRARIES}
 66 |     ${CUDA_CUBLAS_LIBRARIES}
 67 |     ${CUDA_CUSPARSE_LIBRARIES}
 68 |     ${CUDA_curand_LIBRARY}
 69 |     ${MPI_CXX_LIBRARIES}
 70 |     ${NCCL_LIBRARIES}
 71 |     ${CUTLASS_LIBRARIES}
 72 | )
 73 | 
 74 | # Installation
 75 | install(TARGETS ltm
 76 |     LIBRARY DESTINATION lib
 77 |     ARCHIVE DESTINATION lib
 78 |     RUNTIME DESTINATION bin
 79 | )
 80 | 
 81 | install(DIRECTORY include/
 82 |     DESTINATION include
 83 |     FILES_MATCHING PATTERN "*.h*"
 84 | )
 85 | 
 86 | # Package configuration
 87 | include(CMakePackageConfigHelpers)
 88 | write_basic_package_version_file(
 89 |     "${CMAKE_CURRENT_BINARY_DIR}/ltmConfigVersion.cmake"
 90 |     VERSION 0.1.0
 91 |     COMPATIBILITY SameMajorVersion
 92 | )
 93 | 
 94 | install(FILES
 95 |     "${CMAKE_CURRENT_BINARY_DIR}/ltmConfigVersion.cmake"
 96 |     DESTINATION lib/cmake/ltm
 97 | )
 98 | 
 99 | # Export targets
100 | export(TARGETS ltm FILE "${CMAKE_CURRENT_BINARY_DIR}/ltmTargets.cmake")
101 | install(EXPORT ltmTargets
102 |     FILE ltmTargets.cmake
103 |     NAMESPACE ltm::
104 |     DESTINATION lib/cmake/ltm
105 | )
106 | 


--------------------------------------------------------------------------------
/python_bindings/setup.py:
--------------------------------------------------------------------------------
  1 | from setuptools import setup, find_packages
  2 | from setuptools.dist import Distribution
  3 | import os
  4 | import sys
  5 | 
  6 | class BinaryDistribution(Distribution):
  7 |     """Distribution which always forces a binary package"""
  8 |     def has_ext_modules(self):
  9 |         return True
 10 | 
 11 | def get_version():
 12 |     """Get version from version.h"""
 13 |     with open(os.path.join(os.path.dirname(__file__), '..', 'version.h'), 'r') as f:
 14 |         for line in f:
 15 |             if line.startswith('#define LTM_VERSION'):
 16 |                 return line.split('"')[1]
 17 |     return '0.1.0'  # Default version if not found
 18 | 
 19 | # Check Python version
 20 | if sys.version_info < (3, 7):
 21 |     sys.exit('Python >= 3.7 is required')
 22 | 
 23 | # Get long description from README
 24 | with open(os.path.join(os.path.dirname(__file__), '..', 'README.md'), 'r', encoding='utf-8') as f:
 25 |     long_description = f.read()
 26 | 
 27 | setup(
 28 |     name='ltm-transformer',
 29 |     version=get_version(),
 30 |     author='Sahibzada Allahyar',
 31 |     author_email='allahyar@singularityresearch.org',
 32 |     description='Long-term Memory Transformer with Titan-inspired architecture',
 33 |     long_description=long_description,
 34 |     long_description_content_type='text/markdown',
 35 |     url='https://github.com/singularityresearch/ltm-transformer',
 36 |     packages=find_packages(),
 37 |     package_data={
 38 |         'ltm': ['*.so', '*.pyd', 'config/*.yaml'],
 39 |     },
 40 |     distclass=BinaryDistribution,
 41 |     python_requires='>=3.7',
 42 |     install_requires=[
 43 |         'numpy>=1.19.0',
 44 |         'torch>=1.9.0',
 45 |         'pyyaml>=5.1',
 46 |         'tqdm>=4.45.0',
 47 |         'tensorboard>=2.4.0',
 48 |         'transformers>=4.5.0',
 49 |         'datasets>=1.6.0',
 50 |         'sentencepiece>=0.1.96',
 51 |         'tokenizers>=0.10.3',
 52 |         'wandb>=0.12.0',
 53 |         'pytest>=6.0.0',
 54 |         'pytest-benchmark>=3.4.0',
 55 |     ],
 56 |     extras_require={
 57 |         'dev': [
 58 |             'black',
 59 |             'isort',
 60 |             'flake8',
 61 |             'mypy',
 62 |             'pytest-cov',
 63 |             'sphinx',
 64 |             'sphinx-rtd-theme',
 65 |         ],
 66 |         'distributed': [
 67 |             'mpi4py>=3.0.0',
 68 |             'horovod>=0.21.0',
 69 |         ],
 70 |         'quantization': [
 71 |             'onnx>=1.9.0',
 72 |             'onnxruntime>=1.8.0',
 73 |         ],
 74 |         'serving': [
 75 |             'fastapi>=0.65.0',
 76 |             'uvicorn>=0.14.0',
 77 |             'grpcio>=1.38.0',
 78 |             'grpcio-tools>=1.38.0',
 79 |         ],
 80 |     },
 81 |     classifiers=[
 82 |         'Development Status :: 4 - Beta',
 83 |         'Intended Audience :: Science/Research',
 84 |         'License :: OSI Approved :: Apache Software License',
 85 |         'Operating System :: POSIX :: Linux',
 86 |         'Programming Language :: Python :: 3',
 87 |         'Programming Language :: Python :: 3.7',
 88 |         'Programming Language :: Python :: 3.8',
 89 |         'Programming Language :: Python :: 3.9',
 90 |         'Programming Language :: Python :: 3.10',
 91 |         'Topic :: Scientific/Engineering :: Artificial Intelligence',
 92 |     ],
 93 |     entry_points={
 94 |         'console_scripts': [
 95 |             'ltm-train=ltm.trainer:main',
 96 |             'ltm-infer=ltm.inference:main',
 97 |             'ltm-serve=ltm.server:main',
 98 |             'ltm-convert=ltm.tools.convert:main',
 99 |             'ltm-benchmark=ltm.tools.benchmark:main',
100 |         ],
101 |     },
102 |     project_urls={
103 |         'Documentation': 'https://ltm-transformer.readthedocs.io/',
104 |         'Source': 'https://github.com/singularityresearch/ltm-transformer',
105 |         'Tracker': 'https://github.com/singularityresearch/ltm-transformer/issues',
106 |     },
107 | )
108 | 


--------------------------------------------------------------------------------
/src/inference/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | # Inference library configuration
  2 | add_library(ltm_inference OBJECT
  3 |     inference.cpp
  4 |     model_loader.cpp
  5 |     tokenizer.cpp
  6 |     cache_manager.cpp
  7 |     batch_processor.cpp
  8 |     pipeline.cpp
  9 |     quantized_inference.cpp
 10 |     tensor_parallel_inference.cpp
 11 | )
 12 | 
 13 | target_compile_features(ltm_inference PUBLIC cxx_std_17)
 14 | 
 15 | set_target_properties(ltm_inference PROPERTIES
 16 |     CUDA_SEPARABLE_COMPILATION ON
 17 |     POSITION_INDEPENDENT_CODE ON
 18 | )
 19 | 
 20 | target_include_directories(ltm_inference PUBLIC
 21 |     ${CMAKE_SOURCE_DIR}/include
 22 | )
 23 | 
 24 | target_link_libraries(ltm_inference PUBLIC
 25 |     ltm_core
 26 |     ltm_attention
 27 |     ltm_memory
 28 |     ltm_ops
 29 |     ltm_parallel
 30 |     ltm_quantization
 31 |     ltm_transformer
 32 |     ${CUDA_LIBRARIES}
 33 |     ${CUDA_CUBLAS_LIBRARIES}
 34 |     ${MPI_CXX_LIBRARIES}
 35 |     ${NCCL_LIBRARIES}
 36 |     ${CUTLASS_LIBRARIES}
 37 | )
 38 | 
 39 | # Inference executable
 40 | add_executable(ltm_infer
 41 |     main.cpp
 42 | )
 43 | 
 44 | target_link_libraries(ltm_infer PRIVATE
 45 |     ltm_inference
 46 |     ltm
 47 | )
 48 | 
 49 | # Server dependencies
 50 | find_package(Boost REQUIRED COMPONENTS system thread)
 51 | find_package(OpenSSL REQUIRED)
 52 | find_package(gRPC CONFIG REQUIRED)
 53 | find_package(Protobuf REQUIRED)
 54 | 
 55 | # Generate gRPC/Protobuf files
 56 | protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS
 57 |     ${CMAKE_CURRENT_SOURCE_DIR}/proto/inference.proto
 58 | )
 59 | protobuf_generate_grpc_cpp(GRPC_SRCS GRPC_HDRS
 60 |     ${CMAKE_CURRENT_SOURCE_DIR}/proto/inference.proto
 61 | )
 62 | 
 63 | # Server executable
 64 | add_executable(ltm_server
 65 |     server.cpp
 66 |     http_handler.cpp
 67 |     websocket_handler.cpp
 68 |     grpc_service.cpp
 69 |     ${PROTO_SRCS}
 70 |     ${PROTO_HDRS}
 71 |     ${GRPC_SRCS}
 72 |     ${GRPC_HDRS}
 73 | )
 74 | 
 75 | target_include_directories(ltm_server PRIVATE
 76 |     ${CMAKE_CURRENT_BINARY_DIR}  # For generated protobuf files
 77 |     ${Boost_INCLUDE_DIRS}
 78 |     ${OPENSSL_INCLUDE_DIR}
 79 |     ${Protobuf_INCLUDE_DIRS}
 80 |     ${gRPC_INCLUDE_DIRS}
 81 | )
 82 | 
 83 | target_link_libraries(ltm_server PRIVATE
 84 |     ltm_inference
 85 |     ltm
 86 |     Boost::system
 87 |     Boost::thread
 88 |     OpenSSL::SSL
 89 |     OpenSSL::Crypto
 90 |     gRPC::grpc++
 91 |     gRPC::grpc++_reflection
 92 |     protobuf::libprotobuf
 93 | )
 94 | 
 95 | # Install targets
 96 | install(TARGETS ltm_inference
 97 |     LIBRARY DESTINATION lib
 98 |     ARCHIVE DESTINATION lib
 99 | )
100 | 
101 | install(TARGETS ltm_infer ltm_server
102 |     RUNTIME DESTINATION bin
103 | )
104 | 
105 | # Install configuration files
106 | install(FILES
107 |     ${CMAKE_CURRENT_SOURCE_DIR}/config/inference_config.yaml
108 |     ${CMAKE_CURRENT_SOURCE_DIR}/config/server_config.yaml
109 |     DESTINATION etc/ltm/inference
110 | )
111 | 
112 | # Install protocol files
113 | install(FILES
114 |     ${CMAKE_CURRENT_SOURCE_DIR}/proto/inference.proto
115 |     DESTINATION share/ltm/proto
116 | )
117 | 
118 | # Install API documentation
119 | install(FILES
120 |     ${CMAKE_CURRENT_SOURCE_DIR}/docs/api.md
121 |     ${CMAKE_CURRENT_SOURCE_DIR}/docs/server.md
122 |     DESTINATION share/doc/ltm/inference
123 | )
124 | 
125 | # Create version file
126 | configure_file(
127 |     ${CMAKE_CURRENT_SOURCE_DIR}/version.h.in
128 |     ${CMAKE_CURRENT_BINARY_DIR}/version.h
129 | )
130 | 
131 | # Add version information to server
132 | target_include_directories(ltm_server PRIVATE
133 |     ${CMAKE_CURRENT_BINARY_DIR}
134 | )
135 | 
136 | # Deployment scripts
137 | install(PROGRAMS
138 |     ${CMAKE_CURRENT_SOURCE_DIR}/scripts/deploy_server.sh
139 |     ${CMAKE_CURRENT_SOURCE_DIR}/scripts/monitor_server.sh
140 |     DESTINATION bin
141 | )
142 | 
143 | # Docker support
144 | configure_file(
145 |     ${CMAKE_CURRENT_SOURCE_DIR}/Dockerfile.in
146 |     ${CMAKE_CURRENT_BINARY_DIR}/Dockerfile
147 |     @ONLY
148 | )
149 | 
150 | # Add custom target for building Docker image
151 | add_custom_target(docker_image
152 |     COMMAND docker build -t ltm-server:${LTM_VERSION} .
153 |     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
154 |     DEPENDS ltm_server
155 |     COMMENT "Building Docker image for LTM server"
156 | )
157 | 


--------------------------------------------------------------------------------
/src/models/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | # Models library configuration
  2 | add_library(ltm_models OBJECT
  3 |     config/model_config.cpp
  4 |     config/training_config.cpp
  5 |     config/inference_config.cpp
  6 |     config/distributed_config.cpp
  7 |     config/yaml_parser.cpp
  8 |     model_factory.cpp
  9 |     model_registry.cpp
 10 |     checkpointing.cpp
 11 |     base_model.cpp
 12 |     titan_model.cpp
 13 |     titan_config.cpp
 14 |     model_utils.cpp
 15 | )
 16 | 
 17 | target_compile_features(ltm_models PUBLIC cxx_std_17)
 18 | 
 19 | set_target_properties(ltm_models PROPERTIES
 20 |     CUDA_SEPARABLE_COMPILATION ON
 21 |     POSITION_INDEPENDENT_CODE ON
 22 | )
 23 | 
 24 | target_include_directories(ltm_models PUBLIC
 25 |     ${CMAKE_SOURCE_DIR}/include
 26 | )
 27 | 
 28 | # Find YAML library for config parsing
 29 | find_package(yaml-cpp REQUIRED)
 30 | 
 31 | target_link_libraries(ltm_models PUBLIC
 32 |     ltm_core
 33 |     ltm_attention
 34 |     ltm_memory
 35 |     ltm_ops
 36 |     ltm_parallel
 37 |     ltm_quantization
 38 |     ltm_transformer
 39 |     yaml-cpp
 40 |     ${CUDA_LIBRARIES}
 41 |     ${CUDA_CUBLAS_LIBRARIES}
 42 |     ${MPI_CXX_LIBRARIES}
 43 |     ${NCCL_LIBRARIES}
 44 |     ${CUTLASS_LIBRARIES}
 45 | )
 46 | 
 47 | # Model configuration files
 48 | set(MODEL_CONFIG_DIR ${CMAKE_CURRENT_SOURCE_DIR}/config)
 49 | set(MODEL_CONFIG_FILES
 50 |     ${MODEL_CONFIG_DIR}/titan_base.yaml
 51 |     ${MODEL_CONFIG_DIR}/titan_small.yaml
 52 |     ${MODEL_CONFIG_DIR}/titan_medium.yaml
 53 |     ${MODEL_CONFIG_DIR}/titan_large.yaml
 54 |     ${MODEL_CONFIG_DIR}/titan_xl.yaml
 55 | )
 56 | 
 57 | # Install model library
 58 | install(TARGETS ltm_models
 59 |     LIBRARY DESTINATION lib
 60 |     ARCHIVE DESTINATION lib
 61 | )
 62 | 
 63 | # Install model configuration files
 64 | install(FILES ${MODEL_CONFIG_FILES}
 65 |     DESTINATION etc/ltm/models
 66 | )
 67 | 
 68 | # Install model schema files
 69 | install(FILES
 70 |     ${MODEL_CONFIG_DIR}/schema/model_schema.json
 71 |     ${MODEL_CONFIG_DIR}/schema/training_schema.json
 72 |     ${MODEL_CONFIG_DIR}/schema/inference_schema.json
 73 |     DESTINATION etc/ltm/models/schema
 74 | )
 75 | 
 76 | # Generate model configuration header
 77 | set(MODEL_CONFIG_TEMPLATE ${CMAKE_CURRENT_SOURCE_DIR}/config/model_config.h.in)
 78 | set(MODEL_CONFIG_OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/model_config.h)
 79 | 
 80 | configure_file(${MODEL_CONFIG_TEMPLATE} ${MODEL_CONFIG_OUTPUT})
 81 | 
 82 | target_include_directories(ltm_models PRIVATE
 83 |     ${CMAKE_CURRENT_BINARY_DIR}  # For generated config header
 84 | )
 85 | 
 86 | # Model registry configuration
 87 | set(MODEL_REGISTRY_FILE ${CMAKE_CURRENT_BINARY_DIR}/model_registry.cpp)
 88 | add_custom_command(
 89 |     OUTPUT ${MODEL_REGISTRY_FILE}
 90 |     COMMAND ${CMAKE_COMMAND}
 91 |         -DCONFIG_DIR=${MODEL_CONFIG_DIR}
 92 |         -DOUTPUT_FILE=${MODEL_REGISTRY_FILE}
 93 |         -P ${CMAKE_CURRENT_SOURCE_DIR}/scripts/generate_registry.cmake
 94 |     DEPENDS ${MODEL_CONFIG_FILES}
 95 |     COMMENT "Generating model registry"
 96 | )
 97 | 
 98 | # Add model registry to sources
 99 | target_sources(ltm_models PRIVATE
100 |     ${MODEL_REGISTRY_FILE}
101 | )
102 | 
103 | # Model documentation
104 | install(FILES
105 |     ${CMAKE_CURRENT_SOURCE_DIR}/docs/models.md
106 |     ${CMAKE_CURRENT_SOURCE_DIR}/docs/configurations.md
107 |     ${CMAKE_CURRENT_SOURCE_DIR}/docs/checkpointing.md
108 |     DESTINATION share/doc/ltm/models
109 | )
110 | 
111 | # Model utilities
112 | install(PROGRAMS
113 |     ${CMAKE_CURRENT_SOURCE_DIR}/scripts/convert_checkpoint.py
114 |     ${CMAKE_CURRENT_SOURCE_DIR}/scripts/analyze_model.py
115 |     ${CMAKE_CURRENT_SOURCE_DIR}/scripts/validate_config.py
116 |     DESTINATION bin
117 | )
118 | 
119 | # Testing support
120 | if(BUILD_TESTING)
121 |     add_subdirectory(tests)
122 | endif()
123 | 
124 | # Python bindings support
125 | if(BUILD_PYTHON_BINDINGS)
126 |     add_subdirectory(python)
127 | endif()
128 | 
129 | # Version information
130 | set(LTM_MODELS_VERSION_MAJOR 0)
131 | set(LTM_MODELS_VERSION_MINOR 1)
132 | set(LTM_MODELS_VERSION_PATCH 0)
133 | 
134 | configure_file(
135 |     ${CMAKE_CURRENT_SOURCE_DIR}/version.h.in
136 |     ${CMAKE_CURRENT_BINARY_DIR}/version.h
137 | )
138 | 
139 | # Export version information
140 | set(LTM_MODELS_VERSION 
141 |     "${LTM_MODELS_VERSION_MAJOR}.${LTM_MODELS_VERSION_MINOR}.${LTM_MODELS_VERSION_PATCH}"
142 |     PARENT_SCOPE
143 | )
144 | 


--------------------------------------------------------------------------------
/python_bindings/ltm/__init__.py:
--------------------------------------------------------------------------------
  1 | """
  2 | LTM Transformer: Long-term Memory Transformer with Titan-inspired architecture
  3 | """
  4 | 
  5 | from ltm.model import (
  6 |     TitanModel,
  7 |     TitanConfig,
  8 |     TitanForCausalLM,
  9 |     TitanForSequenceClassification,
 10 |     TitanForTokenClassification,
 11 |     TitanPreTrainedModel,
 12 | )
 13 | 
 14 | from ltm.trainer import (
 15 |     Trainer,
 16 |     TrainingArguments,
 17 |     DataCollator,
 18 |     DistributedTrainer,
 19 |     TrainerCallback,
 20 |     EarlyStoppingCallback,
 21 |     TensorBoardCallback,
 22 |     WandBCallback,
 23 | )
 24 | 
 25 | from ltm.inference import (
 26 |     InferenceEngine,
 27 |     InferenceConfig,
 28 |     BatchProcessor,
 29 |     CacheManager,
 30 |     QuantizedEngine,
 31 |     TensorParallelEngine,
 32 | )
 33 | 
 34 | # Version information
 35 | __version__ = "0.1.0"
 36 | __author__ = "Sahibzada Allahyar"
 37 | __author_email__ = "allahyar@singularityresearch.org"
 38 | __license__ = "Apache License 2.0"
 39 | __copyright__ = "Copyright 2025 Singularity Research"
 40 | __homepage__ = "https://github.com/singularityresearch/ltm-transformer"
 41 | __docs__ = "https://ltm-transformer.readthedocs.io/"
 42 | 
 43 | # Module level configuration
 44 | import logging
 45 | logging.getLogger(__name__).addHandler(logging.NullHandler())
 46 | 
 47 | # Check for CUDA availability
 48 | import torch
 49 | CUDA_AVAILABLE = torch.cuda.is_available()
 50 | if not CUDA_AVAILABLE:
 51 |     logging.warning("CUDA is not available. LTM Transformer will run in CPU-only mode.")
 52 | 
 53 | # Import optional dependencies
 54 | try:
 55 |     import mpi4py
 56 |     MPI_AVAILABLE = True
 57 | except ImportError:
 58 |     MPI_AVAILABLE = False
 59 |     logging.info("mpi4py not found. Distributed training features will be limited.")
 60 | 
 61 | try:
 62 |     import horovod.torch as hvd
 63 |     HOROVOD_AVAILABLE = True
 64 | except ImportError:
 65 |     HOROVOD_AVAILABLE = False
 66 |     logging.info("Horovod not found. Some distributed training features will be disabled.")
 67 | 
 68 | try:
 69 |     import onnx
 70 |     import onnxruntime
 71 |     ONNX_AVAILABLE = True
 72 | except ImportError:
 73 |     ONNX_AVAILABLE = False
 74 |     logging.info("ONNX/ONNXRuntime not found. Quantization features will be limited.")
 75 | 
 76 | # Public API
 77 | __all__ = [
 78 |     # Models
 79 |     "TitanModel",
 80 |     "TitanConfig",
 81 |     "TitanForCausalLM",
 82 |     "TitanForSequenceClassification",
 83 |     "TitanForTokenClassification",
 84 |     "TitanPreTrainedModel",
 85 |     
 86 |     # Training
 87 |     "Trainer",
 88 |     "TrainingArguments",
 89 |     "DataCollator",
 90 |     "DistributedTrainer",
 91 |     "TrainerCallback",
 92 |     "EarlyStoppingCallback",
 93 |     "TensorBoardCallback",
 94 |     "WandBCallback",
 95 |     
 96 |     # Inference
 97 |     "InferenceEngine",
 98 |     "InferenceConfig",
 99 |     "BatchProcessor",
100 |     "CacheManager",
101 |     "QuantizedEngine",
102 |     "TensorParallelEngine",
103 | ]
104 | 
105 | def get_device():
106 |     """Get the default device (CUDA if available, else CPU)."""
107 |     return torch.device("cuda" if CUDA_AVAILABLE else "cpu")
108 | 
109 | def is_distributed_available():
110 |     """Check if distributed training is available."""
111 |     return MPI_AVAILABLE or HOROVOD_AVAILABLE
112 | 
113 | def is_quantization_available():
114 |     """Check if quantization features are available."""
115 |     return ONNX_AVAILABLE
116 | 
117 | def set_seed(seed: int):
118 |     """Set random seed for reproducibility."""
119 |     import random
120 |     import numpy as np
121 |     random.seed(seed)
122 |     np.random.seed(seed)
123 |     torch.manual_seed(seed)
124 |     if CUDA_AVAILABLE:
125 |         torch.cuda.manual_seed_all(seed)
126 | 
127 | def get_config_path():
128 |     """Get the path to the default configuration directory."""
129 |     import os
130 |     return os.path.join(os.path.dirname(__file__), "config")
131 | 
132 | def cite():
133 |     """Print citation information."""
134 |     print(
135 |         """
136 | If you use LTM Transformer in your research, please cite:
137 | 
138 | @article{allahyar2025ltm,
139 |     title={LTM Transformer: Long-term Memory Transformer with Titan-inspired Architecture},
140 |     author={Allahyar, Sahibzada},
141 |     journal={arXiv preprint arXiv:2025.xxxxx},
142 |     year={2025}
143 | }
144 |         """
145 |     )
146 | 


--------------------------------------------------------------------------------
/include/core/ops/mma_ops.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cuda_runtime.h>
  4 | #include <cuda_fp16.h>
  5 | #include "core/utils/tensor.cuh"
  6 | 
  7 | namespace ltm {
  8 | namespace ops {
  9 | 
 10 | /**
 11 |  * @brief Perform matrix multiplication C = alpha * (A @ B) + beta * C
 12 |  * 
 13 |  * Uses CUTLASS for optimized GEMM computation on tensor cores.
 14 |  * 
 15 |  * @tparam T Data type (float or half)
 16 |  * @param A Input matrix A
 17 |  * @param B Input matrix B
 18 |  * @param C Output matrix C
 19 |  * @param transpose_a Whether to transpose matrix A
 20 |  * @param transpose_b Whether to transpose matrix B
 21 |  * @param alpha Scaling factor for A @ B
 22 |  * @param beta Scaling factor for C
 23 |  * @param stream CUDA stream
 24 |  */
 25 | template<typename T>
 26 | void matmul(
 27 |     const Tensor<T>& A,
 28 |     const Tensor<T>& B,
 29 |     Tensor<T>& C,
 30 |     bool transpose_a = false,
 31 |     bool transpose_b = false,
 32 |     float alpha = 1.0f,
 33 |     float beta = 0.0f,
 34 |     cudaStream_t stream = nullptr
 35 | );
 36 | 
 37 | /**
 38 |  * @brief Fused matrix multiplication and GELU activation
 39 |  * 
 40 |  * Computes C = GELU(A @ B) in a single kernel for better performance.
 41 |  * 
 42 |  * @tparam T Data type (float or half)
 43 |  * @param A Input matrix A
 44 |  * @param B Input matrix B
 45 |  * @param C Output matrix C
 46 |  * @param transpose_a Whether to transpose matrix A
 47 |  * @param transpose_b Whether to transpose matrix B
 48 |  * @param stream CUDA stream
 49 |  */
 50 | template<typename T>
 51 | void mmaGelu(
 52 |     const Tensor<T>& A,
 53 |     const Tensor<T>& B,
 54 |     Tensor<T>& C,
 55 |     bool transpose_a = false,
 56 |     bool transpose_b = false,
 57 |     cudaStream_t stream = nullptr
 58 | );
 59 | 
 60 | /**
 61 |  * @brief Fused matrix multiplication and dropout
 62 |  * 
 63 |  * Computes C = Dropout(A @ B) in a single kernel for better performance.
 64 |  * 
 65 |  * @tparam T Data type (float or half)
 66 |  * @param A Input matrix A
 67 |  * @param B Input matrix B
 68 |  * @param C Output matrix C
 69 |  * @param dropout_prob Dropout probability
 70 |  * @param seed Random seed for dropout
 71 |  * @param transpose_a Whether to transpose matrix A
 72 |  * @param transpose_b Whether to transpose matrix B
 73 |  * @param stream CUDA stream
 74 |  */
 75 | template<typename T>
 76 | void mmaDropout(
 77 |     const Tensor<T>& A,
 78 |     const Tensor<T>& B,
 79 |     Tensor<T>& C,
 80 |     float dropout_prob,
 81 |     unsigned long long seed,
 82 |     bool transpose_a = false,
 83 |     bool transpose_b = false,
 84 |     cudaStream_t stream = nullptr
 85 | );
 86 | 
 87 | /**
 88 |  * @brief Configuration for MMA operations
 89 |  */
 90 | struct MMAConfig {
 91 |     // Thread block dimensions
 92 |     static constexpr int BLOCK_M = 128;
 93 |     static constexpr int BLOCK_N = 128;
 94 |     static constexpr int BLOCK_K = 32;
 95 |     
 96 |     // Warp dimensions
 97 |     static constexpr int WARP_M = 64;
 98 |     static constexpr int WARP_N = 64;
 99 |     static constexpr int WARP_K = 32;
100 |     
101 |     // Instruction dimensions
102 |     static constexpr int INST_M = 16;
103 |     static constexpr int INST_N = 8;
104 |     static constexpr int INST_K = 16;
105 |     
106 |     // Pipeline stages
107 |     static constexpr int NUM_STAGES = 3;
108 |     
109 |     // Shared memory configuration
110 |     static constexpr int SMEM_BYTES_PER_STAGE = 
111 |         (BLOCK_M * BLOCK_K + BLOCK_K * BLOCK_N) * sizeof(half);
112 |     static constexpr int SMEM_BYTES_TOTAL = 
113 |         SMEM_BYTES_PER_STAGE * NUM_STAGES;
114 |     
115 |     // Performance tuning
116 |     static constexpr bool USE_TENSOR_CORES = true;
117 |     static constexpr bool SPLIT_K_SERIAL = false;
118 |     static constexpr int MIN_BLOCKS_PER_SM = 1;
119 | };
120 | 
121 | /**
122 |  * @brief Get optimal grid dimensions for MMA operations
123 |  * 
124 |  * @param m Number of rows in output
125 |  * @param n Number of columns in output
126 |  * @return dim3 Grid dimensions
127 |  */
128 | inline dim3 getMMAGridDim(int m, int n) {
129 |     return dim3(
130 |         (m + MMAConfig::BLOCK_M - 1) / MMAConfig::BLOCK_M,
131 |         (n + MMAConfig::BLOCK_N - 1) / MMAConfig::BLOCK_N,
132 |         1
133 |     );
134 | }
135 | 
136 | /**
137 |  * @brief Get block dimensions for MMA operations
138 |  * 
139 |  * @return dim3 Block dimensions
140 |  */
141 | inline dim3 getMMABlockDim() {
142 |     return dim3(
143 |         MMAConfig::BLOCK_M / MMAConfig::INST_M * 
144 |         MMAConfig::BLOCK_N / MMAConfig::INST_N,
145 |         1,
146 |         1
147 |     );
148 | }
149 | 
150 | /**
151 |  * @brief Check if tensor cores can be used for given dimensions
152 |  * 
153 |  * @param m Number of rows
154 |  * @param n Number of columns
155 |  * @param k Inner dimension
156 |  * @return bool True if tensor cores can be used
157 |  */
158 | inline bool canUseTensorCores(int m, int n, int k) {
159 |     return (
160 |         m % MMAConfig::INST_M == 0 &&
161 |         n % MMAConfig::INST_N == 0 &&
162 |         k % MMAConfig::INST_K == 0
163 |     );
164 | }
165 | 
166 | } // namespace ops
167 | } // namespace ltm
168 | 


--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | # Tests configuration
  2 | enable_testing()
  3 | 
  4 | # Find GTest package
  5 | find_package(GTest REQUIRED)
  6 | 
  7 | # Test executables
  8 | add_executable(ltm_tests
  9 |     attention/flash_attention_test.cu
 10 |     attention/memory_attention_test.cu
 11 |     ltm/compression_gate_test.cu
 12 |     ltm/memory_bank_test.cu
 13 |     ops/fused_ops_test.cu
 14 |     ops/mma_ops_test.cu
 15 |     parallel/mpi_utils_test.cpp
 16 |     parallel/pipeline_test.cpp
 17 |     parallel/tensor_parallel_test.cpp
 18 |     quantization/calibrator_test.cu
 19 |     quantization/quantizer_test.cu
 20 |     transformer/titan_inspired_block_test.cu
 21 |     main_test.cpp
 22 | )
 23 | 
 24 | target_compile_features(ltm_tests PRIVATE cxx_std_17)
 25 | 
 26 | target_include_directories(ltm_tests PRIVATE
 27 |     ${CMAKE_SOURCE_DIR}/include
 28 |     ${GTEST_INCLUDE_DIRS}
 29 | )
 30 | 
 31 | target_link_libraries(ltm_tests PRIVATE
 32 |     ltm
 33 |     GTest::GTest
 34 |     GTest::Main
 35 |     ${CUDA_LIBRARIES}
 36 |     ${CUDA_CUBLAS_LIBRARIES}
 37 |     ${MPI_CXX_LIBRARIES}
 38 |     ${NCCL_LIBRARIES}
 39 |     ${CUTLASS_LIBRARIES}
 40 | )
 41 | 
 42 | # Set test properties
 43 | set_target_properties(ltm_tests PROPERTIES
 44 |     CUDA_SEPARABLE_COMPILATION ON
 45 |     POSITION_INDEPENDENT_CODE ON
 46 | )
 47 | 
 48 | # Register tests with CTest
 49 | include(GoogleTest)
 50 | gtest_discover_tests(ltm_tests)
 51 | 
 52 | # Performance tests
 53 | add_executable(ltm_perf_tests
 54 |     performance/attention_perf_test.cu
 55 |     performance/memory_perf_test.cu
 56 |     performance/transformer_perf_test.cu
 57 |     performance/main_perf_test.cpp
 58 | )
 59 | 
 60 | target_compile_features(ltm_perf_tests PRIVATE cxx_std_17)
 61 | 
 62 | target_include_directories(ltm_perf_tests PRIVATE
 63 |     ${CMAKE_SOURCE_DIR}/include
 64 |     ${GTEST_INCLUDE_DIRS}
 65 | )
 66 | 
 67 | target_link_libraries(ltm_perf_tests PRIVATE
 68 |     ltm
 69 |     GTest::GTest
 70 |     GTest::Main
 71 |     ${CUDA_LIBRARIES}
 72 |     ${CUDA_CUBLAS_LIBRARIES}
 73 |     ${MPI_CXX_LIBRARIES}
 74 |     ${NCCL_LIBRARIES}
 75 |     ${CUTLASS_LIBRARIES}
 76 | )
 77 | 
 78 | set_target_properties(ltm_perf_tests PROPERTIES
 79 |     CUDA_SEPARABLE_COMPILATION ON
 80 |     POSITION_INDEPENDENT_CODE ON
 81 | )
 82 | 
 83 | # Custom targets for running specific test groups
 84 | add_custom_target(run_unit_tests
 85 |     COMMAND ltm_tests
 86 |     DEPENDS ltm_tests
 87 |     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
 88 | )
 89 | 
 90 | add_custom_target(run_perf_tests
 91 |     COMMAND ltm_perf_tests
 92 |     DEPENDS ltm_perf_tests
 93 |     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
 94 | )
 95 | 
 96 | # Test data directory
 97 | set(TEST_DATA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/data)
 98 | file(MAKE_DIRECTORY ${TEST_DATA_DIR})
 99 | 
100 | # Copy test data to build directory
101 | add_custom_command(TARGET ltm_tests POST_BUILD
102 |     COMMAND ${CMAKE_COMMAND} -E copy_directory
103 |         ${TEST_DATA_DIR}
104 |         ${CMAKE_CURRENT_BINARY_DIR}/data
105 | )
106 | 
107 | # Test configuration
108 | configure_file(
109 |     ${CMAKE_CURRENT_SOURCE_DIR}/test_config.h.in
110 |     ${CMAKE_CURRENT_BINARY_DIR}/test_config.h
111 | )
112 | 
113 | target_include_directories(ltm_tests PRIVATE
114 |     ${CMAKE_CURRENT_BINARY_DIR}
115 | )
116 | 
117 | target_include_directories(ltm_perf_tests PRIVATE
118 |     ${CMAKE_CURRENT_BINARY_DIR}
119 | )
120 | 
121 | # Coverage target (if enabled)
122 | if(CMAKE_BUILD_TYPE STREQUAL "Debug")
123 |     if(CMAKE_COMPILER_IS_GNUCXX)
124 |         include(CodeCoverage)
125 |         append_coverage_compiler_flags()
126 |         setup_target_for_coverage_gcovr_html(
127 |             NAME coverage
128 |             EXECUTABLE ltm_tests
129 |             DEPENDENCIES ltm_tests
130 |         )
131 |     endif()
132 | endif()
133 | 
134 | # Memory check target
135 | find_program(VALGRIND "valgrind")
136 | if(VALGRIND)
137 |     add_custom_target(memcheck
138 |         COMMAND ${VALGRIND} --tool=memcheck --leak-check=full --show-reachable=yes
139 |                 --num-callers=20 --track-origins=yes
140 |                 $<TARGET_FILE:ltm_tests>
141 |         DEPENDS ltm_tests
142 |     )
143 | endif()
144 | 
145 | # Sanitizer builds
146 | if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
147 |     # Address sanitizer
148 |     add_executable(ltm_tests_asan ${TEST_SOURCES})
149 |     target_compile_options(ltm_tests_asan PRIVATE -fsanitize=address -fno-omit-frame-pointer)
150 |     target_link_options(ltm_tests_asan PRIVATE -fsanitize=address)
151 |     target_link_libraries(ltm_tests_asan PRIVATE ltm GTest::GTest GTest::Main)
152 |     
153 |     # Thread sanitizer
154 |     add_executable(ltm_tests_tsan ${TEST_SOURCES})
155 |     target_compile_options(ltm_tests_tsan PRIVATE -fsanitize=thread)
156 |     target_link_options(ltm_tests_tsan PRIVATE -fsanitize=thread)
157 |     target_link_libraries(ltm_tests_tsan PRIVATE ltm GTest::GTest GTest::Main)
158 |     
159 |     # UB sanitizer
160 |     add_executable(ltm_tests_ubsan ${TEST_SOURCES})
161 |     target_compile_options(ltm_tests_ubsan PRIVATE -fsanitize=undefined)
162 |     target_link_options(ltm_tests_ubsan PRIVATE -fsanitize=undefined)
163 |     target_link_libraries(ltm_tests_ubsan PRIVATE ltm GTest::GTest GTest::Main)
164 | endif()
165 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | # Contributing to LTM Transformer
  2 | 
  3 | Thank you for your interest in contributing to LTM Transformer! This document provides guidelines and instructions for contributing to the project.
  4 | 
  5 | ## Table of Contents
  6 | 
  7 | - [Code of Conduct](#code-of-conduct)
  8 | - [Getting Started](#getting-started)
  9 | - [Development Setup](#development-setup)
 10 | - [Making Changes](#making-changes)
 11 | - [Testing](#testing)
 12 | - [Pull Request Process](#pull-request-process)
 13 | - [Style Guide](#style-guide)
 14 | - [Documentation](#documentation)
 15 | 
 16 | ## Code of Conduct
 17 | 
 18 | This project and everyone participating in it is governed by our Code of Conduct. By participating, you are expected to uphold this code. Please report unacceptable behavior to allahyar@singularityresearch.org.
 19 | 
 20 | ## Getting Started
 21 | 
 22 | 1. Fork the repository on GitHub
 23 | 2. Clone your fork locally:
 24 |    ```bash
 25 |    git clone https://github.com/YOUR_USERNAME/ltm-transformer.git
 26 |    cd ltm-transformer
 27 |    ```
 28 | 3. Add the upstream repository:
 29 |    ```bash
 30 |    git remote add upstream https://github.com/singularityresearch/ltm-transformer.git
 31 |    ```
 32 | 4. Create a new branch for your changes:
 33 |    ```bash
 34 |    git checkout -b feature/your-feature-name
 35 |    ```
 36 | 
 37 | ## Development Setup
 38 | 
 39 | ### Prerequisites
 40 | 
 41 | - CMake (>= 3.15)
 42 | - CUDA Toolkit (>= 11.0)
 43 | - C++ Compiler with C++17 support
 44 | - Python (>= 3.7)
 45 | - PyTorch (>= 1.9.0)
 46 | 
 47 | ### Building from Source
 48 | 
 49 | 1. Install dependencies:
 50 |    ```bash
 51 |    # Install Python dependencies
 52 |    pip install -r requirements-dev.txt
 53 |    
 54 |    # Install system dependencies (Ubuntu/Debian)
 55 |    sudo apt-get install build-essential cmake cuda-toolkit-11-0
 56 |    ```
 57 | 
 58 | 2. Build the project:
 59 |    ```bash
 60 |    mkdir build && cd build
 61 |    cmake ..
 62 |    make -j$(nproc)
 63 |    ```
 64 | 
 65 | 3. Run tests:
 66 |    ```bash
 67 |    ctest --output-on-failure
 68 |    ```
 69 | 
 70 | ## Making Changes
 71 | 
 72 | 1. Make sure your changes are made on a new branch based on the latest main:
 73 |    ```bash
 74 |    git checkout main
 75 |    git pull upstream main
 76 |    git checkout -b feature/your-feature-name
 77 |    ```
 78 | 
 79 | 2. Make your changes, following our [Style Guide](#style-guide)
 80 | 
 81 | 3. Write or update tests as needed
 82 | 
 83 | 4. Run the test suite to ensure everything works
 84 | 
 85 | 5. Commit your changes:
 86 |    ```bash
 87 |    git add .
 88 |    git commit -m "feat: description of your changes"
 89 |    ```
 90 |    Please follow [Conventional Commits](https://www.conventionalcommits.org/) for commit messages.
 91 | 
 92 | ## Testing
 93 | 
 94 | - Write unit tests for new functionality
 95 | - Update existing tests when modifying code
 96 | - Ensure all tests pass before submitting a PR
 97 | - Include performance benchmarks for performance-critical code
 98 | 
 99 | ### Running Tests
100 | 
101 | ```bash
102 | # Run all tests
103 | ctest --output-on-failure
104 | 
105 | # Run specific test suite
106 | ./tests/ltm_tests
107 | 
108 | # Run with sanitizers
109 | ./tests/ltm_tests_asan
110 | ./tests/ltm_tests_tsan
111 | ./tests/ltm_tests_ubsan
112 | 
113 | # Run performance benchmarks
114 | ./tests/ltm_perf_tests
115 | ```
116 | 
117 | ## Pull Request Process
118 | 
119 | 1. Update the README.md with details of changes if needed
120 | 2. Update the documentation with any new features or APIs
121 | 3. Ensure all tests pass and CI checks are green
122 | 4. Get at least one code review from a maintainer
123 | 5. Once approved, a maintainer will merge your PR
124 | 
125 | ### PR Title Format
126 | 
127 | Follow the Conventional Commits specification:
128 | - feat: New feature
129 | - fix: Bug fix
130 | - docs: Documentation changes
131 | - style: Code style changes (formatting, etc)
132 | - refactor: Code refactoring
133 | - perf: Performance improvements
134 | - test: Adding or updating tests
135 | - chore: Maintenance tasks
136 | 
137 | ## Style Guide
138 | 
139 | ### C++
140 | 
141 | - Follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html)
142 | - Use clang-format with the provided .clang-format file
143 | - Use meaningful variable and function names
144 | - Document public APIs using Doxygen-style comments
145 | - Keep functions focused and reasonably sized
146 | - Use const correctness
147 | - Handle errors appropriately
148 | - Use smart pointers for memory management
149 | 
150 | ### CUDA
151 | 
152 | - Follow CUDA best practices for performance
153 | - Use appropriate memory access patterns
154 | - Handle errors using CUDA_CHECK macro
155 | - Document kernel launch parameters
156 | - Consider occupancy when setting block sizes
157 | - Profile kernels to ensure optimal performance
158 | 
159 | ### Python
160 | 
161 | - Follow PEP 8 style guide
162 | - Use type hints
163 | - Document functions and classes using docstrings
164 | - Use black for code formatting
165 | - Use isort for import sorting
166 | - Use pylint for linting
167 | 
168 | ## Documentation
169 | 
170 | - Document all public APIs
171 | - Keep documentation up to date with code changes
172 | - Include examples for complex features
173 | - Document performance characteristics
174 | - Update architecture docs for significant changes
175 | 
176 | ### Documentation Structure
177 | 
178 | - API Reference: Detailed documentation of all public APIs
179 | - Tutorials: Step-by-step guides for common tasks
180 | - Examples: Sample code demonstrating features
181 | - Architecture: High-level design documentation
182 | - Performance: Benchmarks and optimization guidelines
183 | 
184 | ## Questions or Problems?
185 | 
186 | Feel free to:
187 | - Open an issue for bugs or feature requests
188 | - Join our discussions for general questions
189 | - Contact maintainers for security issues
190 | 
191 | Thank you for contributing to LTM Transformer!
192 | 


--------------------------------------------------------------------------------
/include/core/ltm/compression_gate.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cuda_runtime.h>
  4 | #include <vector>
  5 | #include <memory>
  6 | #include "core/utils/tensor.cuh"
  7 | #include "core/utils/cuda_utils.cuh"
  8 | 
  9 | namespace ltm {
 10 | namespace memory {
 11 | 
 12 | /**
 13 |  * @brief Configuration for compression gate
 14 |  */
 15 | struct CompressionConfig {
 16 |     int input_dim = 768;              // Input dimension
 17 |     int compressed_dim = 64;          // Compressed dimension
 18 |     float compression_ratio = 0.25f;  // Target compression ratio
 19 |     bool use_attention = true;        // Use attention for compression
 20 |     bool learn_compression = true;    // Learn compression parameters
 21 |     bool use_residual = true;        // Use residual connection
 22 |     float dropout_prob = 0.1f;        // Dropout probability
 23 |     bool use_layer_norm = true;      // Apply layer normalization
 24 |     int num_heads = 1;               // Number of attention heads for compression
 25 |     bool use_gating = true;          // Use gating mechanism
 26 | };
 27 | 
 28 | /**
 29 |  * @brief Compression gate for reducing input dimension
 30 |  * 
 31 |  * Implements a trainable compression mechanism that reduces the dimensionality
 32 |  * of input states while preserving important information. Uses attention and
 33 |  * gating mechanisms to selectively compress information.
 34 |  * 
 35 |  * @tparam T Data type (float or half)
 36 |  */
 37 | template<typename T>
 38 | class CompressionGate {
 39 | public:
 40 |     /**
 41 |      * @brief Create compression gate
 42 |      * 
 43 |      * @param config Compression configuration
 44 |      */
 45 |     explicit CompressionGate(const CompressionConfig& config);
 46 | 
 47 |     /**
 48 |      * @brief Initialize parameters
 49 |      * 
 50 |      * @param stream CUDA stream
 51 |      */
 52 |     void initialize(cudaStream_t stream = nullptr);
 53 | 
 54 |     /**
 55 |      * @brief Forward pass
 56 |      * 
 57 |      * @param input Input tensor [batch_size, seq_len, input_dim]
 58 |      * @param output Output tensor [batch_size, seq_len, compressed_dim]
 59 |      * @param attention_mask Optional attention mask
 60 |      * @param stream CUDA stream
 61 |      */
 62 |     void forward(
 63 |         const Tensor<T>& input,
 64 |         Tensor<T>& output,
 65 |         const Tensor<T>* attention_mask = nullptr,
 66 |         cudaStream_t stream = nullptr
 67 |     );
 68 | 
 69 |     /**
 70 |      * @brief Backward pass
 71 |      * 
 72 |      * @param grad_output Gradient w.r.t. output [batch_size, seq_len, compressed_dim]
 73 |      * @param grad_input Gradient w.r.t. input [batch_size, seq_len, input_dim]
 74 |      * @param stream CUDA stream
 75 |      */
 76 |     void backward(
 77 |         const Tensor<T>& grad_output,
 78 |         Tensor<T>& grad_input,
 79 |         cudaStream_t stream = nullptr
 80 |     );
 81 | 
 82 |     /**
 83 |      * @brief Update parameters
 84 |      * 
 85 |      * @param learning_rate Learning rate
 86 |      * @param stream CUDA stream
 87 |      */
 88 |     void updateParameters(float learning_rate, cudaStream_t stream = nullptr);
 89 | 
 90 |     /**
 91 |      * @brief Get compression configuration
 92 |      * 
 93 |      * @return const CompressionConfig& Configuration
 94 |      */
 95 |     const CompressionConfig& getConfig() const { return config_; }
 96 | 
 97 |     /**
 98 |      * @brief Set compression configuration
 99 |      * 
100 |      * @param config New configuration
101 |      */
102 |     void setConfig(const CompressionConfig& config) { config_ = config; }
103 | 
104 |     /**
105 |      * @brief Get compression statistics
106 |      * 
107 |      * @return std::vector<float> Compression statistics
108 |      */
109 |     std::vector<float> getStats() const;
110 | 
111 |     /**
112 |      * @brief Save compression gate parameters
113 |      * 
114 |      * @param path Path to save parameters
115 |      */
116 |     void save(const std::string& path) const;
117 | 
118 |     /**
119 |      * @brief Load compression gate parameters
120 |      * 
121 |      * @param path Path to load parameters from
122 |      */
123 |     void load(const std::string& path);
124 | 
125 | private:
126 |     // Model parameters
127 |     Tensor<T> query_weight_;     // [input_dim, compressed_dim]
128 |     Tensor<T> key_weight_;       // [input_dim, compressed_dim]
129 |     Tensor<T> value_weight_;     // [input_dim, compressed_dim]
130 |     Tensor<T> gate_weight_;      // [input_dim, compressed_dim]
131 |     Tensor<T> output_weight_;    // [compressed_dim, compressed_dim]
132 |     
133 |     // Layer normalization parameters
134 |     Tensor<T> layer_norm_weight_;  // [input_dim]
135 |     Tensor<T> layer_norm_bias_;    // [input_dim]
136 |     
137 |     // Parameter gradients
138 |     Tensor<T> query_weight_grad_;
139 |     Tensor<T> key_weight_grad_;
140 |     Tensor<T> value_weight_grad_;
141 |     Tensor<T> gate_weight_grad_;
142 |     Tensor<T> output_weight_grad_;
143 |     Tensor<T> layer_norm_weight_grad_;
144 |     Tensor<T> layer_norm_bias_grad_;
145 |     
146 |     // Intermediate buffers
147 |     Tensor<T> attention_scores_;
148 |     Tensor<T> gate_scores_;
149 |     Tensor<T> normalized_input_;
150 |     Tensor<T> compressed_state_;
151 |     
152 |     // Configuration
153 |     CompressionConfig config_;
154 |     
155 |     // Helper functions
156 |     void initializeParameters(cudaStream_t stream);
157 |     
158 |     void computeAttention(
159 |         const Tensor<T>& input,
160 |         const Tensor<T>* mask,
161 |         cudaStream_t stream
162 |     );
163 |     
164 |     void computeGating(
165 |         const Tensor<T>& input,
166 |         cudaStream_t stream
167 |     );
168 |     
169 |     void layerNorm(
170 |         const Tensor<T>& input,
171 |         Tensor<T>& output,
172 |         cudaStream_t stream
173 |     );
174 | };
175 | 
176 | // Explicit instantiations
177 | extern template class CompressionGate<float>;
178 | extern template class CompressionGate<half>;
179 | 
180 | } // namespace memory
181 | } // namespace ltm
182 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Obsidian Memory Transformer
  2 | 
  3 | 
  4 | A novel LLM architecture written in highly optimized low-level C++/CUDA with a new Long-Term Memory (LTM) mechanism for large context windows. This is a high-performance implementation of a Transformer model with long-term memory capabilities, inspired by Google's Titan architecture. This project provides efficient CUDA implementations of FlashAttention and memory-augmented Transformer blocks, along with Python bindings for easy integration.
  5 | 
  6 | ## Features
  7 | 
  8 | - **Long-term Memory**: Novel memory mechanism for handling extended context windows efficiently
  9 | - **FlashAttention**: Memory-efficient attention implementation with minimal memory access
 10 | - **High Performance**:
 11 |   - Optimized CUDA kernels
 12 |   - Mixed precision training (FP16/BF16)
 13 |   - Quantization support (INT8/INT4)
 14 |   - Fused operations for better throughput
 15 | - **Distributed Training**:
 16 |   - Data parallelism
 17 |   - Tensor parallelism
 18 |   - Pipeline parallelism
 19 |   - Multi-node support via MPI
 20 | - **Python Integration**:
 21 |   - HuggingFace-compatible interface
 22 |   - Easy-to-use training API
 23 |   - Efficient inference engine
 24 | 
 25 | ## Installation
 26 | 
 27 | ### Prerequisites
 28 | 
 29 | - CUDA Toolkit (>= 11.0)
 30 | - CMake (>= 3.15)
 31 | - C++17 compatible compiler
 32 | - Python (>= 3.7)
 33 | - PyTorch (>= 1.9.0)
 34 | 
 35 | ### Installing from PyPI
 36 | 
 37 | ```bash
 38 | pip install ltm-transformer
 39 | ```
 40 | 
 41 | ### Building from Source
 42 | 
 43 | 1. Clone the repository:
 44 | ```bash
 45 | git clone https://github.com/singularityresearch/ltm-transformer.git
 46 | cd ltm-transformer
 47 | ```
 48 | 
 49 | 2. Install Python dependencies:
 50 | ```bash
 51 | pip install -r requirements.txt
 52 | ```
 53 | 
 54 | 3. Build and install:
 55 | ```bash
 56 | mkdir build && cd build
 57 | cmake ..
 58 | make -j$(nproc)
 59 | make install
 60 | ```
 61 | 
 62 | ## Quick Start
 63 | 
 64 | ### Python
 65 | 
 66 | ```python
 67 | from ltm import TitanModel, TitanConfig, InferenceEngine
 68 | 
 69 | # Initialize model
 70 | config = TitanConfig(
 71 |     hidden_size=768,
 72 |     num_attention_heads=12,
 73 |     memory_slots=512,
 74 |     use_flash_attention=True
 75 | )
 76 | model = TitanModel(config)
 77 | 
 78 | # Training
 79 | from ltm import Trainer, TrainingArguments
 80 | 
 81 | trainer = Trainer(
 82 |     model=model,
 83 |     args=TrainingArguments(
 84 |         output_dir="./outputs",
 85 |         learning_rate=5e-5,
 86 |         per_device_train_batch_size=8,
 87 |         gradient_accumulation_steps=4
 88 |     ),
 89 |     train_dataset=dataset
 90 | )
 91 | trainer.train()
 92 | 
 93 | # Inference
 94 | engine = InferenceEngine(
 95 |     model=model,
 96 |     config=InferenceConfig(
 97 |         use_flash_attention=True,
 98 |         use_memory_cache=True,
 99 |         max_sequence_length=2048
100 |     )
101 | )
102 | 
103 | output = engine.generate(
104 |     input_ids=tokenizer.encode("Hello, how are"),
105 |     max_new_tokens=50
106 | )
107 | ```
108 | 
109 | ### C++
110 | 
111 | ```cpp
112 | #include "ltm/transformer/titan_inspired_block.cuh"
113 | 
114 | // Configure model
115 | ltm::transformer::TitanBlockConfig config;
116 | config.hidden_dim = 768;
117 | config.num_heads = 12;
118 | config.memory_slots = 512;
119 | config.use_flash_attention = true;
120 | 
121 | // Create model
122 | auto model = std::make_unique<ltm::transformer::TitanBlock<float>>(config);
123 | 
124 | // Run inference
125 | torch::Tensor input = /* ... */;
126 | auto output = model->forward(input);
127 | ```
128 | 
129 | ## Architecture
130 | 
131 | The LTM Transformer extends the standard Transformer architecture with:
132 | 
133 | 1. **Memory Bank**: A trainable matrix storing compressed representations of past context
134 | 2. **Compression Gate**: Mechanism for compressing and storing relevant information
135 | 3. **Memory Attention**: Efficient attention between current context and memory bank
136 | 4. **FlashAttention**: Memory-efficient attention implementation
137 | 
138 | For detailed architecture information, see [docs/design/architecture.md](docs/design/architecture.md).
139 | 
140 | ## Performance
141 | 
142 | ### Memory Usage
143 | 
144 | | Context Length | Standard Transformer | LTM Transformer |
145 | |---------------|---------------------|-----------------|
146 | | 2K tokens     | 4 GB                | 2 GB           |
147 | | 8K tokens     | 64 GB               | 4 GB           |
148 | | 32K tokens    | 1024 GB             | 8 GB           |
149 | 
150 | ### Training Speed
151 | 
152 | - 1.5x faster training compared to standard Transformers
153 | - 4x reduction in memory bandwidth usage
154 | - Linear scaling up to 64 GPUs
155 | 
156 | For detailed benchmarks, see [docs/performance/optimization.md](docs/performance/optimization.md).
157 | 
158 | ## Contributing
159 | 
160 | We welcome contributions! Please see our [Contributing Guidelines](CONTRIBUTING.md) for details.
161 | 
162 | ### Development Setup
163 | 
164 | 1. Install development dependencies:
165 | ```bash
166 | pip install -r requirements-dev.txt
167 | ```
168 | 
169 | 2. Build with testing enabled:
170 | ```bash
171 | mkdir build && cd build
172 | cmake -DBUILD_TESTING=ON ..
173 | make -j$(nproc)
174 | ```
175 | 
176 | 3. Run tests:
177 | ```bash
178 | ctest --output-on-failure
179 | ```
180 | 
181 | ## Citation
182 | 
183 | If you use this work in your research, please cite:
184 | 
185 | ```bibtex
186 | @article{allahyar2025ltm,
187 |     title={LTM Transformer: Long-term Memory Transformer with Titan-inspired Architecture},
188 |     author={Allahyar, Sahibzada},
189 |     journal= https://github.com/Sahibzada-A/Obsidian-Memory-Transformer,
190 |     year={2025}
191 | }
192 | ```
193 | 
194 | ## License
195 | 
196 | This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.
197 | 
198 | ## Acknowledgments
199 | 
200 | - Google's Titan architecture for inspiration
201 | - FlashAttention paper for efficient attention implementation
202 | - HuggingFace team for transformer implementations
203 | - NVIDIA for CUDA optimization guidelines
204 | 
205 | ## Contact
206 | 
207 | - Sahibzada A - sahibzada@singularityresearchlabs.com
208 | - Project Link: https://github.com/Sahibzada-A/Obsidian-Memory-Transformer
209 | 


--------------------------------------------------------------------------------
/include/core/ltm/memory_bank.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cuda_runtime.h>
  4 | #include <vector>
  5 | #include <memory>
  6 | #include "core/utils/tensor.cuh"
  7 | #include "core/utils/cuda_utils.cuh"
  8 | 
  9 | namespace ltm {
 10 | namespace memory {
 11 | 
 12 | /**
 13 |  * @brief Configuration for memory bank
 14 |  */
 15 | struct MemoryBankConfig {
 16 |     int num_slots = 512;              // Number of memory slots
 17 |     int slot_dim = 64;                // Dimension of each memory slot
 18 |     float update_rate = 0.9f;         // Memory update rate (alpha)
 19 |     int update_interval = 8;          // Steps between memory updates
 20 |     bool use_attention_scores = true; // Use attention scores for updates
 21 |     float prune_threshold = 0.1f;     // Threshold for pruning unused slots
 22 |     bool use_dynamic_slots = false;   // Dynamically adjust number of slots
 23 |     int min_slots = 128;              // Minimum number of slots if dynamic
 24 |     int max_slots = 1024;             // Maximum number of slots if dynamic
 25 | };
 26 | 
 27 | /**
 28 |  * @brief Memory bank for storing compressed context representations
 29 |  * 
 30 |  * Implements a trainable memory bank that stores compressed states from
 31 |  * previous segments, providing the model with long-term recall capability.
 32 |  * 
 33 |  * @tparam T Data type (float or half)
 34 |  */
 35 | template<typename T>
 36 | class MemoryBank {
 37 | public:
 38 |     /**
 39 |      * @brief Create memory bank
 40 |      * 
 41 |      * @param batch_size Batch size
 42 |      * @param num_heads Number of attention heads
 43 |      * @param num_slots Number of memory slots
 44 |      * @param slot_dim Dimension of each slot
 45 |      * @param update_interval Steps between memory updates
 46 |      */
 47 |     MemoryBank(
 48 |         int batch_size,
 49 |         int num_heads,
 50 |         int num_slots,
 51 |         int slot_dim,
 52 |         int update_interval
 53 |     );
 54 | 
 55 |     /**
 56 |      * @brief Initialize memory bank
 57 |      * 
 58 |      * @param stream CUDA stream
 59 |      */
 60 |     void initialize(cudaStream_t stream = nullptr);
 61 | 
 62 |     /**
 63 |      * @brief Reset memory bank to initial state
 64 |      * 
 65 |      * @param stream CUDA stream
 66 |      */
 67 |     void reset(cudaStream_t stream = nullptr);
 68 | 
 69 |     /**
 70 |      * @brief Store compressed state in memory bank
 71 |      * 
 72 |      * @param state Compressed state to store [batch_size, seq_len, hidden_dim]
 73 |      * @param attention_scores Optional attention scores for update weighting
 74 |      * @param stream CUDA stream
 75 |      */
 76 |     void store(
 77 |         const Tensor<T>& state,
 78 |         const Tensor<T>* attention_scores = nullptr,
 79 |         cudaStream_t stream = nullptr
 80 |     );
 81 | 
 82 |     /**
 83 |      * @brief Retrieve relevant memory slots
 84 |      * 
 85 |      * @param query Query tensor [batch_size, seq_len, hidden_dim]
 86 |      * @param output Output tensor [batch_size, num_slots, hidden_dim]
 87 |      * @param stream CUDA stream
 88 |      */
 89 |     void retrieve(
 90 |         const Tensor<T>& query,
 91 |         Tensor<T>& output,
 92 |         cudaStream_t stream = nullptr
 93 |     );
 94 | 
 95 |     /**
 96 |      * @brief Update memory slots
 97 |      * 
 98 |      * @param new_values New values for memory slots
 99 |      * @param indices Indices of slots to update
100 |      * @param stream CUDA stream
101 |      */
102 |     void update(
103 |         const Tensor<T>& new_values,
104 |         const Tensor<int>& indices,
105 |         cudaStream_t stream = nullptr
106 |     );
107 | 
108 |     /**
109 |      * @brief Prune unused memory slots
110 |      * 
111 |      * @param usage_threshold Usage threshold for pruning
112 |      * @param stream CUDA stream
113 |      */
114 |     void prune(
115 |         float usage_threshold,
116 |         cudaStream_t stream = nullptr
117 |     );
118 | 
119 |     /**
120 |      * @brief Get current memory bank state
121 |      * 
122 |      * @return const Tensor<T>& Memory bank tensor
123 |      */
124 |     const Tensor<T>& getMemoryBank() const { return memory_bank_; }
125 | 
126 |     /**
127 |      * @brief Get memory bank configuration
128 |      * 
129 |      * @return const MemoryBankConfig& Configuration
130 |      */
131 |     const MemoryBankConfig& getConfig() const { return config_; }
132 | 
133 |     /**
134 |      * @brief Set memory bank configuration
135 |      * 
136 |      * @param config New configuration
137 |      */
138 |     void setConfig(const MemoryBankConfig& config) { config_ = config; }
139 | 
140 |     /**
141 |      * @brief Get memory usage statistics
142 |      * 
143 |      * @return std::vector<float> Usage statistics for each slot
144 |      */
145 |     std::vector<float> getUsageStats() const;
146 | 
147 |     /**
148 |      * @brief Save memory bank state
149 |      * 
150 |      * @param path Path to save state
151 |      */
152 |     void save(const std::string& path) const;
153 | 
154 |     /**
155 |      * @brief Load memory bank state
156 |      * 
157 |      * @param path Path to load state from
158 |      */
159 |     void load(const std::string& path);
160 | 
161 | private:
162 |     // Memory bank state
163 |     Tensor<T> memory_bank_;      // [batch_size, num_slots, slot_dim]
164 |     Tensor<T> usage_counts_;     // [batch_size, num_slots]
165 |     Tensor<int> age_;           // [batch_size, num_slots]
166 |     
167 |     // Memory update state
168 |     int update_counter_ = 0;
169 |     Tensor<T> temp_storage_;
170 |     
171 |     // Configuration
172 |     MemoryBankConfig config_;
173 |     int batch_size_;
174 |     int num_heads_;
175 |     int num_slots_;
176 |     int slot_dim_;
177 |     int update_interval_;
178 |     
179 |     // Helper functions
180 |     void updateUsageCounts(
181 |         const Tensor<T>& attention_scores,
182 |         cudaStream_t stream
183 |     );
184 |     
185 |     void updateAges(cudaStream_t stream);
186 |     
187 |     void findUnusedSlots(
188 |         Tensor<int>& unused_indices,
189 |         int& num_unused,
190 |         cudaStream_t stream
191 |     );
192 |     
193 |     void compactMemory(cudaStream_t stream);
194 | };
195 | 
196 | // Explicit instantiations
197 | extern template class MemoryBank<float>;
198 | extern template class MemoryBank<half>;
199 | 
200 | } // namespace memory
201 | } // namespace ltm
202 | 


--------------------------------------------------------------------------------
/include/core/ops/fused_ops.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cuda_runtime.h>
  4 | #include <cuda_fp16.h>
  5 | #include <curand_kernel.h>
  6 | #include "core/utils/tensor.cuh"
  7 | 
  8 | namespace ltm {
  9 | namespace ops {
 10 | 
 11 | /**
 12 |  * @brief Fused layer normalization and residual connection
 13 |  * 
 14 |  * Computes: output = LayerNorm(input + residual) * gamma + beta
 15 |  * Fuses the residual connection and layer normalization for better performance.
 16 |  * 
 17 |  * @tparam T Data type (float or half)
 18 |  * @param input Input tensor [batch_size, hidden_dim]
 19 |  * @param residual Residual tensor [batch_size, hidden_dim]
 20 |  * @param gamma Scale parameter [hidden_dim]
 21 |  * @param beta Bias parameter [hidden_dim]
 22 |  * @param output Output tensor [batch_size, hidden_dim]
 23 |  * @param stream CUDA stream
 24 |  */
 25 | template<typename T>
 26 | void layerNormResidual(
 27 |     const Tensor<T>& input,
 28 |     const Tensor<T>& residual,
 29 |     const Tensor<T>& gamma,
 30 |     const Tensor<T>& beta,
 31 |     Tensor<T>& output,
 32 |     cudaStream_t stream = nullptr
 33 | );
 34 | 
 35 | /**
 36 |  * @brief Fused dropout and residual connection
 37 |  * 
 38 |  * Computes: output = dropout(input) + residual
 39 |  * Fuses dropout and residual connection for better performance.
 40 |  * 
 41 |  * @tparam T Data type (float or half)
 42 |  * @param input Input tensor
 43 |  * @param residual Residual tensor
 44 |  * @param output Output tensor
 45 |  * @param dropout_prob Dropout probability
 46 |  * @param seed Random seed for dropout
 47 |  * @param stream CUDA stream
 48 |  */
 49 | template<typename T>
 50 | void dropoutResidual(
 51 |     const Tensor<T>& input,
 52 |     const Tensor<T>& residual,
 53 |     Tensor<T>& output,
 54 |     float dropout_prob,
 55 |     unsigned long long seed,
 56 |     cudaStream_t stream = nullptr
 57 | );
 58 | 
 59 | /**
 60 |  * @brief Fused bias addition and GELU activation
 61 |  * 
 62 |  * Computes: output = GELU(input + bias)
 63 |  * Fuses bias addition and GELU activation for better performance.
 64 |  * 
 65 |  * @tparam T Data type (float or half)
 66 |  * @param input Input tensor [batch_size, hidden_dim]
 67 |  * @param bias Bias tensor [hidden_dim]
 68 |  * @param output Output tensor [batch_size, hidden_dim]
 69 |  * @param stream CUDA stream
 70 |  */
 71 | template<typename T>
 72 | void biasGeluFused(
 73 |     const Tensor<T>& input,
 74 |     const Tensor<T>& bias,
 75 |     Tensor<T>& output,
 76 |     cudaStream_t stream = nullptr
 77 | );
 78 | 
 79 | /**
 80 |  * @brief Fused bias addition and ReLU activation
 81 |  * 
 82 |  * Computes: output = ReLU(input + bias)
 83 |  * Fuses bias addition and ReLU activation for better performance.
 84 |  * 
 85 |  * @tparam T Data type (float or half)
 86 |  * @param input Input tensor [batch_size, hidden_dim]
 87 |  * @param bias Bias tensor [hidden_dim]
 88 |  * @param output Output tensor [batch_size, hidden_dim]
 89 |  * @param stream CUDA stream
 90 |  */
 91 | template<typename T>
 92 | void biasReluFused(
 93 |     const Tensor<T>& input,
 94 |     const Tensor<T>& bias,
 95 |     Tensor<T>& output,
 96 |     cudaStream_t stream = nullptr
 97 | );
 98 | 
 99 | /**
100 |  * @brief Tensor addition with scaling factors
101 |  * 
102 |  * Computes: output = alpha * input1 + beta * input2
103 |  * 
104 |  * @tparam T Data type (float or half)
105 |  * @param input1 First input tensor
106 |  * @param input2 Second input tensor
107 |  * @param output Output tensor
108 |  * @param alpha Scaling factor for input1
109 |  * @param beta Scaling factor for input2
110 |  * @param stream CUDA stream
111 |  */
112 | template<typename T>
113 | void tensorAdd(
114 |     const Tensor<T>& input1,
115 |     const Tensor<T>& input2,
116 |     Tensor<T>& output,
117 |     float alpha = 1.0f,
118 |     float beta = 1.0f,
119 |     cudaStream_t stream = nullptr
120 | );
121 | 
122 | /**
123 |  * @brief Element-wise multiplication
124 |  * 
125 |  * Computes: output = input1 * input2
126 |  * 
127 |  * @tparam T Data type (float or half)
128 |  * @param input1 First input tensor
129 |  * @param input2 Second input tensor
130 |  * @param output Output tensor
131 |  * @param stream CUDA stream
132 |  */
133 | template<typename T>
134 | void elementwiseMul(
135 |     const Tensor<T>& input1,
136 |     const Tensor<T>& input2,
137 |     Tensor<T>& output,
138 |     cudaStream_t stream = nullptr
139 | );
140 | 
141 | /**
142 |  * @brief Configuration for fused operations
143 |  */
144 | struct FusedOpsConfig {
145 |     // Thread block configuration
146 |     static constexpr int BLOCK_SIZE = 256;
147 |     
148 |     // Layer normalization
149 |     static constexpr float LAYERNORM_EPS = 1e-5f;
150 |     static constexpr int MIN_ELEMENTS_PER_THREAD = 4;
151 |     
152 |     // Shared memory configuration
153 |     static constexpr int MAX_SHARED_MEM = 48 * 1024;  // 48 KB
154 |     static constexpr int MIN_SHARED_MEM = 16 * 1024;  // 16 KB
155 |     
156 |     // Performance tuning
157 |     static constexpr bool USE_VECTORIZED_LOAD = true;
158 |     static constexpr bool USE_VECTORIZED_STORE = true;
159 |     static constexpr int UNROLL_FACTOR = 4;
160 |     
161 |     // Dropout configuration
162 |     static constexpr int THREADS_PER_ROW = 32;
163 |     static constexpr int ROWS_PER_BLOCK = 4;
164 | };
165 | 
166 | /**
167 |  * @brief Get optimal block size for fused operations
168 |  * 
169 |  * @param hidden_dim Hidden dimension size
170 |  * @return int Optimal block size
171 |  */
172 | inline int getOptimalBlockSize(int hidden_dim) {
173 |     // Choose block size based on hidden dimension
174 |     if (hidden_dim <= 128) return 128;
175 |     if (hidden_dim <= 256) return 256;
176 |     if (hidden_dim <= 512) return 512;
177 |     return 1024;
178 | }
179 | 
180 | /**
181 |  * @brief Check if tensor dimensions are compatible with fused operations
182 |  * 
183 |  * @param input Input tensor
184 |  * @param residual Residual tensor
185 |  * @return bool True if dimensions are compatible
186 |  */
187 | template<typename T>
188 | inline bool checkDimensions(
189 |     const Tensor<T>& input,
190 |     const Tensor<T>& residual
191 | ) {
192 |     return (
193 |         input.shape() == residual.shape() &&
194 |         input.shape().size() == 2  // Expect [batch_size, hidden_dim]
195 |     );
196 | }
197 | 
198 | /**
199 |  * @brief Calculate required shared memory size for layer normalization
200 |  * 
201 |  * @param block_size Thread block size
202 |  * @return size_t Required shared memory size in bytes
203 |  */
204 | inline size_t getLayerNormSharedMemSize(int block_size) {
205 |     // Need space for mean and variance
206 |     return 2 * block_size * sizeof(float);
207 | }
208 | 
209 | } // namespace ops
210 | } // namespace ltm
211 | 


--------------------------------------------------------------------------------
/src/core/parallel/mpi_utils.cpp:
--------------------------------------------------------------------------------
  1 | #include <mpi.h>
  2 | #include <cuda_runtime.h>
  3 | #include <nccl.h>
  4 | #include <vector>
  5 | #include <stdexcept>
  6 | #include <memory>
  7 | #include "core/parallel/mpi_utils.hpp"
  8 | 
  9 | namespace ltm {
 10 | namespace parallel {
 11 | 
 12 | class MPIContext {
 13 | public:
 14 |     static MPIContext& getInstance() {
 15 |         static MPIContext instance;
 16 |         return instance;
 17 |     }
 18 | 
 19 |     void initialize() {
 20 |         if (!initialized_) {
 21 |             int provided;
 22 |             MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &provided);
 23 |             if (provided < MPI_THREAD_MULTIPLE) {
 24 |                 throw std::runtime_error("MPI implementation does not support MPI_THREAD_MULTIPLE");
 25 |             }
 26 |             
 27 |             MPI_Comm_rank(MPI_COMM_WORLD, &rank_);
 28 |             MPI_Comm_size(MPI_COMM_WORLD, &world_size_);
 29 |             
 30 |             initializeNCCL();
 31 |             initialized_ = true;
 32 |         }
 33 |     }
 34 | 
 35 |     ~MPIContext() {
 36 |         if (initialized_) {
 37 |             ncclCommDestroy(nccl_comm_);
 38 |             MPI_Finalize();
 39 |         }
 40 |     }
 41 | 
 42 |     int getRank() const { return rank_; }
 43 |     int getWorldSize() const { return world_size_; }
 44 |     ncclComm_t getNCCLComm() const { return nccl_comm_; }
 45 | 
 46 | private:
 47 |     MPIContext() : initialized_(false), rank_(-1), world_size_(-1) {}
 48 |     
 49 |     void initializeNCCL() {
 50 |         // Get unique ID from rank 0
 51 |         ncclUniqueId nccl_id;
 52 |         if (rank_ == 0) {
 53 |             ncclGetUniqueId(&nccl_id);
 54 |         }
 55 |         
 56 |         // Broadcast NCCL ID to all ranks
 57 |         MPI_Bcast(&nccl_id, sizeof(ncclUniqueId), MPI_BYTE, 0, MPI_COMM_WORLD);
 58 |         
 59 |         // Initialize NCCL communicator
 60 |         ncclCommInitRank(&nccl_comm_, world_size_, nccl_id, rank_);
 61 |     }
 62 | 
 63 |     bool initialized_;
 64 |     int rank_;
 65 |     int world_size_;
 66 |     ncclComm_t nccl_comm_;
 67 | };
 68 | 
 69 | // Global synchronization
 70 | void synchronize() {
 71 |     MPI_Barrier(MPI_COMM_WORLD);
 72 |     cudaStreamSynchronize(nullptr);
 73 | }
 74 | 
 75 | // All-reduce operation for gradients
 76 | void allReduceGradients(void* data, size_t count, ncclDataType_t dtype, cudaStream_t stream) {
 77 |     auto& ctx = MPIContext::getInstance();
 78 |     ncclAllReduce(
 79 |         data,                  // sendbuff
 80 |         data,                  // recvbuff
 81 |         count,                // count
 82 |         dtype,                // datatype
 83 |         ncclSum,             // reduction operation
 84 |         ctx.getNCCLComm(),   // communicator
 85 |         stream               // CUDA stream
 86 |     );
 87 | }
 88 | 
 89 | // Broadcast parameters from rank 0
 90 | void broadcastParameters(void* data, size_t count, ncclDataType_t dtype, cudaStream_t stream) {
 91 |     auto& ctx = MPIContext::getInstance();
 92 |     ncclBroadcast(
 93 |         data,                // sendbuff
 94 |         data,                // recvbuff
 95 |         count,               // count
 96 |         dtype,               // datatype
 97 |         0,                   // root rank
 98 |         ctx.getNCCLComm(),  // communicator
 99 |         stream              // CUDA stream
100 |     );
101 | }
102 | 
103 | // Scatter data across ranks
104 | void scatterData(const void* send_data, void* recv_data, size_t count_per_rank, 
105 |                 ncclDataType_t dtype, cudaStream_t stream) {
106 |     auto& ctx = MPIContext::getInstance();
107 |     if (ctx.getRank() == 0) {
108 |         for (int i = 0; i < ctx.getWorldSize(); ++i) {
109 |             if (i == 0) {
110 |                 // Copy local data
111 |                 cudaMemcpyAsync(
112 |                     recv_data,
113 |                     send_data,
114 |                     count_per_rank * ncclTypeSize(dtype),
115 |                     cudaMemcpyDeviceToDevice,
116 |                     stream
117 |                 );
118 |             } else {
119 |                 // Send to other ranks
120 |                 MPI_Send(
121 |                     static_cast<const char*>(send_data) + i * count_per_rank * ncclTypeSize(dtype),
122 |                     count_per_rank * ncclTypeSize(dtype),
123 |                     MPI_BYTE,
124 |                     i,
125 |                     0,
126 |                     MPI_COMM_WORLD
127 |                 );
128 |             }
129 |         }
130 |     } else {
131 |         // Receive data from rank 0
132 |         MPI_Recv(
133 |             recv_data,
134 |             count_per_rank * ncclTypeSize(dtype),
135 |             MPI_BYTE,
136 |             0,
137 |             0,
138 |             MPI_COMM_WORLD,
139 |             MPI_STATUS_IGNORE
140 |         );
141 |     }
142 | }
143 | 
144 | // Gather data from all ranks
145 | void gatherData(const void* send_data, void* recv_data, size_t count_per_rank,
146 |                ncclDataType_t dtype, cudaStream_t stream) {
147 |     auto& ctx = MPIContext::getInstance();
148 |     if (ctx.getRank() == 0) {
149 |         // Copy local data
150 |         cudaMemcpyAsync(
151 |             recv_data,
152 |             send_data,
153 |             count_per_rank * ncclTypeSize(dtype),
154 |             cudaMemcpyDeviceToDevice,
155 |             stream
156 |         );
157 |         
158 |         // Receive from other ranks
159 |         for (int i = 1; i < ctx.getWorldSize(); ++i) {
160 |             MPI_Recv(
161 |                 static_cast<char*>(recv_data) + i * count_per_rank * ncclTypeSize(dtype),
162 |                 count_per_rank * ncclTypeSize(dtype),
163 |                 MPI_BYTE,
164 |                 i,
165 |                 0,
166 |                 MPI_COMM_WORLD,
167 |                 MPI_STATUS_IGNORE
168 |             );
169 |         }
170 |     } else {
171 |         // Send to rank 0
172 |         MPI_Send(
173 |             send_data,
174 |             count_per_rank * ncclTypeSize(dtype),
175 |             MPI_BYTE,
176 |             0,
177 |             0,
178 |             MPI_COMM_WORLD
179 |         );
180 |     }
181 | }
182 | 
183 | // Initialize MPI environment
184 | void initializeMPI() {
185 |     MPIContext::getInstance().initialize();
186 | }
187 | 
188 | // Get current rank
189 | int getCurrentRank() {
190 |     return MPIContext::getInstance().getRank();
191 | }
192 | 
193 | // Get world size
194 | int getWorldSize() {
195 |     return MPIContext::getInstance().getWorldSize();
196 | }
197 | 
198 | // Check if current process is master
199 | bool isMaster() {
200 |     return getCurrentRank() == 0;
201 | }
202 | 
203 | } // namespace parallel
204 | } // namespace ltm
205 | 


--------------------------------------------------------------------------------
/include/core/quantization/quantizer.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cuda_runtime.h>
  4 | #include <vector>
  5 | #include "core/utils/tensor.cuh"
  6 | 
  7 | namespace ltm {
  8 | namespace quantization {
  9 | 
 10 | enum class QuantizationPrecision {
 11 |     INT8,
 12 |     INT4,
 13 |     FP16,
 14 |     BF16
 15 | };
 16 | 
 17 | enum class CalibrationMethod {
 18 |     MINMAX,
 19 |     PERCENTILE,
 20 |     MSE,
 21 |     ENTROPY
 22 | };
 23 | 
 24 | struct QuantizationConfig {
 25 |     // General settings
 26 |     bool enabled = true;
 27 |     QuantizationPrecision precision = QuantizationPrecision::INT8;
 28 |     bool per_channel = true;
 29 |     bool symmetric = true;
 30 |     int channel_axis = 0;
 31 |     
 32 |     // Calibration settings
 33 |     CalibrationMethod calibration_method = CalibrationMethod::MINMAX;
 34 |     float percentile = 99.9f;  // For percentile calibration
 35 |     int num_samples = 1000;    // Number of samples for calibration
 36 |     
 37 |     // Dynamic quantization
 38 |     bool use_dynamic_ranges = false;
 39 |     int window_size = 1024;    // For dynamic range estimation
 40 |     
 41 |     // Mixed precision settings
 42 |     bool enable_mixed_precision = false;
 43 |     float sensitivity_threshold = 0.1f;  // For mixed precision decisions
 44 |     
 45 |     // Performance settings
 46 |     bool use_cuda_graphs = true;
 47 |     int num_cuda_streams = 4;
 48 |     
 49 |     // Optimization flags
 50 |     bool fuse_quantize_dequantize = true;
 51 |     bool cache_quantization_params = true;
 52 | };
 53 | 
 54 | // Forward declarations
 55 | template<typename T> class Quantizer;
 56 | 
 57 | // Interface for quantized tensors
 58 | template<typename T>
 59 | class QuantizedTensor {
 60 | public:
 61 |     QuantizedTensor(const std::vector<int>& shape, const QuantizationConfig& config)
 62 |         : shape_(shape), config_(config) {
 63 |         // Calculate size
 64 |         size_t num_elements = 1;
 65 |         for (int dim : shape) {
 66 |             num_elements *= dim;
 67 |         }
 68 |         
 69 |         // Allocate storage
 70 |         CUDA_CHECK(cudaMalloc(&data_, num_elements * sizeof(int8_t)));
 71 |         
 72 |         // Allocate space for quantization parameters
 73 |         if (config.per_channel) {
 74 |             int num_channels = shape[config.channel_axis];
 75 |             scales_.resize(num_channels);
 76 |             zero_points_.resize(num_channels);
 77 |         } else {
 78 |             scales_.resize(1);
 79 |             zero_points_.resize(1);
 80 |         }
 81 |     }
 82 | 
 83 |     ~QuantizedTensor() {
 84 |         if (data_) {
 85 |             CUDA_CHECK(cudaFree(data_));
 86 |         }
 87 |     }
 88 | 
 89 |     // Getters
 90 |     int8_t* data() const { return data_; }
 91 |     const std::vector<int>& shape() const { return shape_; }
 92 |     const std::vector<float>& scales() const { return scales_; }
 93 |     const std::vector<float>& zeroPoints() const { return zero_points_; }
 94 |     const QuantizationConfig& config() const { return config_; }
 95 | 
 96 |     // Utility functions
 97 |     size_t numel() const {
 98 |         size_t n = 1;
 99 |         for (int dim : shape_) {
100 |             n *= dim;
101 |         }
102 |         return n;
103 |     }
104 | 
105 |     void setQuantizationParams(
106 |         const std::vector<float>& scales,
107 |         const std::vector<float>& zero_points
108 |     ) {
109 |         scales_ = scales;
110 |         zero_points_ = zero_points;
111 |     }
112 | 
113 | private:
114 |     int8_t* data_ = nullptr;
115 |     std::vector<int> shape_;
116 |     std::vector<float> scales_;
117 |     std::vector<float> zero_points_;
118 |     QuantizationConfig config_;
119 |     
120 |     friend class Quantizer<T>;
121 | };
122 | 
123 | // Interface for quantization calibration
124 | class QuantizationCalibrator {
125 | public:
126 |     virtual ~QuantizationCalibrator() = default;
127 |     
128 |     virtual void collectStats(const void* data, size_t size) = 0;
129 |     virtual void computeRanges(float& min_val, float& max_val) = 0;
130 |     virtual void reset() = 0;
131 | };
132 | 
133 | // MinMax calibrator
134 | class MinMaxCalibrator : public QuantizationCalibrator {
135 | public:
136 |     MinMaxCalibrator() : min_val_(FLT_MAX), max_val_(-FLT_MAX) {}
137 |     
138 |     void collectStats(const void* data, size_t size) override;
139 |     void computeRanges(float& min_val, float& max_val) override;
140 |     void reset() override;
141 | 
142 | private:
143 |     float min_val_;
144 |     float max_val_;
145 | };
146 | 
147 | // Percentile calibrator
148 | class PercentileCalibrator : public QuantizationCalibrator {
149 | public:
150 |     explicit PercentileCalibrator(float percentile = 99.9f)
151 |         : percentile_(percentile) {}
152 |     
153 |     void collectStats(const void* data, size_t size) override;
154 |     void computeRanges(float& min_val, float& max_val) override;
155 |     void reset() override;
156 | 
157 | private:
158 |     float percentile_;
159 |     std::vector<float> values_;
160 | };
161 | 
162 | // MSE calibrator
163 | class MSECalibrator : public QuantizationCalibrator {
164 | public:
165 |     void collectStats(const void* data, size_t size) override;
166 |     void computeRanges(float& min_val, float& max_val) override;
167 |     void reset() override;
168 | 
169 | private:
170 |     std::vector<float> values_;
171 |     float optimal_min_ = 0.0f;
172 |     float optimal_max_ = 0.0f;
173 | };
174 | 
175 | // Main quantizer interface
176 | template<typename T>
177 | class Quantizer {
178 | public:
179 |     explicit Quantizer(const QuantizationConfig& config);
180 |     ~Quantizer();
181 |     
182 |     // Quantization
183 |     void quantize(const Tensor<T>& input, QuantizedTensor<T>& output);
184 |     void dequantize(const QuantizedTensor<T>& input, Tensor<T>& output);
185 |     
186 |     // Calibration
187 |     void calibrate(const std::vector<Tensor<T>>& calibration_data);
188 |     void resetCalibration();
189 |     
190 |     // Dynamic quantization
191 |     void updateDynamicRanges(const Tensor<T>& input);
192 |     
193 |     // Mixed precision
194 |     void analyzeSensitivity(const Tensor<T>& input, const Tensor<T>& grad);
195 |     bool shouldQuantize(const std::string& layer_name) const;
196 |     
197 |     // Utility functions
198 |     const QuantizationConfig& config() const { return config_; }
199 |     void setConfig(const QuantizationConfig& config) { config_ = config; }
200 |     
201 |     // Stream management
202 |     void setStream(cudaStream_t stream) { stream_ = stream; }
203 |     cudaStream_t getStream() const { return stream_; }
204 | 
205 | private:
206 |     // Implementation details in quantizer.cu
207 |     QuantizationConfig config_;
208 |     cudaStream_t stream_;
209 |     std::unique_ptr<QuantizationCalibrator> calibrator_;
210 |     
211 |     // Cached parameters
212 |     std::vector<float> scales_;
213 |     std::vector<float> zero_points_;
214 |     
215 |     // Mixed precision state
216 |     std::unordered_map<std::string, float> layer_sensitivity_;
217 | };
218 | 
219 | } // namespace quantization
220 | } // namespace ltm
221 | 


--------------------------------------------------------------------------------
/include/core/attention/memory_attention.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cuda_runtime.h>
  4 | #include <vector>
  5 | #include <memory>
  6 | #include "core/utils/tensor.cuh"
  7 | #include "core/utils/cuda_utils.cuh"
  8 | #include "core/ltm/memory_bank.cuh"
  9 | 
 10 | namespace ltm {
 11 | namespace attention {
 12 | 
 13 | /**
 14 |  * @brief Configuration for memory attention
 15 |  */
 16 | struct MemoryAttentionConfig {
 17 |     int hidden_dim = 768;            // Hidden dimension
 18 |     int num_heads = 12;              // Number of attention heads
 19 |     int head_dim = 64;               // Dimension per head
 20 |     float dropout_prob = 0.1f;       // Attention dropout probability
 21 |     bool use_bias = true;           // Use bias in projections
 22 |     bool scale_by_dim = true;       // Scale attention by sqrt(head_dim)
 23 |     bool use_rotary = true;         // Use rotary position embeddings
 24 |     bool use_alibi = false;         // Use ALiBi position bias
 25 |     bool use_memory_compression = true;  // Compress memory before attention
 26 |     float memory_compression_ratio = 0.5f;  // Memory compression ratio
 27 |     bool use_memory_gating = true;  // Gate memory attention contribution
 28 |     int max_memory_length = 16384;  // Maximum memory context length
 29 | };
 30 | 
 31 | /**
 32 |  * @brief Memory attention module
 33 |  * 
 34 |  * Implements attention between the current input context and the memory bank,
 35 |  * allowing the model to access and integrate information from long-term memory.
 36 |  * 
 37 |  * @tparam T Data type (float or half)
 38 |  */
 39 | template<typename T>
 40 | class MemoryAttention {
 41 | public:
 42 |     /**
 43 |      * @brief Create memory attention module
 44 |      * 
 45 |      * @param config Attention configuration
 46 |      * @param memory_bank Memory bank reference
 47 |      */
 48 |     MemoryAttention(
 49 |         const MemoryAttentionConfig& config,
 50 |         memory::MemoryBank<T>& memory_bank
 51 |     );
 52 | 
 53 |     /**
 54 |      * @brief Initialize parameters
 55 |      * 
 56 |      * @param stream CUDA stream
 57 |      */
 58 |     void initialize(cudaStream_t stream = nullptr);
 59 | 
 60 |     /**
 61 |      * @brief Forward pass
 62 |      * 
 63 |      * @param input Input tensor [batch_size, seq_len, hidden_dim]
 64 |      * @param output Output tensor [batch_size, seq_len, hidden_dim]
 65 |      * @param attention_mask Optional attention mask
 66 |      * @param stream CUDA stream
 67 |      */
 68 |     void forward(
 69 |         const Tensor<T>& input,
 70 |         Tensor<T>& output,
 71 |         const Tensor<T>* attention_mask = nullptr,
 72 |         cudaStream_t stream = nullptr
 73 |     );
 74 | 
 75 |     /**
 76 |      * @brief Backward pass
 77 |      * 
 78 |      * @param grad_output Gradient w.r.t. output [batch_size, seq_len, hidden_dim]
 79 |      * @param grad_input Gradient w.r.t. input [batch_size, seq_len, hidden_dim]
 80 |      * @param stream CUDA stream
 81 |      */
 82 |     void backward(
 83 |         const Tensor<T>& grad_output,
 84 |         Tensor<T>& grad_input,
 85 |         cudaStream_t stream = nullptr
 86 |     );
 87 | 
 88 |     /**
 89 |      * @brief Update parameters
 90 |      * 
 91 |      * @param learning_rate Learning rate
 92 |      * @param stream CUDA stream
 93 |      */
 94 |     void updateParameters(float learning_rate, cudaStream_t stream = nullptr);
 95 | 
 96 |     /**
 97 |      * @brief Get attention configuration
 98 |      * 
 99 |      * @return const MemoryAttentionConfig& Configuration
100 |      */
101 |     const MemoryAttentionConfig& getConfig() const { return config_; }
102 | 
103 |     /**
104 |      * @brief Set attention configuration
105 |      * 
106 |      * @param config New configuration
107 |      */
108 |     void setConfig(const MemoryAttentionConfig& config) { config_ = config; }
109 | 
110 |     /**
111 |      * @brief Get attention statistics
112 |      * 
113 |      * @return std::vector<float> Attention statistics
114 |      */
115 |     std::vector<float> getStats() const;
116 | 
117 |     /**
118 |      * @brief Save attention parameters
119 |      * 
120 |      * @param path Path to save parameters
121 |      */
122 |     void save(const std::string& path) const;
123 | 
124 |     /**
125 |      * @brief Load attention parameters
126 |      * 
127 |      * @param path Path to load parameters from
128 |      */
129 |     void load(const std::string& path);
130 | 
131 | private:
132 |     // Model parameters
133 |     Tensor<T> query_weight_;     // [hidden_dim, hidden_dim]
134 |     Tensor<T> key_weight_;       // [hidden_dim, hidden_dim]
135 |     Tensor<T> value_weight_;     // [hidden_dim, hidden_dim]
136 |     Tensor<T> output_weight_;    // [hidden_dim, hidden_dim]
137 |     
138 |     // Optional bias parameters
139 |     Tensor<T> query_bias_;       // [hidden_dim]
140 |     Tensor<T> key_bias_;         // [hidden_dim]
141 |     Tensor<T> value_bias_;       // [hidden_dim]
142 |     Tensor<T> output_bias_;      // [hidden_dim]
143 |     
144 |     // Memory compression parameters
145 |     Tensor<T> memory_proj_;      // [hidden_dim, compressed_dim]
146 |     Tensor<T> memory_gate_;      // [hidden_dim, 1]
147 |     
148 |     // Parameter gradients
149 |     Tensor<T> query_weight_grad_;
150 |     Tensor<T> key_weight_grad_;
151 |     Tensor<T> value_weight_grad_;
152 |     Tensor<T> output_weight_grad_;
153 |     Tensor<T> query_bias_grad_;
154 |     Tensor<T> key_bias_grad_;
155 |     Tensor<T> value_bias_grad_;
156 |     Tensor<T> output_bias_grad_;
157 |     Tensor<T> memory_proj_grad_;
158 |     Tensor<T> memory_gate_grad_;
159 |     
160 |     // Intermediate buffers
161 |     Tensor<T> query_;           // [batch_size, seq_len, hidden_dim]
162 |     Tensor<T> key_;            // [batch_size, mem_len, hidden_dim]
163 |     Tensor<T> value_;          // [batch_size, mem_len, hidden_dim]
164 |     Tensor<T> attention_scores_;  // [batch_size, num_heads, seq_len, mem_len]
165 |     Tensor<T> attention_probs_;   // [batch_size, num_heads, seq_len, mem_len]
166 |     Tensor<T> attention_output_;  // [batch_size, seq_len, hidden_dim]
167 |     Tensor<T> memory_gate_scores_;  // [batch_size, seq_len, 1]
168 |     
169 |     // Configuration and state
170 |     MemoryAttentionConfig config_;
171 |     memory::MemoryBank<T>& memory_bank_;
172 |     
173 |     // Helper functions
174 |     void initializeParameters(cudaStream_t stream);
175 |     
176 |     void projectQKV(
177 |         const Tensor<T>& input,
178 |         cudaStream_t stream
179 |     );
180 |     
181 |     void computeAttentionScores(
182 |         const Tensor<T>* mask,
183 |         cudaStream_t stream
184 |     );
185 |     
186 |     void applyAttention(cudaStream_t stream);
187 |     
188 |     void projectOutput(
189 |         Tensor<T>& output,
190 |         cudaStream_t stream
191 |     );
192 |     
193 |     void compressMemory(cudaStream_t stream);
194 |     
195 |     void computeMemoryGating(
196 |         const Tensor<T>& input,
197 |         cudaStream_t stream
198 |     );
199 |     
200 |     void applyPositionEmbeddings(cudaStream_t stream);
201 | };
202 | 
203 | // Explicit instantiations
204 | extern template class MemoryAttention<float>;
205 | extern template class MemoryAttention<half>;
206 | 
207 | } // namespace attention
208 | } // namespace ltm
209 | 


--------------------------------------------------------------------------------
/include/core/attention/flash_attention.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cuda_runtime.h>
  4 | #include <vector>
  5 | #include <memory>
  6 | #include "core/utils/tensor.cuh"
  7 | #include "core/utils/cuda_utils.cuh"
  8 | 
  9 | namespace ltm {
 10 | namespace attention {
 11 | 
 12 | /**
 13 |  * @brief Configuration for flash attention
 14 |  */
 15 | struct FlashAttentionConfig {
 16 |     int hidden_dim = 768;            // Hidden dimension
 17 |     int num_heads = 12;              // Number of attention heads
 18 |     int head_dim = 64;               // Dimension per head
 19 |     float dropout_prob = 0.1f;       // Attention dropout probability
 20 |     bool use_bias = true;           // Use bias in projections
 21 |     bool scale_by_dim = true;       // Scale attention by sqrt(head_dim)
 22 |     bool causal = false;            // Use causal attention mask
 23 |     int block_size = 64;            // Block size for tiling
 24 |     int chunk_size = 1024;          // Chunk size for memory efficiency
 25 |     bool use_alibi = false;         // Use ALiBi position bias
 26 |     bool use_rope = true;           // Use rotary position embeddings
 27 |     bool fuse_qkv = true;          // Fuse QKV projections
 28 |     bool fuse_softmax = true;      // Fuse softmax computation
 29 |     bool use_flash_v2 = true;      // Use FlashAttention v2 optimizations
 30 | };
 31 | 
 32 | /**
 33 |  * @brief Flash attention module
 34 |  * 
 35 |  * Implements efficient attention computation with minimal memory overhead
 36 |  * by processing attention in chunks and avoiding materializing the full
 37 |  * attention matrix.
 38 |  * 
 39 |  * @tparam T Data type (float or half)
 40 |  */
 41 | template<typename T>
 42 | class FlashAttention {
 43 | public:
 44 |     /**
 45 |      * @brief Create flash attention module
 46 |      * 
 47 |      * @param config Attention configuration
 48 |      */
 49 |     explicit FlashAttention(const FlashAttentionConfig& config);
 50 | 
 51 |     /**
 52 |      * @brief Initialize parameters
 53 |      * 
 54 |      * @param stream CUDA stream
 55 |      */
 56 |     void initialize(cudaStream_t stream = nullptr);
 57 | 
 58 |     /**
 59 |      * @brief Forward pass
 60 |      * 
 61 |      * @param input Input tensor [batch_size, seq_len, hidden_dim]
 62 |      * @param output Output tensor [batch_size, seq_len, hidden_dim]
 63 |      * @param attention_mask Optional attention mask
 64 |      * @param stream CUDA stream
 65 |      */
 66 |     void forward(
 67 |         const Tensor<T>& input,
 68 |         Tensor<T>& output,
 69 |         const Tensor<T>* attention_mask = nullptr,
 70 |         cudaStream_t stream = nullptr
 71 |     );
 72 | 
 73 |     /**
 74 |      * @brief Backward pass
 75 |      * 
 76 |      * @param grad_output Gradient w.r.t. output [batch_size, seq_len, hidden_dim]
 77 |      * @param grad_input Gradient w.r.t. input [batch_size, seq_len, hidden_dim]
 78 |      * @param stream CUDA stream
 79 |      */
 80 |     void backward(
 81 |         const Tensor<T>& grad_output,
 82 |         Tensor<T>& grad_input,
 83 |         cudaStream_t stream = nullptr
 84 |     );
 85 | 
 86 |     /**
 87 |      * @brief Update parameters
 88 |      * 
 89 |      * @param learning_rate Learning rate
 90 |      * @param stream CUDA stream
 91 |      */
 92 |     void updateParameters(float learning_rate, cudaStream_t stream = nullptr);
 93 | 
 94 |     /**
 95 |      * @brief Get attention configuration
 96 |      * 
 97 |      * @return const FlashAttentionConfig& Configuration
 98 |      */
 99 |     const FlashAttentionConfig& getConfig() const { return config_; }
100 | 
101 |     /**
102 |      * @brief Set attention configuration
103 |      * 
104 |      * @param config New configuration
105 |      */
106 |     void setConfig(const FlashAttentionConfig& config) { config_ = config; }
107 | 
108 |     /**
109 |      * @brief Get attention statistics
110 |      * 
111 |      * @return std::vector<float> Attention statistics
112 |      */
113 |     std::vector<float> getStats() const;
114 | 
115 |     /**
116 |      * @brief Save attention parameters
117 |      * 
118 |      * @param path Path to save parameters
119 |      */
120 |     void save(const std::string& path) const;
121 | 
122 |     /**
123 |      * @brief Load attention parameters
124 |      * 
125 |      * @param path Path to load parameters from
126 |      */
127 |     void load(const std::string& path);
128 | 
129 | private:
130 |     // Model parameters
131 |     Tensor<T> qkv_weight_;      // [3, hidden_dim, hidden_dim]
132 |     Tensor<T> output_weight_;   // [hidden_dim, hidden_dim]
133 |     
134 |     // Optional bias parameters
135 |     Tensor<T> qkv_bias_;        // [3, hidden_dim]
136 |     Tensor<T> output_bias_;     // [hidden_dim]
137 |     
138 |     // Parameter gradients
139 |     Tensor<T> qkv_weight_grad_;
140 |     Tensor<T> output_weight_grad_;
141 |     Tensor<T> qkv_bias_grad_;
142 |     Tensor<T> output_bias_grad_;
143 |     
144 |     // Intermediate buffers
145 |     Tensor<T> qkv_;            // [batch_size, seq_len, 3, hidden_dim]
146 |     Tensor<T> query_;          // [batch_size, num_heads, seq_len, head_dim]
147 |     Tensor<T> key_;           // [batch_size, num_heads, seq_len, head_dim]
148 |     Tensor<T> value_;         // [batch_size, num_heads, seq_len, head_dim]
149 |     Tensor<T> attention_output_;  // [batch_size, seq_len, hidden_dim]
150 |     
151 |     // Tiling state
152 |     struct TileInfo {
153 |         int block_size;       // Block size for tiling
154 |         int num_blocks;       // Number of blocks
155 |         int chunk_size;       // Chunk size for memory efficiency
156 |         int num_chunks;       // Number of chunks
157 |         float scaling;        // Attention scaling factor
158 |     };
159 |     TileInfo tile_info_;
160 |     
161 |     // Configuration
162 |     FlashAttentionConfig config_;
163 |     
164 |     // Helper functions
165 |     void initializeParameters(cudaStream_t stream);
166 |     
167 |     void projectQKV(
168 |         const Tensor<T>& input,
169 |         cudaStream_t stream
170 |     );
171 |     
172 |     void computeAttention(
173 |         const Tensor<T>* mask,
174 |         cudaStream_t stream
175 |     );
176 |     
177 |     void projectOutput(
178 |         Tensor<T>& output,
179 |         cudaStream_t stream
180 |     );
181 |     
182 |     void applyPositionEmbeddings(cudaStream_t stream);
183 |     
184 |     void computeBlockMaxima(
185 |         const Tensor<T>& block,
186 |         Tensor<T>& maxima,
187 |         cudaStream_t stream
188 |     );
189 |     
190 |     void computeBlockSoftmax(
191 |         const Tensor<T>& block,
192 |         const Tensor<T>& maxima,
193 |         Tensor<T>& softmax,
194 |         cudaStream_t stream
195 |     );
196 |     
197 |     void computeChunkAttention(
198 |         const Tensor<T>& query_chunk,
199 |         const Tensor<T>& key_chunk,
200 |         const Tensor<T>& value_chunk,
201 |         Tensor<T>& output_chunk,
202 |         const Tensor<T>* mask_chunk,
203 |         cudaStream_t stream
204 |     );
205 |     
206 |     void updateOutputChunk(
207 |         const Tensor<T>& chunk_output,
208 |         Tensor<T>& final_output,
209 |         int chunk_idx,
210 |         cudaStream_t stream
211 |     );
212 | };
213 | 
214 | // Explicit instantiations
215 | extern template class FlashAttention<float>;
216 | extern template class FlashAttention<half>;
217 | 
218 | } // namespace attention
219 | } // namespace ltm
220 | 


--------------------------------------------------------------------------------
/include/core/utils/cuda_utils.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cuda_runtime.h>
  4 | #include <cuda_fp16.h>
  5 | #include <cublas_v2.h>
  6 | #include <stdexcept>
  7 | #include <string>
  8 | #include <sstream>
  9 | 
 10 | namespace ltm {
 11 | 
 12 | /**
 13 |  * @brief Check CUDA error and throw exception if any
 14 |  * 
 15 |  * @param error CUDA error code
 16 |  * @param file Source file name
 17 |  * @param line Source line number
 18 |  */
 19 | inline void checkCudaError(cudaError_t error, const char* file, int line) {
 20 |     if (error != cudaSuccess) {
 21 |         std::stringstream ss;
 22 |         ss << "CUDA error " << cudaGetErrorString(error)
 23 |            << " at " << file << ":" << line;
 24 |         throw std::runtime_error(ss.str());
 25 |     }
 26 | }
 27 | 
 28 | /**
 29 |  * @brief Check cuBLAS error and throw exception if any
 30 |  * 
 31 |  * @param error cuBLAS error code
 32 |  * @param file Source file name
 33 |  * @param line Source line number
 34 |  */
 35 | inline void checkCublasError(cublasStatus_t error, const char* file, int line) {
 36 |     if (error != CUBLAS_STATUS_SUCCESS) {
 37 |         std::stringstream ss;
 38 |         ss << "cuBLAS error " << error
 39 |            << " at " << file << ":" << line;
 40 |         throw std::runtime_error(ss.str());
 41 |     }
 42 | }
 43 | 
 44 | // Macro for CUDA error checking
 45 | #define CUDA_CHECK(err) checkCudaError(err, __FILE__, __LINE__)
 46 | 
 47 | // Macro for cuBLAS error checking
 48 | #define CUBLAS_CHECK(err) checkCublasError(err, __FILE__, __LINE__)
 49 | 
 50 | /**
 51 |  * @brief Get optimal block size for CUDA kernel
 52 |  * 
 53 |  * @param func Kernel function pointer
 54 |  * @param dynamic_smem_size Dynamic shared memory size
 55 |  * @return int Optimal block size
 56 |  */
 57 | template<typename F>
 58 | inline int getOptimalBlockSize(F func, size_t dynamic_smem_size = 0) {
 59 |     int min_grid_size;
 60 |     int block_size;
 61 |     CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(
 62 |         &min_grid_size,
 63 |         &block_size,
 64 |         func,
 65 |         dynamic_smem_size
 66 |     ));
 67 |     return block_size;
 68 | }
 69 | 
 70 | /**
 71 |  * @brief Get grid size for given problem size and block size
 72 |  * 
 73 |  * @param n Total number of elements
 74 |  * @param block_size Thread block size
 75 |  * @return int Grid size
 76 |  */
 77 | inline int getGridSize(int n, int block_size) {
 78 |     return (n + block_size - 1) / block_size;
 79 | }
 80 | 
 81 | /**
 82 |  * @brief Convert data type to float
 83 |  * 
 84 |  * @tparam T Input type
 85 |  * @param x Input value
 86 |  * @return float Converted value
 87 |  */
 88 | template<typename T>
 89 | __device__ __forceinline__ float type2float(T x);
 90 | 
 91 | // Specializations for different types
 92 | template<>
 93 | __device__ __forceinline__ float type2float<float>(float x) {
 94 |     return x;
 95 | }
 96 | 
 97 | template<>
 98 | __device__ __forceinline__ float type2float<half>(half x) {
 99 |     return __half2float(x);
100 | }
101 | 
102 | /**
103 |  * @brief Convert float to target type
104 |  * 
105 |  * @tparam T Target type
106 |  * @param x Float value
107 |  * @return T Converted value
108 |  */
109 | template<typename T>
110 | __device__ __forceinline__ T cuda_cast(float x);
111 | 
112 | // Specializations for different types
113 | template<>
114 | __device__ __forceinline__ float cuda_cast<float>(float x) {
115 |     return x;
116 | }
117 | 
118 | template<>
119 | __device__ __forceinline__ half cuda_cast<half>(float x) {
120 |     return __float2half(x);
121 | }
122 | 
123 | /**
124 |  * @brief CUDA memory deleter for unique_ptr
125 |  */
126 | struct CudaDeleter {
127 |     void operator()(void* ptr) const {
128 |         if (ptr) {
129 |             CUDA_CHECK(cudaFree(ptr));
130 |         }
131 |     }
132 | };
133 | 
134 | /**
135 |  * @brief Unique pointer for CUDA memory
136 |  */
137 | template<typename T>
138 | using CudaUniquePtr = std::unique_ptr<T, CudaDeleter>;
139 | 
140 | /**
141 |  * @brief Allocate CUDA memory with unique_ptr
142 |  * 
143 |  * @tparam T Data type
144 |  * @param size Number of elements
145 |  * @return CudaUniquePtr<T> Unique pointer to allocated memory
146 |  */
147 | template<typename T>
148 | CudaUniquePtr<T> cudaMakeUnique(size_t size) {
149 |     T* ptr;
150 |     CUDA_CHECK(cudaMalloc(&ptr, size * sizeof(T)));
151 |     return CudaUniquePtr<T>(ptr);
152 | }
153 | 
154 | /**
155 |  * @brief CUDA stream wrapper
156 |  */
157 | class CudaStream {
158 | public:
159 |     CudaStream() {
160 |         CUDA_CHECK(cudaStreamCreate(&stream_));
161 |     }
162 | 
163 |     ~CudaStream() {
164 |         if (stream_) {
165 |             CUDA_CHECK(cudaStreamDestroy(stream_));
166 |         }
167 |     }
168 | 
169 |     // Disable copy
170 |     CudaStream(const CudaStream&) = delete;
171 |     CudaStream& operator=(const CudaStream&) = delete;
172 | 
173 |     // Enable move
174 |     CudaStream(CudaStream&& other) noexcept : stream_(other.stream_) {
175 |         other.stream_ = nullptr;
176 |     }
177 | 
178 |     CudaStream& operator=(CudaStream&& other) noexcept {
179 |         if (this != &other) {
180 |             if (stream_) {
181 |                 CUDA_CHECK(cudaStreamDestroy(stream_));
182 |             }
183 |             stream_ = other.stream_;
184 |             other.stream_ = nullptr;
185 |         }
186 |         return *this;
187 |     }
188 | 
189 |     /**
190 |      * @brief Get raw CUDA stream
191 |      */
192 |     cudaStream_t get() const { return stream_; }
193 | 
194 |     /**
195 |      * @brief Synchronize stream
196 |      */
197 |     void synchronize() const {
198 |         CUDA_CHECK(cudaStreamSynchronize(stream_));
199 |     }
200 | 
201 | private:
202 |     cudaStream_t stream_ = nullptr;
203 | };
204 | 
205 | /**
206 |  * @brief CUDA event wrapper
207 |  */
208 | class CudaEvent {
209 | public:
210 |     CudaEvent() {
211 |         CUDA_CHECK(cudaEventCreate(&event_));
212 |     }
213 | 
214 |     ~CudaEvent() {
215 |         if (event_) {
216 |             CUDA_CHECK(cudaEventDestroy(event_));
217 |         }
218 |     }
219 | 
220 |     // Disable copy
221 |     CudaEvent(const CudaEvent&) = delete;
222 |     CudaEvent& operator=(const CudaEvent&) = delete;
223 | 
224 |     // Enable move
225 |     CudaEvent(CudaEvent&& other) noexcept : event_(other.event_) {
226 |         other.event_ = nullptr;
227 |     }
228 | 
229 |     CudaEvent& operator=(CudaEvent&& other) noexcept {
230 |         if (this != &other) {
231 |             if (event_) {
232 |                 CUDA_CHECK(cudaEventDestroy(event_));
233 |             }
234 |             event_ = other.event_;
235 |             other.event_ = nullptr;
236 |         }
237 |         return *this;
238 |     }
239 | 
240 |     /**
241 |      * @brief Get raw CUDA event
242 |      */
243 |     cudaEvent_t get() const { return event_; }
244 | 
245 |     /**
246 |      * @brief Record event on stream
247 |      */
248 |     void record(cudaStream_t stream = nullptr) {
249 |         CUDA_CHECK(cudaEventRecord(event_, stream));
250 |     }
251 | 
252 |     /**
253 |      * @brief Synchronize event
254 |      */
255 |     void synchronize() {
256 |         CUDA_CHECK(cudaEventSynchronize(event_));
257 |     }
258 | 
259 | private:
260 |     cudaEvent_t event_ = nullptr;
261 | };
262 | 
263 | /**
264 |  * @brief Get available GPU memory
265 |  * 
266 |  * @param device_id GPU device ID
267 |  * @return std::pair<size_t, size_t> Free and total memory in bytes
268 |  */
269 | inline std::pair<size_t, size_t> getGpuMemoryInfo(int device_id = 0) {
270 |     size_t free_memory, total_memory;
271 |     CUDA_CHECK(cudaSetDevice(device_id));
272 |     CUDA_CHECK(cudaMemGetInfo(&free_memory, &total_memory));
273 |     return {free_memory, total_memory};
274 | }
275 | 
276 | /**
277 |  * @brief Set GPU device with error checking
278 |  * 
279 |  * @param device_id GPU device ID
280 |  */
281 | inline void setDevice(int device_id) {
282 |     CUDA_CHECK(cudaSetDevice(device_id));
283 | }
284 | 
285 | /**
286 |  * @brief Get current GPU device
287 |  * 
288 |  * @return int Current device ID
289 |  */
290 | inline int getCurrentDevice() {
291 |     int device_id;
292 |     CUDA_CHECK(cudaGetDevice(&device_id));
293 |     return device_id;
294 | }
295 | 
296 | /**
297 |  * @brief Get number of available GPUs
298 |  * 
299 |  * @return int Number of GPUs
300 |  */
301 | inline int getDeviceCount() {
302 |     int count;
303 |     CUDA_CHECK(cudaGetDeviceCount(&count));
304 |     return count;
305 | }
306 | 
307 | } // namespace ltm
308 | 


--------------------------------------------------------------------------------
/include/core/transformer/titan_inspired_block.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cuda_runtime.h>
  4 | #include <vector>
  5 | #include <memory>
  6 | #include "core/utils/tensor.cuh"
  7 | #include "core/utils/cuda_utils.cuh"
  8 | #include "core/attention/flash_attention.cuh"
  9 | #include "core/attention/memory_attention.cuh"
 10 | #include "core/ltm/memory_bank.cuh"
 11 | #include "core/ltm/compression_gate.cuh"
 12 | 
 13 | namespace ltm {
 14 | namespace transformer {
 15 | 
 16 | /**
 17 |  * @brief Configuration for Titan-inspired transformer block
 18 |  */
 19 | struct TitanBlockConfig {
 20 |     // Model dimensions
 21 |     int hidden_dim = 768;            // Hidden dimension
 22 |     int ffn_dim = 3072;             // Feed-forward dimension
 23 |     int num_heads = 12;              // Number of attention heads
 24 |     int head_dim = 64;              // Dimension per head
 25 |     
 26 |     // Memory configuration
 27 |     int memory_slots = 512;         // Number of memory slots
 28 |     int memory_dim = 64;           // Memory slot dimension
 29 |     float memory_update_rate = 0.9f;  // Memory update rate
 30 |     bool use_memory_compression = true;  // Use memory compression
 31 |     float memory_compression_ratio = 0.5f;  // Memory compression ratio
 32 |     
 33 |     // Attention configuration
 34 |     bool use_flash_attention = true;  // Use flash attention
 35 |     bool use_alibi = false;         // Use ALiBi position bias
 36 |     bool use_rotary = true;         // Use rotary embeddings
 37 |     float dropout_prob = 0.1f;      // Dropout probability
 38 |     
 39 |     // Architecture configuration
 40 |     bool use_parallel_attention = true;  // Run attention layers in parallel
 41 |     bool use_memory_gating = true;     // Use gating for memory integration
 42 |     bool use_layer_norm = true;        // Use layer normalization
 43 |     bool use_bias = true;              // Use bias terms
 44 |     bool fuse_operations = true;       // Fuse compatible operations
 45 |     
 46 |     // Training configuration
 47 |     bool learn_memory = true;          // Learn memory parameters
 48 |     bool learn_compression = true;     // Learn compression parameters
 49 |     bool gradient_checkpointing = true;  // Use gradient checkpointing
 50 | };
 51 | 
 52 | /**
 53 |  * @brief Titan-inspired transformer block
 54 |  * 
 55 |  * Implements a transformer block with long-term memory capabilities,
 56 |  * inspired by Google's Titan architecture. Integrates flash attention,
 57 |  * memory bank, and compression mechanisms.
 58 |  * 
 59 |  * @tparam T Data type (float or half)
 60 |  */
 61 | template<typename T>
 62 | class TitanBlock {
 63 | public:
 64 |     /**
 65 |      * @brief Create Titan-inspired transformer block
 66 |      * 
 67 |      * @param config Block configuration
 68 |      */
 69 |     explicit TitanBlock(const TitanBlockConfig& config);
 70 | 
 71 |     /**
 72 |      * @brief Initialize parameters
 73 |      * 
 74 |      * @param stream CUDA stream
 75 |      */
 76 |     void initialize(cudaStream_t stream = nullptr);
 77 | 
 78 |     /**
 79 |      * @brief Forward pass
 80 |      * 
 81 |      * @param input Input tensor [batch_size, seq_len, hidden_dim]
 82 |      * @param output Output tensor [batch_size, seq_len, hidden_dim]
 83 |      * @param attention_mask Optional attention mask
 84 |      * @param stream CUDA stream
 85 |      */
 86 |     void forward(
 87 |         const Tensor<T>& input,
 88 |         Tensor<T>& output,
 89 |         const Tensor<T>* attention_mask = nullptr,
 90 |         cudaStream_t stream = nullptr
 91 |     );
 92 | 
 93 |     /**
 94 |      * @brief Backward pass
 95 |      * 
 96 |      * @param grad_output Gradient w.r.t. output [batch_size, seq_len, hidden_dim]
 97 |      * @param grad_input Gradient w.r.t. input [batch_size, seq_len, hidden_dim]
 98 |      * @param stream CUDA stream
 99 |      */
100 |     void backward(
101 |         const Tensor<T>& grad_output,
102 |         Tensor<T>& grad_input,
103 |         cudaStream_t stream = nullptr
104 |     );
105 | 
106 |     /**
107 |      * @brief Update parameters
108 |      * 
109 |      * @param learning_rate Learning rate
110 |      * @param stream CUDA stream
111 |      */
112 |     void updateParameters(float learning_rate, cudaStream_t stream = nullptr);
113 | 
114 |     /**
115 |      * @brief Get block configuration
116 |      * 
117 |      * @return const TitanBlockConfig& Configuration
118 |      */
119 |     const TitanBlockConfig& getConfig() const { return config_; }
120 | 
121 |     /**
122 |      * @brief Set block configuration
123 |      * 
124 |      * @param config New configuration
125 |      */
126 |     void setConfig(const TitanBlockConfig& config) { config_ = config; }
127 | 
128 |     /**
129 |      * @brief Get memory bank
130 |      * 
131 |      * @return memory::MemoryBank<T>& Memory bank reference
132 |      */
133 |     memory::MemoryBank<T>& getMemoryBank() { return *memory_bank_; }
134 | 
135 |     /**
136 |      * @brief Get block statistics
137 |      * 
138 |      * @return std::vector<float> Block statistics
139 |      */
140 |     std::vector<float> getStats() const;
141 | 
142 |     /**
143 |      * @brief Save block parameters
144 |      * 
145 |      * @param path Path to save parameters
146 |      */
147 |     void save(const std::string& path) const;
148 | 
149 |     /**
150 |      * @brief Load block parameters
151 |      * 
152 |      * @param path Path to load parameters from
153 |      */
154 |     void load(const std::string& path);
155 | 
156 | private:
157 |     // Core components
158 |     std::unique_ptr<attention::FlashAttention<T>> self_attention_;
159 |     std::unique_ptr<attention::MemoryAttention<T>> memory_attention_;
160 |     std::unique_ptr<memory::MemoryBank<T>> memory_bank_;
161 |     std::unique_ptr<memory::CompressionGate<T>> compression_gate_;
162 |     
163 |     // Feed-forward parameters
164 |     Tensor<T> ffn_weight1_;    // [hidden_dim, ffn_dim]
165 |     Tensor<T> ffn_weight2_;    // [ffn_dim, hidden_dim]
166 |     Tensor<T> ffn_bias1_;      // [ffn_dim]
167 |     Tensor<T> ffn_bias2_;      // [hidden_dim]
168 |     
169 |     // Layer normalization parameters
170 |     Tensor<T> norm1_weight_;   // [hidden_dim]
171 |     Tensor<T> norm1_bias_;     // [hidden_dim]
172 |     Tensor<T> norm2_weight_;   // [hidden_dim]
173 |     Tensor<T> norm2_bias_;     // [hidden_dim]
174 |     
175 |     // Memory gating parameters
176 |     Tensor<T> memory_gate_weight_;  // [hidden_dim, hidden_dim]
177 |     Tensor<T> memory_gate_bias_;    // [hidden_dim]
178 |     
179 |     // Parameter gradients
180 |     Tensor<T> ffn_weight1_grad_;
181 |     Tensor<T> ffn_weight2_grad_;
182 |     Tensor<T> ffn_bias1_grad_;
183 |     Tensor<T> ffn_bias2_grad_;
184 |     Tensor<T> norm1_weight_grad_;
185 |     Tensor<T> norm1_bias_grad_;
186 |     Tensor<T> norm2_weight_grad_;
187 |     Tensor<T> norm2_bias_grad_;
188 |     Tensor<T> memory_gate_weight_grad_;
189 |     Tensor<T> memory_gate_bias_grad_;
190 |     
191 |     // Intermediate buffers
192 |     Tensor<T> attention_output_;
193 |     Tensor<T> memory_output_;
194 |     Tensor<T> ffn_intermediate_;
195 |     Tensor<T> normalized_input_;
196 |     Tensor<T> memory_gate_scores_;
197 |     
198 |     // Configuration
199 |     TitanBlockConfig config_;
200 |     
201 |     // Helper functions
202 |     void initializeParameters(cudaStream_t stream);
203 |     
204 |     void computeSelfAttention(
205 |         const Tensor<T>& input,
206 |         const Tensor<T>* mask,
207 |         cudaStream_t stream
208 |     );
209 |     
210 |     void computeMemoryAttention(
211 |         const Tensor<T>& input,
212 |         const Tensor<T>* mask,
213 |         cudaStream_t stream
214 |     );
215 |     
216 |     void computeFFN(
217 |         const Tensor<T>& input,
218 |         cudaStream_t stream
219 |     );
220 |     
221 |     void layerNorm(
222 |         const Tensor<T>& input,
223 |         const Tensor<T>& weight,
224 |         const Tensor<T>& bias,
225 |         Tensor<T>& output,
226 |         cudaStream_t stream
227 |     );
228 |     
229 |     void computeMemoryGating(
230 |         const Tensor<T>& input,
231 |         cudaStream_t stream
232 |     );
233 |     
234 |     void integrateMemory(
235 |         const Tensor<T>& input,
236 |         Tensor<T>& output,
237 |         cudaStream_t stream
238 |     );
239 | };
240 | 
241 | // Explicit instantiations
242 | extern template class TitanBlock<float>;
243 | extern template class TitanBlock<half>;
244 | 
245 | } // namespace transformer
246 | } // namespace ltm
247 | 


--------------------------------------------------------------------------------
/src/core/ops/mma_ops.cu:
--------------------------------------------------------------------------------
  1 | #include <cutlass/gemm/device/gemm.h>
  2 | #include <cutlass/epilogue/thread/linear_combination.h>
  3 | #include <cutlass/layout/matrix.h>
  4 | #include "core/ops/mma_ops.cuh"
  5 | #include "core/utils/cuda_utils.cuh"
  6 | 
  7 | namespace ltm {
  8 | namespace ops {
  9 | 
 10 | // CUTLASS GEMM configurations
 11 | using ElementInput = cutlass::half_t;
 12 | using ElementOutput = cutlass::half_t;
 13 | using ElementAccumulator = float;
 14 | using ElementCompute = float;
 15 | 
 16 | using LayoutInputA = cutlass::layout::RowMajor;
 17 | using LayoutInputB = cutlass::layout::ColumnMajor;
 18 | using LayoutOutput = cutlass::layout::RowMajor;
 19 | 
 20 | using MMAOp = cutlass::arch::OpClassTensorOp;
 21 | using SmArch = cutlass::arch::Sm80;
 22 | 
 23 | using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
 24 | using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
 25 | using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
 26 | 
 27 | constexpr int NumStages = 3;
 28 | constexpr bool SplitKSerial = false;
 29 | 
 30 | using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
 31 |     ElementOutput,
 32 |     128 / cutlass::sizeof_bits<ElementOutput>::value,
 33 |     ElementAccumulator,
 34 |     ElementCompute
 35 | >;
 36 | 
 37 | using Gemm = cutlass::gemm::device::Gemm<
 38 |     ElementInput,
 39 |     LayoutInputA,
 40 |     ElementInput,
 41 |     LayoutInputB,
 42 |     ElementOutput,
 43 |     LayoutOutput,
 44 |     ElementAccumulator,
 45 |     MMAOp,
 46 |     SmArch,
 47 |     ThreadblockShape,
 48 |     WarpShape,
 49 |     InstructionShape,
 50 |     EpilogueOp,
 51 |     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
 52 |     NumStages,
 53 |     128 / cutlass::sizeof_bits<ElementInput>::value,
 54 |     128 / cutlass::sizeof_bits<ElementInput>::value,
 55 |     SplitKSerial
 56 | >;
 57 | 
 58 | template<typename T>
 59 | void matmul(
 60 |     const Tensor<T>& A,
 61 |     const Tensor<T>& B,
 62 |     Tensor<T>& C,
 63 |     bool transpose_a,
 64 |     bool transpose_b,
 65 |     float alpha,
 66 |     float beta,
 67 |     cudaStream_t stream
 68 | ) {
 69 |     // Get dimensions
 70 |     int m = A.shape()[0];
 71 |     int k = transpose_a ? A.shape()[0] : A.shape()[1];
 72 |     int n = transpose_b ? B.shape()[0] : B.shape()[1];
 73 |     
 74 |     // Create GEMM configuration
 75 |     typename Gemm::Arguments args(
 76 |         {m, n, k},                                // Problem size
 77 |         {reinterpret_cast<ElementInput*>(const_cast<T*>(A.data())), k},  // A
 78 |         {reinterpret_cast<ElementInput*>(const_cast<T*>(B.data())), n},  // B
 79 |         {reinterpret_cast<ElementOutput*>(C.data()), n},                  // C
 80 |         {reinterpret_cast<ElementOutput*>(C.data()), n},                  // D
 81 |         {alpha, beta}                             // alpha, beta
 82 |     );
 83 |     
 84 |     // Initialize GEMM object
 85 |     Gemm gemm_op;
 86 |     
 87 |     // Launch kernel
 88 |     cutlass::Status status = gemm_op(args, nullptr, stream);
 89 |     if (status != cutlass::Status::kSuccess) {
 90 |         throw std::runtime_error("CUTLASS GEMM failed");
 91 |     }
 92 | }
 93 | 
 94 | // Fused MMA + GELU
 95 | template<typename T>
 96 | void mmaGelu(
 97 |     const Tensor<T>& A,
 98 |     const Tensor<T>& B,
 99 |     Tensor<T>& C,
100 |     bool transpose_a,
101 |     bool transpose_b,
102 |     cudaStream_t stream
103 | ) {
104 |     // Custom epilogue with GELU activation
105 |     using GeluEpilogueOp = cutlass::epilogue::thread::LinearCombinationGELU<
106 |         ElementOutput,
107 |         128 / cutlass::sizeof_bits<ElementOutput>::value,
108 |         ElementAccumulator,
109 |         ElementCompute
110 |     >;
111 |     
112 |     using GemmGelu = cutlass::gemm::device::Gemm<
113 |         ElementInput,
114 |         LayoutInputA,
115 |         ElementInput,
116 |         LayoutInputB,
117 |         ElementOutput,
118 |         LayoutOutput,
119 |         ElementAccumulator,
120 |         MMAOp,
121 |         SmArch,
122 |         ThreadblockShape,
123 |         WarpShape,
124 |         InstructionShape,
125 |         GeluEpilogueOp,
126 |         cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
127 |         NumStages,
128 |         128 / cutlass::sizeof_bits<ElementInput>::value,
129 |         128 / cutlass::sizeof_bits<ElementInput>::value,
130 |         SplitKSerial
131 |     >;
132 |     
133 |     // Get dimensions
134 |     int m = A.shape()[0];
135 |     int k = transpose_a ? A.shape()[0] : A.shape()[1];
136 |     int n = transpose_b ? B.shape()[0] : B.shape()[1];
137 |     
138 |     // Create GEMM configuration
139 |     typename GemmGelu::Arguments args(
140 |         {m, n, k},
141 |         {reinterpret_cast<ElementInput*>(const_cast<T*>(A.data())), k},
142 |         {reinterpret_cast<ElementInput*>(const_cast<T*>(B.data())), n},
143 |         {reinterpret_cast<ElementOutput*>(C.data()), n},
144 |         {reinterpret_cast<ElementOutput*>(C.data()), n},
145 |         {1.0f, 0.0f}
146 |     );
147 |     
148 |     // Initialize GEMM object
149 |     GemmGelu gemm_op;
150 |     
151 |     // Launch kernel
152 |     cutlass::Status status = gemm_op(args, nullptr, stream);
153 |     if (status != cutlass::Status::kSuccess) {
154 |         throw std::runtime_error("CUTLASS GEMM+GELU failed");
155 |     }
156 | }
157 | 
158 | // Fused MMA + Dropout
159 | template<typename T>
160 | void mmaDropout(
161 |     const Tensor<T>& A,
162 |     const Tensor<T>& B,
163 |     Tensor<T>& C,
164 |     float dropout_prob,
165 |     unsigned long long seed,
166 |     bool transpose_a,
167 |     bool transpose_b,
168 |     cudaStream_t stream
169 | ) {
170 |     // Custom epilogue with dropout
171 |     using DropoutEpilogueOp = cutlass::epilogue::thread::LinearCombinationDropout<
172 |         ElementOutput,
173 |         128 / cutlass::sizeof_bits<ElementOutput>::value,
174 |         ElementAccumulator,
175 |         ElementCompute
176 |     >;
177 |     
178 |     using GemmDropout = cutlass::gemm::device::Gemm<
179 |         ElementInput,
180 |         LayoutInputA,
181 |         ElementInput,
182 |         LayoutInputB,
183 |         ElementOutput,
184 |         LayoutOutput,
185 |         ElementAccumulator,
186 |         MMAOp,
187 |         SmArch,
188 |         ThreadblockShape,
189 |         WarpShape,
190 |         InstructionShape,
191 |         DropoutEpilogueOp,
192 |         cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
193 |         NumStages,
194 |         128 / cutlass::sizeof_bits<ElementInput>::value,
195 |         128 / cutlass::sizeof_bits<ElementInput>::value,
196 |         SplitKSerial
197 |     >;
198 |     
199 |     // Get dimensions
200 |     int m = A.shape()[0];
201 |     int k = transpose_a ? A.shape()[0] : A.shape()[1];
202 |     int n = transpose_b ? B.shape()[0] : B.shape()[1];
203 |     
204 |     // Create GEMM configuration
205 |     typename GemmDropout::Arguments args(
206 |         {m, n, k},
207 |         {reinterpret_cast<ElementInput*>(const_cast<T*>(A.data())), k},
208 |         {reinterpret_cast<ElementInput*>(const_cast<T*>(B.data())), n},
209 |         {reinterpret_cast<ElementOutput*>(C.data()), n},
210 |         {reinterpret_cast<ElementOutput*>(C.data()), n},
211 |         {1.0f, 0.0f},
212 |         dropout_prob,
213 |         seed
214 |     );
215 |     
216 |     // Initialize GEMM object
217 |     GemmDropout gemm_op;
218 |     
219 |     // Launch kernel
220 |     cutlass::Status status = gemm_op(args, nullptr, stream);
221 |     if (status != cutlass::Status::kSuccess) {
222 |         throw std::runtime_error("CUTLASS GEMM+Dropout failed");
223 |     }
224 | }
225 | 
226 | // Explicit instantiations
227 | template void matmul<float>(
228 |     const Tensor<float>&, const Tensor<float>&, Tensor<float>&,
229 |     bool, bool, float, float, cudaStream_t
230 | );
231 | template void matmul<half>(
232 |     const Tensor<half>&, const Tensor<half>&, Tensor<half>&,
233 |     bool, bool, float, float, cudaStream_t
234 | );
235 | 
236 | template void mmaGelu<float>(
237 |     const Tensor<float>&, const Tensor<float>&, Tensor<float>&,
238 |     bool, bool, cudaStream_t
239 | );
240 | template void mmaGelu<half>(
241 |     const Tensor<half>&, const Tensor<half>&, Tensor<half>&,
242 |     bool, bool, cudaStream_t
243 | );
244 | 
245 | template void mmaDropout<float>(
246 |     const Tensor<float>&, const Tensor<float>&, Tensor<float>&,
247 |     float, unsigned long long, bool, bool, cudaStream_t
248 | );
249 | template void mmaDropout<half>(
250 |     const Tensor<half>&, const Tensor<half>&, Tensor<half>&,
251 |     float, unsigned long long, bool, bool, cudaStream_t
252 | );
253 | 
254 | } // namespace ops
255 | } // namespace ltm
256 | 


--------------------------------------------------------------------------------
/src/core/quantization/calibrator.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
  2 | #include <thrust/device_ptr.h>
  3 | #include <thrust/sort.h>
  4 | #include <thrust/reduce.h>
  5 | #include <thrust/execution_policy.h>
  6 | #include "core/quantization/quantizer.cuh"
  7 | #include "core/utils/cuda_utils.cuh"
  8 | 
  9 | namespace ltm {
 10 | namespace quantization {
 11 | 
 12 | namespace {
 13 | 
 14 | // Kernel for collecting statistics
 15 | template<typename T>
 16 | __global__ void collectStatsKernel(
 17 |     const T* __restrict__ input,
 18 |     float* __restrict__ output,
 19 |     int size
 20 | ) {
 21 |     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 22 |     if (idx < size) {
 23 |         output[idx] = static_cast<float>(input[idx]);
 24 |     }
 25 | }
 26 | 
 27 | // Kernel for computing MSE
 28 | __global__ void computeMSEKernel(
 29 |     const float* __restrict__ original,
 30 |     const float* __restrict__ quantized,
 31 |     float* __restrict__ mse,
 32 |     int size
 33 | ) {
 34 |     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 35 |     if (idx < size) {
 36 |         float diff = original[idx] - quantized[idx];
 37 |         mse[idx] = diff * diff;
 38 |     }
 39 | }
 40 | 
 41 | // Kernel for entropy calculation
 42 | __global__ void computeHistogramKernel(
 43 |     const float* __restrict__ input,
 44 |     int* __restrict__ histogram,
 45 |     float min_val,
 46 |     float max_val,
 47 |     int num_bins,
 48 |     int size
 49 | ) {
 50 |     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 51 |     if (idx < size) {
 52 |         float val = input[idx];
 53 |         int bin = static_cast<int>((val - min_val) / (max_val - min_val) * num_bins);
 54 |         bin = max(0, min(bin, num_bins - 1));
 55 |         atomicAdd(&histogram[bin], 1);
 56 |     }
 57 | }
 58 | 
 59 | } // anonymous namespace
 60 | 
 61 | void MinMaxCalibrator::collectStats(const void* data, size_t size) {
 62 |     // Allocate device memory for stats
 63 |     float* d_data;
 64 |     CUDA_CHECK(cudaMalloc(&d_data, size * sizeof(float)));
 65 |     
 66 |     // Copy and convert data to float
 67 |     const int block_size = 256;
 68 |     const int num_blocks = (size + block_size - 1) / block_size;
 69 |     
 70 |     collectStatsKernel<float><<<num_blocks, block_size>>>(
 71 |         static_cast<const float*>(data),
 72 |         d_data,
 73 |         size
 74 |     );
 75 |     
 76 |     // Find min/max using Thrust
 77 |     thrust::device_ptr<float> d_ptr(d_data);
 78 |     auto minmax = thrust::minmax_element(thrust::device, d_ptr, d_ptr + size);
 79 |     
 80 |     float local_min = *minmax.first;
 81 |     float local_max = *minmax.second;
 82 |     
 83 |     min_val_ = std::min(min_val_, local_min);
 84 |     max_val_ = std::max(max_val_, local_max);
 85 |     
 86 |     CUDA_CHECK(cudaFree(d_data));
 87 | }
 88 | 
 89 | void MinMaxCalibrator::computeRanges(float& min_val, float& max_val) {
 90 |     min_val = min_val_;
 91 |     max_val = max_val_;
 92 | }
 93 | 
 94 | void MinMaxCalibrator::reset() {
 95 |     min_val_ = FLT_MAX;
 96 |     max_val_ = -FLT_MAX;
 97 | }
 98 | 
 99 | void PercentileCalibrator::collectStats(const void* data, size_t size) {
100 |     // Allocate device memory
101 |     float* d_data;
102 |     CUDA_CHECK(cudaMalloc(&d_data, size * sizeof(float)));
103 |     
104 |     // Copy and convert data
105 |     const int block_size = 256;
106 |     const int num_blocks = (size + block_size - 1) / block_size;
107 |     
108 |     collectStatsKernel<float><<<num_blocks, block_size>>>(
109 |         static_cast<const float*>(data),
110 |         d_data,
111 |         size
112 |     );
113 |     
114 |     // Copy to host for percentile computation
115 |     std::vector<float> h_data(size);
116 |     CUDA_CHECK(cudaMemcpy(
117 |         h_data.data(),
118 |         d_data,
119 |         size * sizeof(float),
120 |         cudaMemcpyDeviceToHost
121 |     ));
122 |     
123 |     // Store values for later percentile computation
124 |     values_.insert(values_.end(), h_data.begin(), h_data.end());
125 |     
126 |     CUDA_CHECK(cudaFree(d_data));
127 | }
128 | 
129 | void PercentileCalibrator::computeRanges(float& min_val, float& max_val) {
130 |     if (values_.empty()) {
131 |         min_val = 0.0f;
132 |         max_val = 0.0f;
133 |         return;
134 |     }
135 |     
136 |     // Sort values
137 |     std::sort(values_.begin(), values_.end());
138 |     
139 |     // Compute percentile indices
140 |     size_t lower_idx = static_cast<size_t>((100.0f - percentile_) * values_.size() / 100.0f);
141 |     size_t upper_idx = static_cast<size_t>(percentile_ * values_.size() / 100.0f);
142 |     
143 |     // Get percentile values
144 |     min_val = values_[lower_idx];
145 |     max_val = values_[upper_idx];
146 | }
147 | 
148 | void PercentileCalibrator::reset() {
149 |     values_.clear();
150 | }
151 | 
152 | void MSECalibrator::collectStats(const void* data, size_t size) {
153 |     // Store data for MSE computation
154 |     const float* float_data = static_cast<const float*>(data);
155 |     values_.insert(values_.end(), float_data, float_data + size);
156 | }
157 | 
158 | void MSECalibrator::computeRanges(float& min_val, float& max_val) {
159 |     if (values_.empty()) {
160 |         min_val = optimal_min_;
161 |         max_val = optimal_max_;
162 |         return;
163 |     }
164 |     
165 |     // Allocate device memory
166 |     float* d_original;
167 |     float* d_quantized;
168 |     float* d_mse;
169 |     CUDA_CHECK(cudaMalloc(&d_original, values_.size() * sizeof(float)));
170 |     CUDA_CHECK(cudaMalloc(&d_quantized, values_.size() * sizeof(float)));
171 |     CUDA_CHECK(cudaMalloc(&d_mse, values_.size() * sizeof(float)));
172 |     
173 |     // Copy original data to device
174 |     CUDA_CHECK(cudaMemcpy(
175 |         d_original,
176 |         values_.data(),
177 |         values_.size() * sizeof(float),
178 |         cudaMemcpyHostToDevice
179 |     ));
180 |     
181 |     // Grid/block configuration
182 |     const int block_size = 256;
183 |     const int num_blocks = (values_.size() + block_size - 1) / block_size;
184 |     
185 |     // Search for optimal range
186 |     float best_mse = FLT_MAX;
187 |     const int num_trials = 100;
188 |     
189 |     for (int i = 0; i < num_trials; ++i) {
190 |         // Try different ranges
191 |         float trial_min = thrust::reduce(
192 |             thrust::device,
193 |             thrust::device_pointer_cast(d_original),
194 |             thrust::device_pointer_cast(d_original + values_.size()),
195 |             FLT_MAX,
196 |             thrust::minimum<float>()
197 |         );
198 |         
199 |         float trial_max = thrust::reduce(
200 |             thrust::device,
201 |             thrust::device_pointer_cast(d_original),
202 |             thrust::device_pointer_cast(d_original + values_.size()),
203 |             -FLT_MAX,
204 |             thrust::maximum<float>()
205 |         );
206 |         
207 |         // Simulate quantization
208 |         float scale = (trial_max - trial_min) / 255.0f;
209 |         
210 |         // Quantize and dequantize
211 |         linearQuantizeKernel<float><<<num_blocks, block_size>>>(
212 |             d_original,
213 |             reinterpret_cast<int8_t*>(d_quantized),
214 |             scale,
215 |             -trial_min / scale + 128.0f,
216 |             values_.size()
217 |         );
218 |         
219 |         linearDequantizeKernel<float><<<num_blocks, block_size>>>(
220 |             reinterpret_cast<int8_t*>(d_quantized),
221 |             d_quantized,
222 |             scale,
223 |             -trial_min / scale + 128.0f,
224 |             values_.size()
225 |         );
226 |         
227 |         // Compute MSE
228 |         computeMSEKernel<<<num_blocks, block_size>>>(
229 |             d_original,
230 |             d_quantized,
231 |             d_mse,
232 |             values_.size()
233 |         );
234 |         
235 |         float total_mse = thrust::reduce(
236 |             thrust::device,
237 |             thrust::device_pointer_cast(d_mse),
238 |             thrust::device_pointer_cast(d_mse + values_.size()),
239 |             0.0f,
240 |             thrust::plus<float>()
241 |         );
242 |         
243 |         // Update best range
244 |         if (total_mse < best_mse) {
245 |             best_mse = total_mse;
246 |             optimal_min_ = trial_min;
247 |             optimal_max_ = trial_max;
248 |         }
249 |     }
250 |     
251 |     // Clean up
252 |     CUDA_CHECK(cudaFree(d_original));
253 |     CUDA_CHECK(cudaFree(d_quantized));
254 |     CUDA_CHECK(cudaFree(d_mse));
255 |     
256 |     min_val = optimal_min_;
257 |     max_val = optimal_max_;
258 | }
259 | 
260 | void MSECalibrator::reset() {
261 |     values_.clear();
262 |     optimal_min_ = 0.0f;
263 |     optimal_max_ = 0.0f;
264 | }
265 | 
266 | } // namespace quantization
267 | } // namespace ltm
268 | 


--------------------------------------------------------------------------------
/docs/design/architecture.md:
--------------------------------------------------------------------------------
  1 | # LTM Transformer Architecture
  2 | 
  3 | This document describes the technical architecture of the LTM Transformer, focusing on its key components and their interactions.
  4 | 
  5 | ## Overview
  6 | 
  7 | The LTM Transformer extends the standard Transformer architecture with a novel long-term memory mechanism inspired by Google's Titan. The system is designed to efficiently handle extended context windows while maintaining reasonable memory usage through compression and efficient attention mechanisms.
  8 | 
  9 | ## System Components
 10 | 
 11 | ```mermaid
 12 | graph TD
 13 |     A[Input Tokens] --> B[Embeddings]
 14 |     B --> C[Transformer Block]
 15 |     C --> D[Memory Bank]
 16 |     D --> C
 17 |     C --> E[Output]
 18 |     
 19 |     subgraph "Transformer Block"
 20 |         F[Flash Attention] --> G[Memory Attention]
 21 |         G --> H[Feed Forward]
 22 |     end
 23 | ```
 24 | 
 25 | ### Core Components
 26 | 
 27 | 1. **Memory Bank**
 28 |    - Stores compressed representations of past context
 29 |    - Dimensions: `[memory_slots × memory_dim]`
 30 |    - Updated through a gating mechanism
 31 |    - Persists across inference steps
 32 | 
 33 | 2. **Compression Gate**
 34 |    - Compresses hidden states into memory representations
 35 |    - Uses learned parameters to determine importance
 36 |    - Implements selective update mechanism
 37 | 
 38 | 3. **Memory Attention**
 39 |    - Efficient cross-attention between current context and memory
 40 |    - Optimized for sparse access patterns
 41 |    - Integrated with FlashAttention
 42 | 
 43 | 4. **Flash Attention**
 44 |    - Memory-efficient attention implementation
 45 |    - Tiled matrix multiplication
 46 |    - Fused softmax operations
 47 | 
 48 | ## Memory Management
 49 | 
 50 | ### Memory Bank Design
 51 | 
 52 | ```
 53 | Memory Bank Structure:
 54 | [M × D] matrix where:
 55 | - M: Number of memory slots (default: 512)
 56 | - D: Memory dimension (default: 64)
 57 | ```
 58 | 
 59 | Memory slots are updated using a gating mechanism:
 60 | 
 61 | ```python
 62 | # Pseudo-code for memory update
 63 | def update_memory(memory_bank, new_content, gate_values):
 64 |     # gate_values: [0,1] determining update importance
 65 |     memory_bank = (1 - gate_values) * memory_bank + gate_values * new_content
 66 | ```
 67 | 
 68 | ### Compression Mechanism
 69 | 
 70 | The compression gate uses a learned transformation:
 71 | 
 72 | ```python
 73 | # Pseudo-code for compression
 74 | def compress_state(hidden_state):
 75 |     # Project to lower dimension
 76 |     compressed = self.compress_proj(hidden_state)
 77 |     
 78 |     # Generate importance scores
 79 |     scores = self.importance_scorer(compressed)
 80 |     
 81 |     # Apply gating
 82 |     gated = compressed * torch.sigmoid(scores)
 83 |     
 84 |     return gated
 85 | ```
 86 | 
 87 | ## Attention Mechanisms
 88 | 
 89 | ### Flash Attention
 90 | 
 91 | Implements memory-efficient attention:
 92 | 
 93 | 1. Block-wise computation to fit in fast memory
 94 | 2. Fused operations to reduce memory access
 95 | 3. Recomputation of attention if needed
 96 | 
 97 | ```python
 98 | # Pseudo-code for flash attention
 99 | def flash_attention(Q, K, V):
100 |     # Split into blocks that fit in SRAM
101 |     for q_block in Q.blocks():
102 |         for k_block, v_block in zip(K.blocks(), V.blocks()):
103 |             # Compute attention for current block
104 |             scores = scaled_dot_product(q_block, k_block)
105 |             output = scores @ v_block
106 |             
107 |             # Accumulate results
108 |             update_output(output)
109 | ```
110 | 
111 | ### Memory Attention
112 | 
113 | Efficient attention between current context and memory bank:
114 | 
115 | ```python
116 | # Pseudo-code for memory attention
117 | def memory_attention(query, memory_bank):
118 |     # Project query to memory space
119 |     query_proj = self.query_proj(query)
120 |     
121 |     # Compute attention scores
122 |     scores = torch.matmul(query_proj, memory_bank.transpose(-2, -1))
123 |     
124 |     # Apply attention
125 |     attended_memory = torch.matmul(scores, memory_bank)
126 |     
127 |     return attended_memory
128 | ```
129 | 
130 | ## Parallel Processing
131 | 
132 | ### Tensor Parallelism
133 | 
134 | Splits attention heads and feed-forward layers across GPUs:
135 | 
136 | ```mermaid
137 | graph LR
138 |     A[Input] --> B[GPU 1: Heads 1-4]
139 |     A --> C[GPU 2: Heads 5-8]
140 |     A --> D[GPU 3: Heads 9-12]
141 |     B --> E[All-Reduce]
142 |     C --> E
143 |     D --> E
144 |     E --> F[Output]
145 | ```
146 | 
147 | ### Pipeline Parallelism
148 | 
149 | Splits transformer layers across GPUs:
150 | 
151 | ```mermaid
152 | graph LR
153 |     A[Input] --> B[GPU 1: Layers 1-8]
154 |     B --> C[GPU 2: Layers 9-16]
155 |     C --> D[GPU 3: Layers 17-24]
156 |     D --> E[Output]
157 | ```
158 | 
159 | ## Optimization Techniques
160 | 
161 | ### Memory Optimizations
162 | 
163 | 1. **Gradient Checkpointing**
164 |    - Trades computation for memory
165 |    - Selectively recomputes activations
166 |    - Configurable granularity
167 | 
168 | 2. **Mixed Precision Training**
169 |    - FP16/BF16 for most operations
170 |    - FP32 for critical computations
171 |    - Dynamic loss scaling
172 | 
173 | 3. **Kernel Fusion**
174 |    - Combines multiple operations
175 |    - Reduces memory bandwidth usage
176 |    - Custom CUDA implementations
177 | 
178 | ### Performance Optimizations
179 | 
180 | 1. **Fused CUDA Kernels**
181 |    ```cpp
182 |    // Example of fused LayerNorm + Dropout + ReLU
183 |    template <typename T>
184 |    __global__ void fused_layernorm_dropout_relu(
185 |        T* output,
186 |        const T* input,
187 |        const T* gamma,
188 |        const T* beta,
189 |        float dropout_prob,
190 |        int n) {
191 |        // Kernel implementation
192 |    }
193 |    ```
194 | 
195 | 2. **Memory Access Patterns**
196 |    - Coalesced memory access
197 |    - Shared memory usage
198 |    - Register-level optimizations
199 | 
200 | 3. **Workload Balancing**
201 |    - Dynamic batch sizing
202 |    - Adaptive sequence lengths
203 |    - Load-based scheduling
204 | 
205 | ## Configuration
206 | 
207 | Example configuration:
208 | 
209 | ```yaml
210 | model:
211 |   hidden_size: 768
212 |   num_attention_heads: 12
213 |   num_hidden_layers: 12
214 |   memory_slots: 512
215 |   memory_dim: 64
216 |   
217 | optimization:
218 |   use_flash_attention: true
219 |   use_memory_compression: true
220 |   memory_compression_ratio: 0.5
221 |   
222 | training:
223 |   mixed_precision: true
224 |   gradient_checkpointing: true
225 |   tensor_parallel_size: 4
226 |   pipeline_parallel_size: 2
227 | ```
228 | 
229 | ## Integration Points
230 | 
231 | ### Python Bindings
232 | 
233 | ```python
234 | # High-level Python interface
235 | class TitanModel:
236 |     def __init__(self, config):
237 |         self.core = TitanModelImpl(config)  # C++ implementation
238 |         self.setup_memory_bank()
239 |     
240 |     def forward(self, input_ids):
241 |         return self.core.forward(input_ids)
242 | ```
243 | 
244 | ### C++ Core
245 | 
246 | ```cpp
247 | // Low-level C++ implementation
248 | class TitanModelImpl {
249 |     void forward(torch::Tensor input) {
250 |         // Implementation
251 |     }
252 |     
253 |     void update_memory(torch::Tensor new_content) {
254 |         // Implementation
255 |     }
256 | };
257 | ```
258 | 
259 | ## Performance Characteristics
260 | 
261 | ### Memory Usage
262 | 
263 | | Operation              | Memory Complexity    | Optimization        |
264 | |-----------------------|---------------------|-------------------|
265 | | Standard Attention    | O(n²)               | -                 |
266 | | Flash Attention      | O(n)                | Block-wise compute|
267 | | Memory Bank          | O(m)                | m << n            |
268 | | Memory Attention     | O(n×m)              | m << n            |
269 | 
270 | ### Computational Complexity
271 | 
272 | | Operation              | Time Complexity      | Notes             |
273 | |-----------------------|---------------------|-------------------|
274 | | Standard Attention    | O(n²d)              | n: seq length     |
275 | | Flash Attention      | O(n²d/B)            | B: block size     |
276 | | Memory Compression   | O(nd)               | d: hidden dim     |
277 | | Memory Attention     | O(nmd)              | m: memory slots   |
278 | 
279 | ## Future Improvements
280 | 
281 | 1. **Dynamic Memory Allocation**
282 |    - Adaptive memory slot sizing
283 |    - Content-based memory management
284 |    - Memory pruning strategies
285 | 
286 | 2. **Advanced Parallelism**
287 |    - Sequence parallelism
288 |    - Expert parallelism
289 |    - Hybrid parallelism strategies
290 | 
291 | 3. **Optimization Opportunities**
292 |    - Sparse attention patterns
293 |    - Adaptive precision training
294 |    - Hardware-specific optimizations
295 | 


--------------------------------------------------------------------------------
/python_bindings/src/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <pybind11/pybind11.h>
  2 | #include <pybind11/stl.h>
  3 | #include <pybind11/numpy.h>
  4 | #include <torch/extension.h>
  5 | 
  6 | #include "core/transformer/titan_inspired_block.cuh"
  7 | #include "core/attention/flash_attention.cuh"
  8 | #include "core/attention/memory_attention.cuh"
  9 | #include "core/ltm/memory_bank.cuh"
 10 | #include "core/ltm/compression_gate.cuh"
 11 | 
 12 | namespace py = pybind11;
 13 | 
 14 | // Helper functions for tensor conversion
 15 | torch::Tensor numpy_to_torch(py::array_t<float> array) {
 16 |     py::buffer_info buf = array.request();
 17 |     auto tensor = torch::from_blob(
 18 |         buf.ptr,
 19 |         {buf.shape.begin(), buf.shape.end()},
 20 |         torch::TensorOptions().dtype(torch::kFloat32)
 21 |     );
 22 |     return tensor.clone();
 23 | }
 24 | 
 25 | py::array_t<float> torch_to_numpy(const torch::Tensor& tensor) {
 26 |     tensor = tensor.contiguous().cpu();
 27 |     return py::array_t<float>(
 28 |         tensor.sizes().vec(),
 29 |         tensor.data_ptr<float>()
 30 |     );
 31 | }
 32 | 
 33 | // Wrapper classes for C++ implementations
 34 | class TitanModelImpl {
 35 | public:
 36 |     TitanModelImpl(const ltm::transformer::TitanBlockConfig& config)
 37 |         : model_(config) {}
 38 | 
 39 |     py::dict forward(
 40 |         torch::Tensor input,
 41 |         torch::optional<torch::Tensor> attention_mask = torch::nullopt,
 42 |         torch::optional<std::vector<torch::Tensor>> past_key_values = torch::nullopt,
 43 |         bool use_cache = false,
 44 |         bool output_attentions = false,
 45 |         bool output_hidden_states = false
 46 |     ) {
 47 |         auto outputs = model_.forward(
 48 |             input,
 49 |             attention_mask,
 50 |             past_key_values,
 51 |             use_cache,
 52 |             output_attentions,
 53 |             output_hidden_states
 54 |         );
 55 | 
 56 |         py::dict result;
 57 |         result["hidden_states"] = outputs.hidden_states;
 58 |         if (use_cache) result["past_key_values"] = outputs.past_key_values;
 59 |         if (output_attentions) result["attentions"] = outputs.attentions;
 60 |         if (output_hidden_states) result["all_hidden_states"] = outputs.all_hidden_states;
 61 |         return result;
 62 |     }
 63 | 
 64 | private:
 65 |     ltm::transformer::TitanBlock<float> model_;
 66 | };
 67 | 
 68 | class BatchProcessorImpl {
 69 | public:
 70 |     BatchProcessorImpl(const py::dict& config) {
 71 |         // Initialize from Python config
 72 |     }
 73 | 
 74 |     int add_request(
 75 |         torch::Tensor input_ids,
 76 |         torch::optional<torch::Tensor> attention_mask,
 77 |         int max_new_tokens,
 78 |         py::dict kwargs
 79 |     ) {
 80 |         // Add request to batch
 81 |         return 0; // Return request ID
 82 |     }
 83 | 
 84 |     std::vector<py::dict> process_batch() {
 85 |         // Process batch and return results
 86 |         return {};
 87 |     }
 88 | 
 89 |     bool is_batch_ready() const {
 90 |         return false;
 91 |     }
 92 | 
 93 |     void clear() {
 94 |         // Clear batch
 95 |     }
 96 | 
 97 | private:
 98 |     // Implementation details
 99 | };
100 | 
101 | class CacheManagerImpl {
102 | public:
103 |     CacheManagerImpl(const py::dict& config) {
104 |         // Initialize from Python config
105 |     }
106 | 
107 |     void allocate(int batch_size, int seq_length) {
108 |         // Allocate cache
109 |     }
110 | 
111 |     void update(
112 |         torch::Tensor key,
113 |         torch::Tensor value,
114 |         torch::optional<torch::Tensor> memory
115 |     ) {
116 |         // Update cache
117 |     }
118 | 
119 |     std::tuple<torch::Tensor, torch::Tensor, torch::optional<torch::Tensor>>
120 |     get(int index) {
121 |         // Get cached states
122 |         return std::make_tuple(
123 |             torch::empty({}),
124 |             torch::empty({}),
125 |             torch::nullopt
126 |         );
127 |     }
128 | 
129 |     void clear() {
130 |         // Clear cache
131 |     }
132 | 
133 | private:
134 |     // Implementation details
135 | };
136 | 
137 | class QuantizerImpl {
138 | public:
139 |     QuantizerImpl(const py::dict& config) {
140 |         // Initialize from Python config
141 |     }
142 | 
143 |     py::object quantize(py::object model) {
144 |         // Quantize model
145 |         return model;
146 |     }
147 | 
148 |     py::dict forward(
149 |         py::object model,
150 |         torch::Tensor input_ids,
151 |         torch::optional<torch::Tensor> attention_mask,
152 |         py::dict kwargs
153 |     ) {
154 |         // Forward pass with quantized model
155 |         return py::dict();
156 |     }
157 | 
158 | private:
159 |     // Implementation details
160 | };
161 | 
162 | class InferenceEngineImpl {
163 | public:
164 |     InferenceEngineImpl(
165 |         py::object model,
166 |         const py::dict& config,
167 |         py::object batch_processor,
168 |         py::object cache_manager
169 |     ) {
170 |         // Initialize from Python objects
171 |     }
172 | 
173 |     py::object generate(
174 |         torch::Tensor input_ids,
175 |         torch::optional<torch::Tensor> attention_mask,
176 |         py::dict gen_config,
177 |         bool return_dict,
178 |         bool output_scores,
179 |         bool output_attentions
180 |     ) {
181 |         // Generate text
182 |         return py::none();
183 |     }
184 | 
185 |     py::object stream_generate(
186 |         torch::Tensor input_ids,
187 |         py::dict kwargs
188 |     ) {
189 |         // Stream generation
190 |         return py::none();
191 |     }
192 | 
193 |     torch::Tensor encode(
194 |         torch::Tensor input_ids,
195 |         torch::optional<torch::Tensor> attention_mask,
196 |         py::dict kwargs
197 |     ) {
198 |         // Encode inputs
199 |         return torch::empty({});
200 |     }
201 | 
202 | private:
203 |     // Implementation details
204 | };
205 | 
206 | PYBIND11_MODULE(_ltm, m) {
207 |     // Module docstring
208 |     m.doc() = "C++ implementations for LTM Transformer";
209 | 
210 |     // Register TitanBlockConfig
211 |     py::class_<ltm::transformer::TitanBlockConfig>(m, "TitanBlockConfig")
212 |         .def(py::init<>())
213 |         .def_readwrite("hidden_dim", &ltm::transformer::TitanBlockConfig::hidden_dim)
214 |         .def_readwrite("ffn_dim", &ltm::transformer::TitanBlockConfig::ffn_dim)
215 |         .def_readwrite("num_heads", &ltm::transformer::TitanBlockConfig::num_heads)
216 |         .def_readwrite("head_dim", &ltm::transformer::TitanBlockConfig::head_dim)
217 |         .def_readwrite("memory_slots", &ltm::transformer::TitanBlockConfig::memory_slots)
218 |         .def_readwrite("memory_dim", &ltm::transformer::TitanBlockConfig::memory_dim)
219 |         .def_readwrite("memory_update_rate", &ltm::transformer::TitanBlockConfig::memory_update_rate)
220 |         .def_readwrite("use_memory_compression", &ltm::transformer::TitanBlockConfig::use_memory_compression)
221 |         .def_readwrite("memory_compression_ratio", &ltm::transformer::TitanBlockConfig::memory_compression_ratio)
222 |         .def_readwrite("use_flash_attention", &ltm::transformer::TitanBlockConfig::use_flash_attention)
223 |         .def_readwrite("use_alibi", &ltm::transformer::TitanBlockConfig::use_alibi)
224 |         .def_readwrite("use_rotary", &ltm::transformer::TitanBlockConfig::use_rotary)
225 |         .def_readwrite("dropout_prob", &ltm::transformer::TitanBlockConfig::dropout_prob)
226 |         .def_readwrite("use_bias", &ltm::transformer::TitanBlockConfig::use_bias)
227 |         .def_readwrite("use_layer_norm", &ltm::transformer::TitanBlockConfig::use_layer_norm)
228 |         .def_readwrite("fuse_operations", &ltm::transformer::TitanBlockConfig::fuse_operations);
229 | 
230 |     // Register TitanModelImpl
231 |     py::class_<TitanModelImpl>(m, "TitanModelImpl")
232 |         .def(py::init<const ltm::transformer::TitanBlockConfig&>())
233 |         .def("forward", &TitanModelImpl::forward,
234 |             py::arg("input"),
235 |             py::arg("attention_mask") = nullptr,
236 |             py::arg("past_key_values") = nullptr,
237 |             py::arg("use_cache") = false,
238 |             py::arg("output_attentions") = false,
239 |             py::arg("output_hidden_states") = false);
240 | 
241 |     // Register BatchProcessorImpl
242 |     py::class_<BatchProcessorImpl>(m, "BatchProcessorImpl")
243 |         .def(py::init<const py::dict&>())
244 |         .def("add_request", &BatchProcessorImpl::add_request)
245 |         .def("process_batch", &BatchProcessorImpl::process_batch)
246 |         .def("is_batch_ready", &BatchProcessorImpl::is_batch_ready)
247 |         .def("clear", &BatchProcessorImpl::clear);
248 | 
249 |     // Register CacheManagerImpl
250 |     py::class_<CacheManagerImpl>(m, "CacheManagerImpl")
251 |         .def(py::init<const py::dict&>())
252 |         .def("allocate", &CacheManagerImpl::allocate)
253 |         .def("update", &CacheManagerImpl::update)
254 |         .def("get", &CacheManagerImpl::get)
255 |         .def("clear", &CacheManagerImpl::clear);
256 | 
257 |     // Register QuantizerImpl
258 |     py::class_<QuantizerImpl>(m, "QuantizerImpl")
259 |         .def(py::init<const py::dict&>())
260 |         .def("quantize", &QuantizerImpl::quantize)
261 |         .def("forward", &QuantizerImpl::forward);
262 | 
263 |     // Register InferenceEngineImpl
264 |     py::class_<InferenceEngineImpl>(m, "InferenceEngineImpl")
265 |         .def(py::init<py::object, const py::dict&, py::object, py::object>())
266 |         .def("generate", &InferenceEngineImpl::generate)
267 |         .def("stream_generate", &InferenceEngineImpl::stream_generate)
268 |         .def("encode", &InferenceEngineImpl::encode);
269 | }
270 | 


--------------------------------------------------------------------------------
/src/core/parallel/pipeline.cpp:
--------------------------------------------------------------------------------
  1 | #include <vector>
  2 | #include <queue>
  3 | #include <thread>
  4 | #include <mutex>
  5 | #include <condition_variable>
  6 | #include <cuda_runtime.h>
  7 | #include "core/parallel/pipeline.hpp"
  8 | #include "core/utils/cuda_utils.cuh"
  9 | 
 10 | namespace ltm {
 11 | namespace parallel {
 12 | 
 13 | class PipelineStage {
 14 | public:
 15 |     PipelineStage(int device_id, int stage_id, size_t buffer_size)
 16 |         : device_id_(device_id), stage_id_(stage_id), buffer_size_(buffer_size) {
 17 |         CUDA_CHECK(cudaSetDevice(device_id_));
 18 |         CUDA_CHECK(cudaStreamCreate(&compute_stream_));
 19 |         CUDA_CHECK(cudaStreamCreate(&comm_stream_));
 20 |         
 21 |         // Allocate input/output buffers
 22 |         input_buffers_.resize(buffer_size_);
 23 |         output_buffers_.resize(buffer_size_);
 24 |         for (size_t i = 0; i < buffer_size_; ++i) {
 25 |             CUDA_CHECK(cudaMalloc(&input_buffers_[i], buffer_size_));
 26 |             CUDA_CHECK(cudaMalloc(&output_buffers_[i], buffer_size_));
 27 |         }
 28 |     }
 29 | 
 30 |     ~PipelineStage() {
 31 |         CUDA_CHECK(cudaSetDevice(device_id_));
 32 |         
 33 |         // Free buffers
 34 |         for (auto& buffer : input_buffers_) {
 35 |             CUDA_CHECK(cudaFree(buffer));
 36 |         }
 37 |         for (auto& buffer : output_buffers_) {
 38 |             CUDA_CHECK(cudaFree(buffer));
 39 |         }
 40 |         
 41 |         CUDA_CHECK(cudaStreamDestroy(compute_stream_));
 42 |         CUDA_CHECK(cudaStreamDestroy(comm_stream_));
 43 |     }
 44 | 
 45 |     void forward(const void* input, void* output, size_t size) {
 46 |         CUDA_CHECK(cudaSetDevice(device_id_));
 47 |         
 48 |         // Copy input to next available buffer
 49 |         int buffer_idx = next_buffer_index_++;
 50 |         if (next_buffer_index_ >= buffer_size_) {
 51 |             next_buffer_index_ = 0;
 52 |         }
 53 |         
 54 |         CUDA_CHECK(cudaMemcpyAsync(
 55 |             input_buffers_[buffer_idx],
 56 |             input,
 57 |             size,
 58 |             cudaMemcpyDeviceToDevice,
 59 |             comm_stream_
 60 |         ));
 61 |         
 62 |         // Wait for copy to complete
 63 |         CUDA_CHECK(cudaStreamSynchronize(comm_stream_));
 64 |         
 65 |         // Process data
 66 |         processBuffer(buffer_idx);
 67 |         
 68 |         // Copy result to output
 69 |         CUDA_CHECK(cudaMemcpyAsync(
 70 |             output,
 71 |             output_buffers_[buffer_idx],
 72 |             size,
 73 |             cudaMemcpyDeviceToDevice,
 74 |             comm_stream_
 75 |         ));
 76 |         
 77 |         // Wait for processing and copy to complete
 78 |         CUDA_CHECK(cudaStreamSynchronize(compute_stream_));
 79 |         CUDA_CHECK(cudaStreamSynchronize(comm_stream_));
 80 |     }
 81 | 
 82 |     void backward(const void* grad_output, void* grad_input, size_t size) {
 83 |         CUDA_CHECK(cudaSetDevice(device_id_));
 84 |         
 85 |         // Similar to forward, but for backward pass
 86 |         int buffer_idx = next_buffer_index_++;
 87 |         if (next_buffer_index_ >= buffer_size_) {
 88 |             next_buffer_index_ = 0;
 89 |         }
 90 |         
 91 |         CUDA_CHECK(cudaMemcpyAsync(
 92 |             input_buffers_[buffer_idx],
 93 |             grad_output,
 94 |             size,
 95 |             cudaMemcpyDeviceToDevice,
 96 |             comm_stream_
 97 |         ));
 98 |         
 99 |         CUDA_CHECK(cudaStreamSynchronize(comm_stream_));
100 |         
101 |         processBackwardBuffer(buffer_idx);
102 |         
103 |         CUDA_CHECK(cudaMemcpyAsync(
104 |             grad_input,
105 |             output_buffers_[buffer_idx],
106 |             size,
107 |             cudaMemcpyDeviceToDevice,
108 |             comm_stream_
109 |         ));
110 |         
111 |         CUDA_CHECK(cudaStreamSynchronize(compute_stream_));
112 |         CUDA_CHECK(cudaStreamSynchronize(comm_stream_));
113 |     }
114 | 
115 | private:
116 |     void processBuffer(int buffer_idx) {
117 |         // Execute model layers assigned to this stage
118 |         // This would be customized based on the model architecture
119 |         for (auto& layer : layers_) {
120 |             layer->forward(
121 |                 input_buffers_[buffer_idx],
122 |                 output_buffers_[buffer_idx],
123 |                 compute_stream_
124 |             );
125 |         }
126 |     }
127 | 
128 |     void processBackwardBuffer(int buffer_idx) {
129 |         // Execute backward pass for layers in reverse order
130 |         for (auto it = layers_.rbegin(); it != layers_.rend(); ++it) {
131 |             (*it)->backward(
132 |                 input_buffers_[buffer_idx],
133 |                 output_buffers_[buffer_idx],
134 |                 compute_stream_
135 |             );
136 |         }
137 |     }
138 | 
139 |     int device_id_;
140 |     int stage_id_;
141 |     size_t buffer_size_;
142 |     int next_buffer_index_ = 0;
143 |     
144 |     cudaStream_t compute_stream_;
145 |     cudaStream_t comm_stream_;
146 |     
147 |     std::vector<void*> input_buffers_;
148 |     std::vector<void*> output_buffers_;
149 |     std::vector<std::shared_ptr<ModelLayer>> layers_;
150 | };
151 | 
152 | class PipelineExecutor {
153 | public:
154 |     PipelineExecutor(const std::vector<int>& device_ids, size_t num_micro_batches)
155 |         : num_micro_batches_(num_micro_batches) {
156 |         // Create pipeline stages
157 |         stages_.reserve(device_ids.size());
158 |         for (size_t i = 0; i < device_ids.size(); ++i) {
159 |             stages_.emplace_back(std::make_unique<PipelineStage>(
160 |                 device_ids[i],
161 |                 i,
162 |                 num_micro_batches_
163 |             ));
164 |         }
165 |         
166 |         // Start worker threads
167 |         for (size_t i = 0; i < device_ids.size(); ++i) {
168 |             workers_.emplace_back(std::thread(&PipelineExecutor::stageWorker, this, i));
169 |         }
170 |     }
171 | 
172 |     ~PipelineExecutor() {
173 |         // Signal workers to stop
174 |         {
175 |             std::lock_guard<std::mutex> lock(mutex_);
176 |             stop_ = true;
177 |         }
178 |         cv_.notify_all();
179 |         
180 |         // Wait for workers to finish
181 |         for (auto& worker : workers_) {
182 |             if (worker.joinable()) {
183 |                 worker.join();
184 |             }
185 |         }
186 |     }
187 | 
188 |     void forward(const std::vector<Tensor<float>>& input_batch,
189 |                 std::vector<Tensor<float>>& output_batch) {
190 |         // Split input into micro-batches
191 |         auto micro_batches = splitBatch(input_batch, num_micro_batches_);
192 |         
193 |         // Process micro-batches through pipeline
194 |         for (size_t i = 0; i < num_micro_batches_; ++i) {
195 |             // Queue micro-batch for processing
196 |             {
197 |                 std::lock_guard<std::mutex> lock(mutex_);
198 |                 work_queue_.push({micro_batches[i], nullptr});
199 |             }
200 |             cv_.notify_one();
201 |         }
202 |         
203 |         // Wait for all micro-batches to complete
204 |         waitForCompletion();
205 |         
206 |         // Gather results
207 |         gatherResults(output_batch);
208 |     }
209 | 
210 |     void backward(const std::vector<Tensor<float>>& grad_output_batch,
211 |                  std::vector<Tensor<float>>& grad_input_batch) {
212 |         // Similar to forward, but for backward pass
213 |         auto micro_batches = splitBatch(grad_output_batch, num_micro_batches_);
214 |         
215 |         for (size_t i = 0; i < num_micro_batches_; ++i) {
216 |             {
217 |                 std::lock_guard<std::mutex> lock(mutex_);
218 |                 work_queue_.push({micro_batches[i], nullptr});
219 |             }
220 |             cv_.notify_one();
221 |         }
222 |         
223 |         waitForCompletion();
224 |         gatherResults(grad_input_batch);
225 |     }
226 | 
227 | private:
228 |     void stageWorker(int stage_id) {
229 |         while (true) {
230 |             WorkItem work;
231 |             
232 |             // Get next work item
233 |             {
234 |                 std::unique_lock<std::mutex> lock(mutex_);
235 |                 cv_.wait(lock, [this]() {
236 |                     return stop_ || !work_queue_.empty();
237 |                 });
238 |                 
239 |                 if (stop_ && work_queue_.empty()) {
240 |                     break;
241 |                 }
242 |                 
243 |                 work = work_queue_.front();
244 |                 work_queue_.pop();
245 |             }
246 |             
247 |             // Process work item
248 |             stages_[stage_id]->forward(
249 |                 work.input.data(),
250 |                 work.output.data(),
251 |                 work.input.numel() * sizeof(float)
252 |             );
253 |             
254 |             // Signal completion
255 |             {
256 |                 std::lock_guard<std::mutex> lock(mutex_);
257 |                 completed_items_++;
258 |             }
259 |             cv_.notify_all();
260 |         }
261 |     }
262 | 
263 |     void waitForCompletion() {
264 |         std::unique_lock<std::mutex> lock(mutex_);
265 |         cv_.wait(lock, [this]() {
266 |             return completed_items_ >= num_micro_batches_;
267 |         });
268 |         completed_items_ = 0;
269 |     }
270 | 
271 |     struct WorkItem {
272 |         Tensor<float> input;
273 |         Tensor<float> output;
274 |     };
275 | 
276 |     size_t num_micro_batches_;
277 |     std::vector<std::unique_ptr<PipelineStage>> stages_;
278 |     std::vector<std::thread> workers_;
279 |     
280 |     std::queue<WorkItem> work_queue_;
281 |     size_t completed_items_ = 0;
282 |     bool stop_ = false;
283 |     
284 |     std::mutex mutex_;
285 |     std::condition_variable cv_;
286 | };
287 | 
288 | } // namespace parallel
289 | } // namespace ltm
290 | 


--------------------------------------------------------------------------------
/docs/performance/optimization.md:
--------------------------------------------------------------------------------
  1 | # Performance Optimization Guide
  2 | 
  3 | This document provides detailed information about optimizing the LTM Transformer for maximum performance. It covers memory optimization, computational efficiency, distributed training strategies, and hardware-specific tuning.
  4 | 
  5 | ## Table of Contents
  6 | 
  7 | - [Memory Optimization](#memory-optimization)
  8 | - [Computational Optimization](#computational-optimization)
  9 | - [Distributed Training](#distributed-training)
 10 | - [Hardware-Specific Tuning](#hardware-specific-tuning)
 11 | - [Benchmarks](#benchmarks)
 12 | 
 13 | ## Memory Optimization
 14 | 
 15 | ### Memory Usage Analysis
 16 | 
 17 | | Component           | Memory Usage                | Optimization Strategy |
 18 | |--------------------|----------------------------|---------------------|
 19 | | Attention          | O(batch × seq_len²)        | FlashAttention     |
 20 | | Memory Bank        | O(memory_slots × dim)      | Compression        |
 21 | | Activations        | O(batch × seq_len × dim)   | Checkpointing      |
 22 | | Model Parameters   | O(num_layers × dim²)       | Quantization       |
 23 | 
 24 | ### FlashAttention
 25 | 
 26 | FlashAttention reduces memory usage through:
 27 | 1. Block-wise computation
 28 | 2. Recomputation of attention if needed
 29 | 3. Fused softmax operations
 30 | 
 31 | ```python
 32 | # Enable FlashAttention
 33 | config = TitanConfig(
 34 |     use_flash_attention=True,
 35 |     attention_block_size=1024  # Tune based on GPU
 36 | )
 37 | ```
 38 | 
 39 | ### Gradient Checkpointing
 40 | 
 41 | Trade computation for memory by selectively recomputing activations:
 42 | 
 43 | ```python
 44 | # Enable gradient checkpointing
 45 | model.gradient_checkpointing_enable()
 46 | 
 47 | # Configure granularity
 48 | model.config.gradient_checkpointing_granularity = "block"  # or "layer"
 49 | ```
 50 | 
 51 | ### Memory Bank Optimization
 52 | 
 53 | 1. **Adaptive Compression**
 54 |    ```python
 55 |    config = TitanConfig(
 56 |        memory_compression_ratio=0.5,  # Adjust based on needs
 57 |        use_adaptive_compression=True
 58 |    )
 59 |    ```
 60 | 
 61 | 2. **Slot Management**
 62 |    ```python
 63 |    # Monitor slot utilization
 64 |    stats = model.get_memory_stats()
 65 |    if stats["utilization"] < 0.5:
 66 |        model.reduce_memory_slots()
 67 |    ```
 68 | 
 69 | ## Computational Optimization
 70 | 
 71 | ### Kernel Fusion
 72 | 
 73 | Custom CUDA kernels that fuse multiple operations:
 74 | 
 75 | ```cpp
 76 | // Fused LayerNorm + Dropout + ReLU
 77 | template <typename T>
 78 | __global__ void fused_layernorm_dropout_relu(
 79 |     T* __restrict__ output,
 80 |     const T* __restrict__ input,
 81 |     const T* __restrict__ gamma,
 82 |     const T* __restrict__ beta,
 83 |     const float dropout_prob,
 84 |     const int n
 85 | ) {
 86 |     // Implementation
 87 | }
 88 | ```
 89 | 
 90 | ### Mixed Precision Training
 91 | 
 92 | ```python
 93 | from torch.cuda.amp import autocast, GradScaler
 94 | 
 95 | # Initialize scaler
 96 | scaler = GradScaler()
 97 | 
 98 | # Training loop
 99 | with autocast():
100 |     outputs = model(input_ids)
101 |     loss = outputs.loss
102 | 
103 | scaler.scale(loss).backward()
104 | scaler.step(optimizer)
105 | scaler.update()
106 | ```
107 | 
108 | ### Quantization
109 | 
110 | ```python
111 | # INT8 Quantization
112 | config = InferenceConfig(
113 |     quantization=dict(
114 |         bits=8,
115 |         scheme="symmetric",
116 |         granularity="per-channel"
117 |     )
118 | )
119 | 
120 | # Load and quantize
121 | model = QuantizedEngine(model, config)
122 | ```
123 | 
124 | ## Distributed Training
125 | 
126 | ### Data Parallelism
127 | 
128 | ```python
129 | # Initialize process group
130 | torch.distributed.init_process_group(backend="nccl")
131 | 
132 | # Wrap model
133 | model = DistributedDataParallel(model)
134 | 
135 | # Configure training
136 | trainer = DistributedTrainer(
137 |     model=model,
138 |     args=TrainingArguments(
139 |         per_device_train_batch_size=8,
140 |         gradient_accumulation_steps=4
141 |     )
142 | )
143 | ```
144 | 
145 | ### Tensor Parallelism
146 | 
147 | Split attention heads and feed-forward layers:
148 | 
149 | ```python
150 | # Configure tensor parallelism
151 | config = TitanConfig(
152 |     tensor_parallel_size=4,
153 |     tensor_parallel_mode="1d",  # or "2d", "2.5d", "3d"
154 |     reduce_scatter_size=128
155 | )
156 | 
157 | # Initialize model
158 | model = TitanModel(config)
159 | model.parallelize()
160 | ```
161 | 
162 | ### Pipeline Parallelism
163 | 
164 | ```python
165 | # Configure pipeline
166 | config = TitanConfig(
167 |     pipeline_parallel_size=4,
168 |     num_micro_batches=32,
169 |     pipeline_chunk_size=1
170 | )
171 | 
172 | # Training arguments
173 | args = TrainingArguments(
174 |     pipeline_parallel=True,
175 |     gradient_accumulation_steps=config.num_micro_batches
176 | )
177 | ```
178 | 
179 | ## Hardware-Specific Tuning
180 | 
181 | ### NVIDIA A100
182 | 
183 | ```python
184 | # Optimal settings for A100
185 | config = TitanConfig(
186 |     attention_block_size=128,
187 |     max_sequence_length=2048,
188 |     memory_slots=512,
189 |     use_flash_attention=True,
190 |     use_tensor_cores=True
191 | )
192 | 
193 | # CUDA kernel settings
194 | THREADS_PER_BLOCK = 256
195 | BLOCKS_PER_SM = 2
196 | ```
197 | 
198 | ### NVIDIA H100
199 | 
200 | ```python
201 | # Leverage H100 features
202 | config = TitanConfig(
203 |     fp8_training=True,
204 |     use_flash_attention_2=True,
205 |     transformer_engine=True
206 | )
207 | 
208 | # Kernel optimizations
209 | THREADS_PER_BLOCK = 512
210 | BLOCKS_PER_SM = 4
211 | ```
212 | 
213 | ## Benchmarks
214 | 
215 | ### Training Performance
216 | 
217 | | GPU         | Batch Size | Seq Length | Memory (GB) | Tokens/sec |
218 | |-------------|------------|------------|-------------|------------|
219 | | A100-80GB   | 32         | 2048       | 76         | 180K       |
220 | | H100-80GB   | 32         | 2048       | 72         | 450K       |
221 | | 8x A100     | 256        | 2048       | 608        | 1.4M       |
222 | | 8x H100     | 256        | 2048       | 576        | 3.6M       |
223 | 
224 | ### Inference Performance
225 | 
226 | | Setting           | Latency (ms) | Throughput (tokens/sec) |
227 | |-------------------|-------------|------------------------|
228 | | Base              | 42.5        | 48K                   |
229 | | +FlashAttention   | 28.3        | 72K                   |
230 | | +INT8             | 18.7        | 108K                  |
231 | | +TensorParallel   | 12.4        | 162K                  |
232 | 
233 | ### Memory Bank Performance
234 | 
235 | | Context Length | Standard (GB) | With LTM (GB) | Compression Ratio |
236 | |---------------|--------------|---------------|------------------|
237 | | 2K            | 4            | 2             | 2x               |
238 | | 8K            | 64           | 4             | 16x              |
239 | | 32K           | 1024         | 8             | 128x             |
240 | 
241 | ## Performance Tips
242 | 
243 | ### Memory Management
244 | 
245 | 1. **Monitor Memory Usage**
246 |    ```python
247 |    # Print memory stats
248 |    print(torch.cuda.memory_summary())
249 |    
250 |    # Monitor peak memory
251 |    torch.cuda.reset_peak_memory_stats()
252 |    ```
253 | 
254 | 2. **Optimize Batch Size**
255 |    ```python
256 |    # Find optimal batch size
257 |    from ltm.utils import find_optimal_batch_size
258 |    
259 |    batch_size = find_optimal_batch_size(
260 |        model,
261 |        starting_batch_size=32,
262 |        gpu_target_utilization=0.85
263 |    )
264 |    ```
265 | 
266 | 3. **Memory Profiling**
267 |    ```python
268 |    # Profile memory usage
269 |    with torch.profiler.profile() as prof:
270 |        outputs = model(input_ids)
271 |    print(prof.key_averages().table())
272 |    ```
273 | 
274 | ### Training Optimization
275 | 
276 | 1. **Gradient Accumulation**
277 |    ```python
278 |    # Effective batch size = batch_size * grad_accum
279 |    args = TrainingArguments(
280 |        per_device_train_batch_size=8,
281 |        gradient_accumulation_steps=4  # Effective batch size = 32
282 |    )
283 |    ```
284 | 
285 | 2. **Learning Rate Scaling**
286 |    ```python
287 |    # Scale learning rate with batch size
288 |    base_lr = 5e-5
289 |    effective_batch_size = batch_size * grad_accum * world_size
290 |    lr = base_lr * (effective_batch_size / 256)
291 |    ```
292 | 
293 | 3. **Optimizer Settings**
294 |    ```python
295 |    # Memory-efficient optimizer
296 |    from torch.optim import AdaFactor
297 |    
298 |    optimizer = AdaFactor(
299 |        model.parameters(),
300 |        scale_parameter=True,
301 |        relative_step=True
302 |    )
303 |    ```
304 | 
305 | ### Inference Optimization
306 | 
307 | 1. **Caching Strategies**
308 |    ```python
309 |    # Enable all caching mechanisms
310 |    engine = InferenceEngine(
311 |        model,
312 |        config=InferenceConfig(
313 |            use_kv_cache=True,
314 |            use_memory_cache=True,
315 |            cache_size=1024
316 |        )
317 |    )
318 |    ```
319 | 
320 | 2. **Batch Processing**
321 |    ```python
322 |    # Process requests in optimal batches
323 |    processor = BatchProcessor(
324 |        max_batch_size=32,
325 |        timeout_ms=100,
326 |        dynamic_batching=True
327 |    )
328 |    ```
329 | 
330 | 3. **Quantization**
331 |    ```python
332 |    # Progressive quantization
333 |    engine = QuantizedEngine(
334 |        model,
335 |        config=InferenceConfig(
336 |            quantization_bits=8,
337 |            quantization_scheme="symmetric",
338 |            calibration_method="percentile"
339 |        )
340 |    )
341 |    ```
342 | 
343 | ## Monitoring and Profiling
344 | 
345 | ### CUDA Profiling
346 | 
347 | ```bash
348 | # Profile with NSight
349 | nsys profile --stats=true ./my_training_script.py
350 | 
351 | # Profile with NVTX
352 | nvprof --profile-from-start off ./my_training_script.py
353 | ```
354 | 
355 | ### Memory Monitoring
356 | 
357 | ```python
358 | # Custom memory monitor
359 | class MemoryMonitor:
360 |     @staticmethod
361 |     def log_memory():
362 |         print(f"Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
363 |         print(f"Cached: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
364 | 
365 | # Use in training
366 | monitor = MemoryMonitor()
367 | monitor.log_memory()
368 | ```
369 | 
370 | ### Performance Metrics
371 | 
372 | ```python
373 | # Track metrics
374 | class PerformanceTracker:
375 |     def __init__(self):
376 |         self.start_time = time.time()
377 |         self.tokens_processed = 0
378 |     
379 |     def update(self, num_tokens):
380 |         self.tokens_processed += num_tokens
381 |     
382 |     def get_throughput(self):
383 |         elapsed = time.time() - self.start_time
384 |         return self.tokens_processed / elapsed
385 | 
386 | # Use tracker
387 | tracker = PerformanceTracker()
388 | 


--------------------------------------------------------------------------------
/include/core/utils/tensor.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <vector>
  4 | #include <memory>
  5 | #include <cuda_runtime.h>
  6 | #include <cuda_fp16.h>
  7 | #include "core/utils/cuda_utils.cuh"
  8 | 
  9 | namespace ltm {
 10 | 
 11 | /**
 12 |  * @brief CUDA tensor class with automatic memory management
 13 |  * 
 14 |  * Provides a high-level interface for managing GPU memory and tensor operations.
 15 |  * Supports both float and half precision types.
 16 |  */
 17 | template<typename T>
 18 | class Tensor {
 19 | public:
 20 |     /**
 21 |      * @brief Create tensor with given shape
 22 |      * 
 23 |      * @param shape Vector of dimensions
 24 |      */
 25 |     explicit Tensor(const std::vector<int>& shape)
 26 |         : shape_(shape), stride_(computeStrides(shape)) {
 27 |         allocateMemory();
 28 |     }
 29 | 
 30 |     /**
 31 |      * @brief Create tensor with given shape and data
 32 |      * 
 33 |      * @param shape Vector of dimensions
 34 |      * @param data Pointer to data (will be copied to device)
 35 |      * @param own_memory Whether tensor should own the memory
 36 |      */
 37 |     Tensor(
 38 |         const std::vector<int>& shape,
 39 |         const T* data,
 40 |         bool own_memory = true
 41 |     ) : shape_(shape),
 42 |         stride_(computeStrides(shape)),
 43 |         own_memory_(own_memory) {
 44 |         if (own_memory_) {
 45 |             allocateMemory();
 46 |             copyFromHost(data);
 47 |         } else {
 48 |             data_ = const_cast<T*>(data);
 49 |         }
 50 |     }
 51 | 
 52 |     /**
 53 |      * @brief Move constructor
 54 |      */
 55 |     Tensor(Tensor&& other) noexcept
 56 |         : data_(other.data_),
 57 |           shape_(std::move(other.shape_)),
 58 |           stride_(std::move(other.stride_)),
 59 |           own_memory_(other.own_memory_) {
 60 |         other.data_ = nullptr;
 61 |         other.own_memory_ = false;
 62 |     }
 63 | 
 64 |     /**
 65 |      * @brief Move assignment
 66 |      */
 67 |     Tensor& operator=(Tensor&& other) noexcept {
 68 |         if (this != &other) {
 69 |             freeMemory();
 70 |             data_ = other.data_;
 71 |             shape_ = std::move(other.shape_);
 72 |             stride_ = std::move(other.stride_);
 73 |             own_memory_ = other.own_memory_;
 74 |             other.data_ = nullptr;
 75 |             other.own_memory_ = false;
 76 |         }
 77 |         return *this;
 78 |     }
 79 | 
 80 |     /**
 81 |      * @brief Destructor
 82 |      */
 83 |     ~Tensor() {
 84 |         freeMemory();
 85 |     }
 86 | 
 87 |     // Disable copy operations
 88 |     Tensor(const Tensor&) = delete;
 89 |     Tensor& operator=(const Tensor&) = delete;
 90 | 
 91 |     /**
 92 |      * @brief Get raw pointer to device memory
 93 |      */
 94 |     T* data() const { return data_; }
 95 | 
 96 |     /**
 97 |      * @brief Get tensor shape
 98 |      */
 99 |     const std::vector<int>& shape() const { return shape_; }
100 | 
101 |     /**
102 |      * @brief Get tensor strides
103 |      */
104 |     const std::vector<int>& stride() const { return stride_; }
105 | 
106 |     /**
107 |      * @brief Get total number of elements
108 |      */
109 |     size_t numel() const {
110 |         size_t n = 1;
111 |         for (int dim : shape_) {
112 |             n *= dim;
113 |         }
114 |         return n;
115 |     }
116 | 
117 |     /**
118 |      * @brief Get size of dimension
119 |      */
120 |     int size(int dim) const {
121 |         return shape_[dim];
122 |     }
123 | 
124 |     /**
125 |      * @brief Copy data from host to device
126 |      */
127 |     void copyFromHost(const T* host_data) {
128 |         CUDA_CHECK(cudaMemcpy(
129 |             data_,
130 |             host_data,
131 |             numel() * sizeof(T),
132 |             cudaMemcpyHostToDevice
133 |         ));
134 |     }
135 | 
136 |     /**
137 |      * @brief Copy data from device to host
138 |      */
139 |     void copyToHost(T* host_data) const {
140 |         CUDA_CHECK(cudaMemcpy(
141 |             host_data,
142 |             data_,
143 |             numel() * sizeof(T),
144 |             cudaMemcpyDeviceToHost
145 |         ));
146 |     }
147 | 
148 |     /**
149 |      * @brief Copy data from another tensor
150 |      */
151 |     void copyFrom(const Tensor<T>& other) {
152 |         if (numel() != other.numel()) {
153 |             throw std::runtime_error("Tensor sizes don't match for copy");
154 |         }
155 |         CUDA_CHECK(cudaMemcpy(
156 |             data_,
157 |             other.data_,
158 |             numel() * sizeof(T),
159 |             cudaMemcpyDeviceToDevice
160 |         ));
161 |     }
162 | 
163 |     /**
164 |      * @brief Fill tensor with value
165 |      */
166 |     void fill(T value) {
167 |         CUDA_CHECK(cudaMemset(
168 |             data_,
169 |             value,
170 |             numel() * sizeof(T)
171 |         ));
172 |     }
173 | 
174 |     /**
175 |      * @brief Reshape tensor to new dimensions
176 |      * 
177 |      * @param new_shape New shape
178 |      * @return Tensor& Reference to this tensor
179 |      */
180 |     Tensor& reshape(const std::vector<int>& new_shape) {
181 |         size_t new_size = 1;
182 |         for (int dim : new_shape) {
183 |             new_size *= dim;
184 |         }
185 |         if (new_size != numel()) {
186 |             throw std::runtime_error("Invalid reshape dimensions");
187 |         }
188 |         shape_ = new_shape;
189 |         stride_ = computeStrides(new_shape);
190 |         return *this;
191 |     }
192 | 
193 |     /**
194 |      * @brief Get view of tensor with different shape
195 |      * 
196 |      * @param new_shape New shape
197 |      * @return Tensor New tensor sharing the same memory
198 |      */
199 |     Tensor view(const std::vector<int>& new_shape) const {
200 |         size_t new_size = 1;
201 |         for (int dim : new_shape) {
202 |             new_size *= dim;
203 |         }
204 |         if (new_size != numel()) {
205 |             throw std::runtime_error("Invalid view dimensions");
206 |         }
207 |         return Tensor(new_shape, data_, false);
208 |     }
209 | 
210 |     /**
211 |      * @brief Transpose tensor
212 |      * 
213 |      * @return Tensor Transposed tensor
214 |      */
215 |     Tensor transpose() const {
216 |         if (shape_.size() != 2) {
217 |             throw std::runtime_error("Transpose only supported for 2D tensors");
218 |         }
219 |         std::vector<int> transposed_shape = {shape_[1], shape_[0]};
220 |         std::vector<int> transposed_stride = {stride_[1], stride_[0]};
221 |         
222 |         Tensor result(transposed_shape);
223 |         
224 |         // Launch efficient transpose kernel
225 |         const int TILE_DIM = 32;
226 |         const int BLOCK_ROWS = 8;
227 |         dim3 grid((shape_[1] + TILE_DIM - 1) / TILE_DIM,
228 |                   (shape_[0] + TILE_DIM - 1) / TILE_DIM);
229 |         dim3 block(TILE_DIM, BLOCK_ROWS);
230 |         
231 |         // Shared memory size with padding to avoid bank conflicts
232 |         const size_t shared_mem_size = (TILE_DIM * (TILE_DIM + 1)) * sizeof(T);
233 |         
234 |         transposeKernel<T><<<grid, block, shared_mem_size>>>(
235 |             data_,
236 |             result.data(),
237 |             shape_[0], // rows
238 |             shape_[1]  // cols
239 |         );
240 |         CUDA_CHECK(cudaGetLastError());
241 |         
242 |         return result;
243 |     }
244 | 
245 | private:
246 |     // Efficient matrix transpose kernel using shared memory tiling
247 |     template<typename U>
248 |     __global__ static void transposeKernel(
249 |         const U* __restrict__ input,
250 |         U* __restrict__ output,
251 |         const int rows,
252 |         const int cols
253 |     ) {
254 |         __shared__ U tile[32][33]; // +1 padding to avoid bank conflicts
255 |         
256 |         const int x = blockIdx.x * 32 + threadIdx.x;
257 |         const int y = blockIdx.y * 32 + threadIdx.y;
258 |         
259 |         // Load tile into shared memory with coalesced reads
260 |         for (int j = 0; j < 32; j += 8) {
261 |             if (y + j < rows && x < cols) {
262 |                 tile[threadIdx.y + j][threadIdx.x] = input[(y + j) * cols + x];
263 |             }
264 |         }
265 |         __syncthreads();
266 |         
267 |         // Write transposed tile with coalesced writes
268 |         const int out_x = blockIdx.y * 32 + threadIdx.x;
269 |         const int out_y = blockIdx.x * 32 + threadIdx.y;
270 |         
271 |         for (int j = 0; j < 32; j += 8) {
272 |             if (out_y + j < cols && out_x < rows) {
273 |                 output[(out_y + j) * rows + out_x] = tile[threadIdx.x][threadIdx.y + j];
274 |             }
275 |         }
276 |     }
277 | 
278 |     T* data_ = nullptr;
279 |     std::vector<int> shape_;
280 |     std::vector<int> stride_;
281 |     bool own_memory_ = true;
282 | 
283 |     /**
284 |      * @brief Compute strides for given shape
285 |      */
286 |     static std::vector<int> computeStrides(const std::vector<int>& shape) {
287 |         std::vector<int> stride(shape.size());
288 |         int curr_stride = 1;
289 |         for (int i = shape.size() - 1; i >= 0; --i) {
290 |             stride[i] = curr_stride;
291 |             curr_stride *= shape[i];
292 |         }
293 |         return stride;
294 |     }
295 | 
296 |     /**
297 |      * @brief Allocate device memory
298 |      */
299 |     void allocateMemory() {
300 |         if (numel() > 0) {
301 |             CUDA_CHECK(cudaMalloc(&data_, numel() * sizeof(T)));
302 |         }
303 |     }
304 | 
305 |     /**
306 |      * @brief Free device memory
307 |      */
308 |     void freeMemory() {
309 |         if (own_memory_ && data_) {
310 |             CUDA_CHECK(cudaFree(data_));
311 |             data_ = nullptr;
312 |         }
313 |     }
314 | };
315 | 
316 | // Common tensor types
317 | using FloatTensor = Tensor<float>;
318 | using HalfTensor = Tensor<half>;
319 | 
320 | /**
321 |  * @brief Create tensor from host data
322 |  * 
323 |  * @tparam T Data type
324 |  * @param shape Tensor shape
325 |  * @param data Host data pointer
326 |  * @return Tensor<T> New tensor with copied data
327 |  */
328 | template<typename T>
329 | Tensor<T> tensorFromHost(
330 |     const std::vector<int>& shape,
331 |     const T* data
332 | ) {
333 |     Tensor<T> tensor(shape);
334 |     tensor.copyFromHost(data);
335 |     return tensor;
336 | }
337 | 
338 | /**
339 |  * @brief Create tensor filled with zeros
340 |  * 
341 |  * @tparam T Data type
342 |  * @param shape Tensor shape
343 |  * @return Tensor<T> Zero-initialized tensor
344 |  */
345 | template<typename T>
346 | Tensor<T> zeros(const std::vector<int>& shape) {
347 |     Tensor<T> tensor(shape);
348 |     tensor.fill(0);
349 |     return tensor;
350 | }
351 | 
352 | /**
353 |  * @brief Create tensor filled with ones
354 |  * 
355 |  * @tparam T Data type
356 |  * @param shape Tensor shape
357 |  * @return Tensor<T> One-initialized tensor
358 |  */
359 | template<typename T>
360 | Tensor<T> ones(const std::vector<int>& shape) {
361 |     Tensor<T> tensor(shape);
362 |     tensor.fill(1);
363 |     return tensor;
364 | }
365 | 
366 | } // namespace ltm
367 | 


--------------------------------------------------------------------------------
/docs/usage/guide.md:
--------------------------------------------------------------------------------
  1 | # LTM Transformer Usage Guide
  2 | 
  3 | This guide provides practical examples and instructions for using the LTM Transformer library. It covers common tasks like training models, running inference, and optimizing performance.
  4 | 
  5 | ## Table of Contents
  6 | 
  7 | - [Installation](#installation)
  8 | - [Basic Usage](#basic-usage)
  9 | - [Training](#training)
 10 | - [Inference](#inference)
 11 | - [Distributed Training](#distributed-training)
 12 | - [Performance Optimization](#performance-optimization)
 13 | - [Advanced Features](#advanced-features)
 14 | 
 15 | ## Installation
 16 | 
 17 | ### From PyPI
 18 | 
 19 | ```bash
 20 | pip install ltm-transformer
 21 | ```
 22 | 
 23 | ### From Source
 24 | 
 25 | ```bash
 26 | git clone https://github.com/singularityresearch/ltm-transformer.git
 27 | cd ltm-transformer
 28 | pip install -e .
 29 | ```
 30 | 
 31 | ## Basic Usage
 32 | 
 33 | ### Creating a Model
 34 | 
 35 | ```python
 36 | from ltm import TitanModel, TitanConfig
 37 | 
 38 | # Initialize configuration
 39 | config = TitanConfig(
 40 |     hidden_size=768,
 41 |     num_attention_heads=12,
 42 |     num_hidden_layers=12,
 43 |     memory_slots=512,
 44 |     memory_dim=64,
 45 |     use_flash_attention=True
 46 | )
 47 | 
 48 | # Create model
 49 | model = TitanModel(config)
 50 | ```
 51 | 
 52 | ### Simple Forward Pass
 53 | 
 54 | ```python
 55 | import torch
 56 | 
 57 | # Prepare input
 58 | input_ids = torch.randint(0, vocab_size, (batch_size, seq_length))
 59 | 
 60 | # Forward pass
 61 | outputs = model(
 62 |     input_ids=input_ids,
 63 |     attention_mask=None,  # Optional attention mask
 64 |     use_cache=True       # Enable KV caching
 65 | )
 66 | 
 67 | # Access outputs
 68 | hidden_states = outputs.hidden_states
 69 | memory_states = outputs.memory_states
 70 | ```
 71 | 
 72 | ## Training
 73 | 
 74 | ### Basic Training Loop
 75 | 
 76 | ```python
 77 | from ltm import Trainer, TrainingArguments
 78 | from datasets import load_dataset
 79 | 
 80 | # Load dataset
 81 | dataset = load_dataset("wikitext", "wikitext-103-v1")
 82 | 
 83 | # Configure training
 84 | training_args = TrainingArguments(
 85 |     output_dir="./outputs",
 86 |     learning_rate=5e-5,
 87 |     per_device_train_batch_size=8,
 88 |     gradient_accumulation_steps=4,
 89 |     max_steps=100000,
 90 |     warmup_steps=10000,
 91 |     logging_steps=100,
 92 |     save_steps=1000,
 93 |     fp16=True,
 94 |     gradient_checkpointing=True
 95 | )
 96 | 
 97 | # Initialize trainer
 98 | trainer = Trainer(
 99 |     model=model,
100 |     args=training_args,
101 |     train_dataset=dataset["train"],
102 |     eval_dataset=dataset["validation"]
103 | )
104 | 
105 | # Start training
106 | trainer.train()
107 | ```
108 | 
109 | ### Custom Training Loop
110 | 
111 | ```python
112 | import torch.optim as optim
113 | 
114 | # Initialize optimizer
115 | optimizer = optim.AdamW(model.parameters(), lr=5e-5)
116 | 
117 | # Training loop
118 | model.train()
119 | for epoch in range(num_epochs):
120 |     for batch in dataloader:
121 |         # Forward pass
122 |         outputs = model(
123 |             input_ids=batch["input_ids"],
124 |             attention_mask=batch["attention_mask"],
125 |             labels=batch["labels"]
126 |         )
127 |         
128 |         loss = outputs.loss
129 |         
130 |         # Backward pass
131 |         loss.backward()
132 |         optimizer.step()
133 |         optimizer.zero_grad()
134 |         
135 |         # Update memory bank
136 |         model.update_memory_bank(outputs.hidden_states)
137 | ```
138 | 
139 | ## Inference
140 | 
141 | ### Text Generation
142 | 
143 | ```python
144 | from ltm import InferenceEngine, InferenceConfig
145 | from transformers import AutoTokenizer
146 | 
147 | # Initialize tokenizer
148 | tokenizer = AutoTokenizer.from_pretrained("gpt2")
149 | 
150 | # Configure inference
151 | config = InferenceConfig(
152 |     max_sequence_length=2048,
153 |     use_flash_attention=True,
154 |     use_memory_cache=True,
155 |     batch_size=1
156 | )
157 | 
158 | # Create inference engine
159 | engine = InferenceEngine(model, tokenizer, config)
160 | 
161 | # Generate text
162 | output = engine.generate(
163 |     input_ids=tokenizer.encode("Once upon a time"),
164 |     max_new_tokens=100,
165 |     temperature=0.7,
166 |     top_p=0.9
167 | )
168 | 
169 | # Decode output
170 | text = tokenizer.decode(output[0])
171 | ```
172 | 
173 | ### Streaming Generation
174 | 
175 | ```python
176 | # Stream tokens one by one
177 | for token in engine.stream_generate(
178 |     input_ids=tokenizer.encode("The future of AI"),
179 |     max_new_tokens=100
180 | ):
181 |     print(tokenizer.decode([token]), end="", flush=True)
182 | ```
183 | 
184 | ### Batch Processing
185 | 
186 | ```python
187 | # Process multiple inputs in parallel
188 | inputs = [
189 |     "First prompt",
190 |     "Second prompt",
191 |     "Third prompt"
192 | ]
193 | 
194 | # Tokenize inputs
195 | input_ids = [tokenizer.encode(text) for text in inputs]
196 | 
197 | # Add to batch processor
198 | for ids in input_ids:
199 |     engine.batch_processor.add_request(ids)
200 | 
201 | # Process batch
202 | results = list(engine.batch_processor.process_batch())
203 | ```
204 | 
205 | ## Distributed Training
206 | 
207 | ### Multi-GPU Training
208 | 
209 | ```python
210 | from ltm import DistributedTrainer
211 | 
212 | # Initialize distributed training
213 | training_args = TrainingArguments(
214 |     output_dir="./outputs",
215 |     per_device_train_batch_size=8,
216 |     gradient_accumulation_steps=4,
217 |     fp16=True,
218 |     gradient_checkpointing=True,
219 |     
220 |     # Distributed settings
221 |     local_rank=-1,  # Set by torch.distributed.launch
222 |     tensor_parallel_size=4,
223 |     pipeline_parallel_size=2
224 | )
225 | 
226 | trainer = DistributedTrainer(
227 |     model=model,
228 |     args=training_args,
229 |     train_dataset=dataset
230 | )
231 | 
232 | # Launch training
233 | trainer.train()
234 | ```
235 | 
236 | ### Multi-Node Training
237 | 
238 | ```bash
239 | # On node 1 (master)
240 | python -m torch.distributed.launch \
241 |     --nproc_per_node=8 \
242 |     --nnodes=2 \
243 |     --node_rank=0 \
244 |     --master_addr="192.168.1.1" \
245 |     --master_port=29500 \
246 |     train.py
247 | 
248 | # On node 2
249 | python -m torch.distributed.launch \
250 |     --nproc_per_node=8 \
251 |     --nnodes=2 \
252 |     --node_rank=1 \
253 |     --master_addr="192.168.1.1" \
254 |     --master_port=29500 \
255 |     train.py
256 | ```
257 | 
258 | ## Performance Optimization
259 | 
260 | ### Memory Optimization
261 | 
262 | ```python
263 | # Enable gradient checkpointing
264 | model.gradient_checkpointing_enable()
265 | 
266 | # Use mixed precision training
267 | from torch.cuda.amp import autocast
268 | 
269 | with autocast():
270 |     outputs = model(input_ids)
271 | ```
272 | 
273 | ### Quantization
274 | 
275 | ```python
276 | from ltm import QuantizedEngine
277 | 
278 | # Quantize model to INT8
279 | engine = QuantizedEngine(
280 |     model,
281 |     config=InferenceConfig(
282 |         use_quantization=True,
283 |         quantization_bits=8
284 |     )
285 | )
286 | 
287 | # Run inference with quantized model
288 | outputs = engine.generate(input_ids)
289 | ```
290 | 
291 | ### Caching
292 | 
293 | ```python
294 | # Enable KV caching
295 | engine = InferenceEngine(
296 |     model,
297 |     config=InferenceConfig(
298 |         use_cache=True,
299 |         use_kv_cache=True,
300 |         use_memory_cache=True
301 |     )
302 | )
303 | 
304 | # First forward pass
305 | outputs = engine.generate(
306 |     input_ids,
307 |     use_cache=True
308 | )
309 | 
310 | # Subsequent passes will reuse cached states
311 | next_outputs = engine.generate(
312 |     new_input_ids,
313 |     past_key_values=outputs.past_key_values
314 | )
315 | ```
316 | 
317 | ## Advanced Features
318 | 
319 | ### Custom Memory Bank
320 | 
321 | ```python
322 | from ltm import MemoryBank, CompressionGate
323 | 
324 | # Configure custom memory bank
325 | memory_bank = MemoryBank(
326 |     num_slots=1024,
327 |     slot_dim=128,
328 |     update_rate=0.9
329 | )
330 | 
331 | # Configure compression gate
332 | compression_gate = CompressionGate(
333 |     input_dim=768,
334 |     compressed_dim=128,
335 |     num_heads=4
336 | )
337 | 
338 | # Use custom components
339 | model = TitanModel(
340 |     config=TitanConfig(
341 |         memory_bank=memory_bank,
342 |         compression_gate=compression_gate
343 |     )
344 | )
345 | ```
346 | 
347 | ### Memory Visualization
348 | 
349 | ```python
350 | from ltm.utils import visualize_memory
351 | 
352 | # Get memory attention patterns
353 | attention_patterns = model.get_memory_attention_patterns()
354 | 
355 | # Visualize memory usage
356 | visualize_memory(
357 |     attention_patterns,
358 |     save_path="memory_visualization.png"
359 | )
360 | ```
361 | 
362 | ### Export Model
363 | 
364 | ```python
365 | # Save model and configuration
366 | model.save_pretrained("./my_model")
367 | 
368 | # Save tokenizer
369 | tokenizer.save_pretrained("./my_model")
370 | 
371 | # Load model
372 | from ltm import TitanModel
373 | model = TitanModel.from_pretrained("./my_model")
374 | ```
375 | 
376 | ## Best Practices
377 | 
378 | 1. **Memory Management**
379 |    - Use gradient checkpointing for large models
380 |    - Enable mixed precision training
381 |    - Monitor memory usage with `nvidia-smi`
382 | 
383 | 2. **Performance**
384 |    - Use FlashAttention when possible
385 |    - Enable memory caching for inference
386 |    - Batch inputs when processing multiple sequences
387 | 
388 | 3. **Training**
389 |    - Start with small models for testing
390 |    - Use learning rate warmup
391 |    - Monitor memory bank usage
392 |    - Save checkpoints regularly
393 | 
394 | 4. **Inference**
395 |    - Use quantization for deployment
396 |    - Enable KV caching for autoregressive generation
397 |    - Batch requests when possible
398 | 
399 | 5. **Distributed Training**
400 |    - Use tensor parallelism for large models
401 |    - Enable pipeline parallelism when appropriate
402 |    - Monitor GPU utilization across nodes
403 | 
404 | ## Troubleshooting
405 | 
406 | ### Common Issues
407 | 
408 | 1. **Out of Memory**
409 |    ```python
410 |    # Solution: Enable gradient checkpointing
411 |    model.gradient_checkpointing_enable()
412 |    
413 |    # Or reduce batch size
414 |    training_args.per_device_train_batch_size //= 2
415 |    ```
416 | 
417 | 2. **Slow Training**
418 |    ```python
419 |    # Solution: Enable Flash Attention
420 |    config.use_flash_attention = True
421 |    
422 |    # Use larger batch size with gradient accumulation
423 |    training_args.gradient_accumulation_steps *= 2
424 |    ```
425 | 
426 | 3. **Memory Bank Issues**
427 |    ```python
428 |    # Solution: Monitor memory usage
429 |    memory_stats = model.get_memory_stats()
430 |    print(f"Memory utilization: {memory_stats['utilization']}")
431 |    
432 |    # Reset memory if needed
433 |    model.reset_memory_bank()
434 |    ```
435 | 
436 | ### Getting Help
437 | 
438 | - Check the [GitHub issues](https://github.com/singularityresearch/ltm-transformer/issues)
439 | - Join our [Discord community](https://discord.gg/ltm-transformer)
440 | - Contact maintainers at support@singularityresearch.org
441 | 


--------------------------------------------------------------------------------
/src/core/parallel/tensor_parallel.cpp:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
  2 | #include <nccl.h>
  3 | #include <vector>
  4 | #include <memory>
  5 | #include "core/parallel/tensor_parallel.hpp"
  6 | #include "core/utils/cuda_utils.cuh"
  7 | 
  8 | namespace ltm {
  9 | namespace parallel {
 10 | 
 11 | class TensorParallelContext {
 12 | public:
 13 |     TensorParallelContext(
 14 |         int world_size,
 15 |         int rank,
 16 |         ncclComm_t nccl_comm,
 17 |         int device_id
 18 |     ) : world_size_(world_size),
 19 |         rank_(rank),
 20 |         nccl_comm_(nccl_comm) {
 21 |         CUDA_CHECK(cudaSetDevice(device_id));
 22 |         CUDA_CHECK(cudaStreamCreate(&compute_stream_));
 23 |         CUDA_CHECK(cudaStreamCreate(&comm_stream_));
 24 |         
 25 |         // Create events for stream synchronization
 26 |         CUDA_CHECK(cudaEventCreate(&compute_event_));
 27 |         CUDA_CHECK(cudaEventCreate(&comm_event_));
 28 |     }
 29 | 
 30 |     ~TensorParallelContext() {
 31 |         CUDA_CHECK(cudaEventDestroy(compute_event_));
 32 |         CUDA_CHECK(cudaEventDestroy(comm_event_));
 33 |         CUDA_CHECK(cudaStreamDestroy(compute_stream_));
 34 |         CUDA_CHECK(cudaStreamDestroy(comm_stream_));
 35 |     }
 36 | 
 37 |     // Split tensor along specified dimension
 38 |     template<typename T>
 39 |     void splitTensor(
 40 |         const Tensor<T>& input,
 41 |         Tensor<T>& local_output,
 42 |         int split_dim
 43 |     ) {
 44 |         const auto& shape = input.shape();
 45 |         std::vector<int> local_shape = shape;
 46 |         local_shape[split_dim] /= world_size_;
 47 |         
 48 |         // Calculate offset for this rank
 49 |         size_t offset = rank_ * input.numel() / world_size_;
 50 |         
 51 |         // Copy local portion
 52 |         CUDA_CHECK(cudaMemcpyAsync(
 53 |             local_output.data(),
 54 |             static_cast<const T*>(input.data()) + offset,
 55 |             local_output.numel() * sizeof(T),
 56 |             cudaMemcpyDeviceToDevice,
 57 |             compute_stream_
 58 |         ));
 59 |     }
 60 | 
 61 |     // Gather split tensor
 62 |     template<typename T>
 63 |     void gatherTensor(
 64 |         const Tensor<T>& local_input,
 65 |         Tensor<T>& output,
 66 |         int split_dim
 67 |     ) {
 68 |         ncclAllGather(
 69 |             local_input.data(),
 70 |             output.data(),
 71 |             local_input.numel(),
 72 |             ncclDataType<T>(),
 73 |             nccl_comm_,
 74 |             comm_stream_
 75 |         );
 76 |         
 77 |         // Wait for gather to complete
 78 |         CUDA_CHECK(cudaStreamSynchronize(comm_stream_));
 79 |     }
 80 | 
 81 |     // All-reduce across devices
 82 |     template<typename T>
 83 |     void allReduce(void* data, size_t count) {
 84 |         ncclAllReduce(
 85 |             data,
 86 |             data,
 87 |             count,
 88 |             ncclDataType<T>(),
 89 |             ncclSum,
 90 |             nccl_comm_,
 91 |             comm_stream_
 92 |         );
 93 |         
 94 |         // Wait for reduction to complete
 95 |         CUDA_CHECK(cudaStreamSynchronize(comm_stream_));
 96 |     }
 97 | 
 98 |     cudaStream_t computeStream() const { return compute_stream_; }
 99 |     cudaStream_t commStream() const { return comm_stream_; }
100 |     int worldSize() const { return world_size_; }
101 |     int rank() const { return rank_; }
102 | 
103 | private:
104 |     int world_size_;
105 |     int rank_;
106 |     ncclComm_t nccl_comm_;
107 |     
108 |     cudaStream_t compute_stream_;
109 |     cudaStream_t comm_stream_;
110 |     cudaEvent_t compute_event_;
111 |     cudaEvent_t comm_event_;
112 |     
113 |     // Helper to get NCCL data type
114 |     template<typename T>
115 |     static ncclDataType_t ncclDataType();
116 | };
117 | 
118 | // Template specializations for NCCL data types
119 | template<> ncclDataType_t TensorParallelContext::ncclDataType<float>() { return ncclFloat32; }
120 | template<> ncclDataType_t TensorParallelContext::ncclDataType<half>() { return ncclFloat16; }
121 | 
122 | class TensorParallelLinear {
123 | public:
124 |     TensorParallelLinear(
125 |         TensorParallelContext& ctx,
126 |         int input_dim,
127 |         int output_dim
128 |     ) : ctx_(ctx) {
129 |         // Split weight matrix across devices
130 |         const int local_output_dim = output_dim / ctx.worldSize();
131 |         weight_ = Tensor<float>({local_output_dim, input_dim});
132 |         bias_ = Tensor<float>({local_output_dim});
133 |         
134 |         // Initialize parameters
135 |         initializeParameters();
136 |     }
137 | 
138 |     void forward(
139 |         const Tensor<float>& input,
140 |         Tensor<float>& output
141 |     ) {
142 |         // Local matrix multiplication
143 |         matmul(input, weight_, local_output_, ctx_.computeStream());
144 |         
145 |         // Add bias
146 |         addBias(local_output_, bias_, ctx_.computeStream());
147 |         
148 |         // Gather results from all devices
149 |         ctx_.gatherTensor(local_output_, output, 1);
150 |     }
151 | 
152 |     void backward(
153 |         const Tensor<float>& grad_output,
154 |         const Tensor<float>& input,
155 |         Tensor<float>& grad_input,
156 |         bool compute_grad_input = true
157 |     ) {
158 |         // Split gradient
159 |         Tensor<float> local_grad_output;
160 |         ctx_.splitTensor(grad_output, local_grad_output, 1);
161 |         
162 |         // Compute weight gradients
163 |         matmul(
164 |             local_grad_output.transpose(),
165 |             input,
166 |             weight_grad_,
167 |             ctx_.computeStream()
168 |         );
169 |         
170 |         // Compute bias gradients
171 |         reduceSumAlongDim(local_grad_output, bias_grad_, 0, ctx_.computeStream());
172 |         
173 |         if (compute_grad_input) {
174 |             // Compute input gradients
175 |             matmul(
176 |                 local_grad_output,
177 |                 weight_.transpose(),
178 |                 local_grad_input_,
179 |                 ctx_.computeStream()
180 |             );
181 |             
182 |             // All-reduce input gradients
183 |             ctx_.allReduce<float>(
184 |                 local_grad_input_.data(),
185 |                 local_grad_input_.numel()
186 |             );
187 |             
188 |             grad_input = local_grad_input_;
189 |         }
190 |     }
191 | 
192 |     void updateParameters(float learning_rate) {
193 |         // Update weights
194 |         axpy(
195 |             -learning_rate,
196 |             weight_grad_,
197 |             weight_,
198 |             ctx_.computeStream()
199 |         );
200 |         
201 |         // Update bias
202 |         axpy(
203 |             -learning_rate,
204 |             bias_grad_,
205 |             bias_,
206 |             ctx_.computeStream()
207 |         );
208 |     }
209 | 
210 | private:
211 |     void initializeParameters() {
212 |         // Initialize weights using Kaiming initialization
213 |         float std = sqrt(2.0f / weight_.shape()[1]);
214 |         initializeNormal(weight_, 0.0f, std, ctx_.computeStream());
215 |         
216 |         // Initialize bias to zero
217 |         initializeZero(bias_, ctx_.computeStream());
218 |     }
219 | 
220 |     TensorParallelContext& ctx_;
221 |     Tensor<float> weight_;
222 |     Tensor<float> bias_;
223 |     Tensor<float> weight_grad_;
224 |     Tensor<float> bias_grad_;
225 |     Tensor<float> local_output_;
226 |     Tensor<float> local_grad_input_;
227 | };
228 | 
229 | class TensorParallelAttention {
230 | public:
231 |     TensorParallelAttention(
232 |         TensorParallelContext& ctx,
233 |         int hidden_dim,
234 |         int num_heads
235 |     ) : ctx_(ctx),
236 |         hidden_dim_(hidden_dim),
237 |         num_heads_(num_heads) {
238 |         // Split attention heads across devices
239 |         const int local_num_heads = num_heads / ctx.worldSize();
240 |         const int head_dim = hidden_dim / num_heads;
241 |         
242 |         // Initialize projection matrices
243 |         query_proj_ = std::make_unique<TensorParallelLinear>(
244 |             ctx, hidden_dim, local_num_heads * head_dim);
245 |         key_proj_ = std::make_unique<TensorParallelLinear>(
246 |             ctx, hidden_dim, local_num_heads * head_dim);
247 |         value_proj_ = std::make_unique<TensorParallelLinear>(
248 |             ctx, hidden_dim, local_num_heads * head_dim);
249 |         output_proj_ = std::make_unique<TensorParallelLinear>(
250 |             ctx, local_num_heads * head_dim, hidden_dim);
251 |     }
252 | 
253 |     void forward(
254 |         const Tensor<float>& input,
255 |         Tensor<float>& output,
256 |         const Tensor<float>* attention_mask = nullptr
257 |     ) {
258 |         // Project queries, keys, and values
259 |         query_proj_->forward(input, query_);
260 |         key_proj_->forward(input, key_);
261 |         value_proj_->forward(input, value_);
262 |         
263 |         // Compute attention scores
264 |         computeAttentionScores(
265 |             query_,
266 |             key_,
267 |             attention_scores_,
268 |             attention_mask,
269 |             ctx_.computeStream()
270 |         );
271 |         
272 |         // Apply attention to values
273 |         applyAttention(
274 |             attention_scores_,
275 |             value_,
276 |             attention_output_,
277 |             ctx_.computeStream()
278 |         );
279 |         
280 |         // Project output
281 |         output_proj_->forward(attention_output_, output);
282 |     }
283 | 
284 |     void backward(
285 |         const Tensor<float>& grad_output,
286 |         const Tensor<float>& input,
287 |         Tensor<float>& grad_input
288 |     ) {
289 |         // Backward through output projection
290 |         output_proj_->backward(grad_output, attention_output_, grad_attention_output_);
291 |         
292 |         // Backward through attention
293 |         backwardAttention(
294 |             grad_attention_output_,
295 |             attention_scores_,
296 |             value_,
297 |             grad_attention_scores_,
298 |             grad_value_,
299 |             ctx_.computeStream()
300 |         );
301 |         
302 |         // Backward through projections
303 |         value_proj_->backward(grad_value_, input, grad_input, false);
304 |         key_proj_->backward(grad_attention_scores_, input, grad_input, false);
305 |         query_proj_->backward(grad_attention_scores_, input, grad_input, true);
306 |     }
307 | 
308 | private:
309 |     TensorParallelContext& ctx_;
310 |     int hidden_dim_;
311 |     int num_heads_;
312 |     
313 |     std::unique_ptr<TensorParallelLinear> query_proj_;
314 |     std::unique_ptr<TensorParallelLinear> key_proj_;
315 |     std::unique_ptr<TensorParallelLinear> value_proj_;
316 |     std::unique_ptr<TensorParallelLinear> output_proj_;
317 |     
318 |     Tensor<float> query_;
319 |     Tensor<float> key_;
320 |     Tensor<float> value_;
321 |     Tensor<float> attention_scores_;
322 |     Tensor<float> attention_output_;
323 |     Tensor<float> grad_attention_output_;
324 |     Tensor<float> grad_attention_scores_;
325 |     Tensor<float> grad_value_;
326 | };
327 | 
328 | } // namespace parallel
329 | } // namespace ltm
330 | 


--------------------------------------------------------------------------------
/src/core/quantization/quantizer.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
  2 | #include <cuda_fp16.h>
  3 | #include <vector>
  4 | #include "core/quantization/quantizer.cuh"
  5 | #include "core/utils/cuda_utils.cuh"
  6 | 
  7 | namespace ltm {
  8 | namespace quantization {
  9 | 
 10 | // CUDA kernels for quantization
 11 | namespace {
 12 | 
 13 | // Kernel for computing min/max values
 14 | template<typename T>
 15 | __global__ void computeMinMaxKernel(
 16 |     const T* __restrict__ input,
 17 |     float* __restrict__ min_val,
 18 |     float* __restrict__ max_val,
 19 |     int size
 20 | ) {
 21 |     extern __shared__ float shared_mem[];
 22 |     float* shared_min = shared_mem;
 23 |     float* shared_max = &shared_mem[blockDim.x];
 24 |     
 25 |     const int tid = threadIdx.x;
 26 |     const int gid = blockIdx.x * blockDim.x + threadIdx.x;
 27 |     
 28 |     // Initialize local min/max
 29 |     float thread_min = FLT_MAX;
 30 |     float thread_max = -FLT_MAX;
 31 |     
 32 |     // Process multiple elements per thread
 33 |     for (int i = gid; i < size; i += gridDim.x * blockDim.x) {
 34 |         float val = static_cast<float>(input[i]);
 35 |         thread_min = min(thread_min, val);
 36 |         thread_max = max(thread_max, val);
 37 |     }
 38 |     
 39 |     // Store in shared memory
 40 |     shared_min[tid] = thread_min;
 41 |     shared_max[tid] = thread_max;
 42 |     __syncthreads();
 43 |     
 44 |     // Reduce within block
 45 |     for (int s = blockDim.x/2; s > 0; s >>= 1) {
 46 |         if (tid < s) {
 47 |             shared_min[tid] = min(shared_min[tid], shared_min[tid + s]);
 48 |             shared_max[tid] = max(shared_max[tid], shared_max[tid + s]);
 49 |         }
 50 |         __syncthreads();
 51 |     }
 52 |     
 53 |     // Write block result
 54 |     if (tid == 0) {
 55 |         atomicMin((int*)min_val, __float_as_int(shared_min[0]));
 56 |         atomicMax((int*)max_val, __float_as_int(shared_max[0]));
 57 |     }
 58 | }
 59 | 
 60 | // Kernel for linear quantization
 61 | template<typename T>
 62 | __global__ void linearQuantizeKernel(
 63 |     const T* __restrict__ input,
 64 |     int8_t* __restrict__ output,
 65 |     float scale,
 66 |     float zero_point,
 67 |     int size
 68 | ) {
 69 |     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 70 |     if (idx < size) {
 71 |         float val = static_cast<float>(input[idx]);
 72 |         output[idx] = static_cast<int8_t>(round(val / scale + zero_point));
 73 |     }
 74 | }
 75 | 
 76 | // Kernel for linear dequantization
 77 | template<typename T>
 78 | __global__ void linearDequantizeKernel(
 79 |     const int8_t* __restrict__ input,
 80 |     T* __restrict__ output,
 81 |     float scale,
 82 |     float zero_point,
 83 |     int size
 84 | ) {
 85 |     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 86 |     if (idx < size) {
 87 |         float val = (static_cast<float>(input[idx]) - zero_point) * scale;
 88 |         output[idx] = static_cast<T>(val);
 89 |     }
 90 | }
 91 | 
 92 | // Kernel for symmetric quantization
 93 | template<typename T>
 94 | __global__ void symmetricQuantizeKernel(
 95 |     const T* __restrict__ input,
 96 |     int8_t* __restrict__ output,
 97 |     float scale,
 98 |     int size
 99 | ) {
100 |     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
101 |     if (idx < size) {
102 |         float val = static_cast<float>(input[idx]);
103 |         output[idx] = static_cast<int8_t>(round(val / scale));
104 |     }
105 | }
106 | 
107 | // Kernel for symmetric dequantization
108 | template<typename T>
109 | __global__ void symmetricDequantizeKernel(
110 |     const int8_t* __restrict__ input,
111 |     T* __restrict__ output,
112 |     float scale,
113 |     int size
114 | ) {
115 |     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
116 |     if (idx < size) {
117 |         float val = static_cast<float>(input[idx]) * scale;
118 |         output[idx] = static_cast<T>(val);
119 |     }
120 | }
121 | 
122 | // Kernel for per-channel quantization
123 | template<typename T>
124 | __global__ void perChannelQuantizeKernel(
125 |     const T* __restrict__ input,
126 |     int8_t* __restrict__ output,
127 |     const float* __restrict__ scales,
128 |     const float* __restrict__ zero_points,
129 |     int size,
130 |     int channels,
131 |     int elements_per_channel
132 | ) {
133 |     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
134 |     if (idx < size) {
135 |         const int channel = (idx / elements_per_channel) % channels;
136 |         float val = static_cast<float>(input[idx]);
137 |         output[idx] = static_cast<int8_t>(
138 |             round(val / scales[channel] + zero_points[channel])
139 |         );
140 |     }
141 | }
142 | 
143 | // Kernel for per-channel dequantization
144 | template<typename T>
145 | __global__ void perChannelDequantizeKernel(
146 |     const int8_t* __restrict__ input,
147 |     T* __restrict__ output,
148 |     const float* __restrict__ scales,
149 |     const float* __restrict__ zero_points,
150 |     int size,
151 |     int channels,
152 |     int elements_per_channel
153 | ) {
154 |     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
155 |     if (idx < size) {
156 |         const int channel = (idx / elements_per_channel) % channels;
157 |         float val = (static_cast<float>(input[idx]) - zero_points[channel]) 
158 |                    * scales[channel];
159 |         output[idx] = static_cast<T>(val);
160 |     }
161 | }
162 | 
163 | } // anonymous namespace
164 | 
165 | template<typename T>
166 | class Quantizer {
167 | public:
168 |     Quantizer(QuantizationConfig config) : config_(config) {
169 |         CUDA_CHECK(cudaStreamCreate(&stream_));
170 |     }
171 | 
172 |     ~Quantizer() {
173 |         CUDA_CHECK(cudaStreamDestroy(stream_));
174 |     }
175 | 
176 |     void quantize(const Tensor<T>& input, Tensor<int8_t>& output) {
177 |         if (config_.per_channel) {
178 |             quantizePerChannel(input, output);
179 |         } else {
180 |             quantizePerTensor(input, output);
181 |         }
182 |     }
183 | 
184 |     void dequantize(const Tensor<int8_t>& input, Tensor<T>& output) {
185 |         if (config_.per_channel) {
186 |             dequantizePerChannel(input, output);
187 |         } else {
188 |             dequantizePerTensor(input, output);
189 |         }
190 |     }
191 | 
192 | private:
193 |     void quantizePerTensor(const Tensor<T>& input, Tensor<int8_t>& output) {
194 |         // Compute min/max values
195 |         float min_val = FLT_MAX;
196 |         float max_val = -FLT_MAX;
197 |         
198 |         const int block_size = 256;
199 |         const int num_blocks = (input.numel() + block_size - 1) / block_size;
200 |         const int shared_mem_size = 2 * block_size * sizeof(float);
201 |         
202 |         computeMinMaxKernel<T><<<num_blocks, block_size, shared_mem_size, stream_>>>(
203 |             input.data(),
204 |             &min_val,
205 |             &max_val,
206 |             input.numel()
207 |         );
208 |         
209 |         // Compute quantization parameters
210 |         float scale, zero_point;
211 |         if (config_.symmetric) {
212 |             scale = max(abs(min_val), abs(max_val)) / 127.0f;
213 |             zero_point = 0.0f;
214 |             
215 |             symmetricQuantizeKernel<T><<<num_blocks, block_size, 0, stream_>>>(
216 |                 input.data(),
217 |                 output.data(),
218 |                 scale,
219 |                 input.numel()
220 |             );
221 |         } else {
222 |             scale = (max_val - min_val) / 255.0f;
223 |             zero_point = -min_val / scale + 128.0f;
224 |             
225 |             linearQuantizeKernel<T><<<num_blocks, block_size, 0, stream_>>>(
226 |                 input.data(),
227 |                 output.data(),
228 |                 scale,
229 |                 zero_point,
230 |                 input.numel()
231 |             );
232 |         }
233 |         
234 |         // Store quantization parameters
235 |         scales_.resize(1);
236 |         zero_points_.resize(1);
237 |         scales_[0] = scale;
238 |         zero_points_[0] = zero_point;
239 |     }
240 | 
241 |     void dequantizePerTensor(const Tensor<int8_t>& input, Tensor<T>& output) {
242 |         const int block_size = 256;
243 |         const int num_blocks = (input.numel() + block_size - 1) / block_size;
244 |         
245 |         if (config_.symmetric) {
246 |             symmetricDequantizeKernel<T><<<num_blocks, block_size, 0, stream_>>>(
247 |                 input.data(),
248 |                 output.data(),
249 |                 scales_[0],
250 |                 input.numel()
251 |             );
252 |         } else {
253 |             linearDequantizeKernel<T><<<num_blocks, block_size, 0, stream_>>>(
254 |                 input.data(),
255 |                 output.data(),
256 |                 scales_[0],
257 |                 zero_points_[0],
258 |                 input.numel()
259 |             );
260 |         }
261 |     }
262 | 
263 |     void quantizePerChannel(const Tensor<T>& input, Tensor<int8_t>& output) {
264 |         const int num_channels = input.shape()[config_.channel_axis];
265 |         const int elements_per_channel = input.numel() / num_channels;
266 |         
267 |         // Compute per-channel scales and zero points
268 |         scales_.resize(num_channels);
269 |         zero_points_.resize(num_channels);
270 |         
271 |         for (int c = 0; c < num_channels; ++c) {
272 |             float min_val = FLT_MAX;
273 |             float max_val = -FLT_MAX;
274 |             
275 |             // Compute min/max for channel
276 |             const int block_size = 256;
277 |             const int num_blocks = (elements_per_channel + block_size - 1) / block_size;
278 |             const int shared_mem_size = 2 * block_size * sizeof(float);
279 |             
280 |             computeMinMaxKernel<T><<<num_blocks, block_size, shared_mem_size, stream_>>>(
281 |                 input.data() + c * elements_per_channel,
282 |                 &min_val,
283 |                 &max_val,
284 |                 elements_per_channel
285 |             );
286 |             
287 |             // Compute quantization parameters for channel
288 |             if (config_.symmetric) {
289 |                 scales_[c] = max(abs(min_val), abs(max_val)) / 127.0f;
290 |                 zero_points_[c] = 0.0f;
291 |             } else {
292 |                 scales_[c] = (max_val - min_val) / 255.0f;
293 |                 zero_points_[c] = -min_val / scales_[c] + 128.0f;
294 |             }
295 |         }
296 |         
297 |         // Quantize using per-channel parameters
298 |         const int block_size = 256;
299 |         const int num_blocks = (input.numel() + block_size - 1) / block_size;
300 |         
301 |         perChannelQuantizeKernel<T><<<num_blocks, block_size, 0, stream_>>>(
302 |             input.data(),
303 |             output.data(),
304 |             scales_.data(),
305 |             zero_points_.data(),
306 |             input.numel(),
307 |             num_channels,
308 |             elements_per_channel
309 |         );
310 |     }
311 | 
312 |     void dequantizePerChannel(const Tensor<int8_t>& input, Tensor<T>& output) {
313 |         const int num_channels = input.shape()[config_.channel_axis];
314 |         const int elements_per_channel = input.numel() / num_channels;
315 |         
316 |         const int block_size = 256;
317 |         const int num_blocks = (input.numel() + block_size - 1) / block_size;
318 |         
319 |         perChannelDequantizeKernel<T><<<num_blocks, block_size, 0, stream_>>>(
320 |             input.data(),
321 |             output.data(),
322 |             scales_.data(),
323 |             zero_points_.data(),
324 |             input.numel(),
325 |             num_channels,
326 |             elements_per_channel
327 |         );
328 |     }
329 | 
330 |     QuantizationConfig config_;
331 |     cudaStream_t stream_;
332 |     std::vector<float> scales_;
333 |     std::vector<float> zero_points_;
334 | };
335 | 
336 | // Explicit instantiations
337 | template class Quantizer<float>;
338 | template class Quantizer<half>;
339 | 
340 | } // namespace quantization
341 | } // namespace ltm
342 | 


--------------------------------------------------------------------------------
/src/core/ops/fused_ops.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
  2 | #include <cooperative_groups.h>
  3 | #include "core/ops/fused_ops.cuh"
  4 | #include "core/utils/cuda_utils.cuh"
  5 | 
  6 | namespace cg = cooperative_groups;
  7 | 
  8 | namespace ltm {
  9 | namespace ops {
 10 | 
 11 | namespace {
 12 | 
 13 | // Kernel for fused layer normalization and residual connection
 14 | template<typename T>
 15 | __global__ void layerNormResidualKernel(
 16 |     const T* __restrict__ input,
 17 |     const T* __restrict__ residual,
 18 |     const T* __restrict__ gamma,
 19 |     const T* __restrict__ beta,
 20 |     T* __restrict__ output,
 21 |     const int batch_size,
 22 |     const int hidden_dim
 23 | ) {
 24 |     extern __shared__ float s_mem[];
 25 |     float* s_mean = s_mem;
 26 |     float* s_var = &s_mem[blockDim.x];
 27 |     
 28 |     const int tid = threadIdx.x;
 29 |     const int bid = blockIdx.x;
 30 |     
 31 |     if (bid >= batch_size) return;
 32 |     
 33 |     // Step 1: Compute mean
 34 |     float local_sum = 0.0f;
 35 |     for (int i = tid; i < hidden_dim; i += blockDim.x) {
 36 |         const int idx = bid * hidden_dim + i;
 37 |         local_sum += static_cast<float>(input[idx]) + static_cast<float>(residual[idx]);
 38 |     }
 39 |     
 40 |     s_mean[tid] = local_sum;
 41 |     __syncthreads();
 42 |     
 43 |     // Reduce mean
 44 |     for (int stride = blockDim.x/2; stride > 0; stride >>= 1) {
 45 |         if (tid < stride) {
 46 |             s_mean[tid] += s_mean[tid + stride];
 47 |         }
 48 |         __syncthreads();
 49 |     }
 50 |     
 51 |     const float mean = s_mean[0] / hidden_dim;
 52 |     
 53 |     // Step 2: Compute variance
 54 |     local_sum = 0.0f;
 55 |     for (int i = tid; i < hidden_dim; i += blockDim.x) {
 56 |         const int idx = bid * hidden_dim + i;
 57 |         const float val = static_cast<float>(input[idx]) + 
 58 |                          static_cast<float>(residual[idx]) - mean;
 59 |         local_sum += val * val;
 60 |     }
 61 |     
 62 |     s_var[tid] = local_sum;
 63 |     __syncthreads();
 64 |     
 65 |     // Reduce variance
 66 |     for (int stride = blockDim.x/2; stride > 0; stride >>= 1) {
 67 |         if (tid < stride) {
 68 |             s_var[tid] += s_var[tid + stride];
 69 |         }
 70 |         __syncthreads();
 71 |     }
 72 |     
 73 |     const float var = s_var[0] / hidden_dim;
 74 |     const float rsqrt_var = rsqrtf(var + 1e-5f);
 75 |     
 76 |     // Step 3: Normalize and scale
 77 |     for (int i = tid; i < hidden_dim; i += blockDim.x) {
 78 |         const int idx = bid * hidden_dim + i;
 79 |         const float val = static_cast<float>(input[idx]) + 
 80 |                          static_cast<float>(residual[idx]);
 81 |         const float normalized = (val - mean) * rsqrt_var;
 82 |         output[idx] = static_cast<T>(
 83 |             normalized * static_cast<float>(gamma[i]) + static_cast<float>(beta[i])
 84 |         );
 85 |     }
 86 | }
 87 | 
 88 | // Kernel for fused dropout and residual connection
 89 | template<typename T>
 90 | __global__ void dropoutResidualKernel(
 91 |     const T* __restrict__ input,
 92 |     const T* __restrict__ residual,
 93 |     T* __restrict__ output,
 94 |     const float dropout_prob,
 95 |     const unsigned long long seed,
 96 |     const int size
 97 | ) {
 98 |     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 99 |     if (idx >= size) return;
100 |     
101 |     // Generate random number using Philox algorithm
102 |     curandStatePhilox4_32_10_t state;
103 |     curand_init(seed, idx, 0, &state);
104 |     const float rand = curand_uniform(&state);
105 |     
106 |     const float scale = 1.0f / (1.0f - dropout_prob);
107 |     const float val = static_cast<float>(input[idx]);
108 |     const float res = static_cast<float>(residual[idx]);
109 |     
110 |     output[idx] = static_cast<T>(
111 |         (rand > dropout_prob ? val * scale : 0.0f) + res
112 |     );
113 | }
114 | 
115 | // Kernel for fused bias and activation
116 | template<typename T, typename Act>
117 | __global__ void biasActivationKernel(
118 |     const T* __restrict__ input,
119 |     const T* __restrict__ bias,
120 |     T* __restrict__ output,
121 |     const int batch_size,
122 |     const int hidden_dim
123 | ) {
124 |     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
125 |     const int total_size = batch_size * hidden_dim;
126 |     
127 |     if (tid >= total_size) return;
128 |     
129 |     const int bias_idx = tid % hidden_dim;
130 |     const float val = static_cast<float>(input[tid]) + 
131 |                      static_cast<float>(bias[bias_idx]);
132 |     output[tid] = static_cast<T>(Act::forward(val));
133 | }
134 | 
135 | // GELU activation functor
136 | struct GELU {
137 |     __device__ static float forward(float x) {
138 |         // GELU approximation: 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x^3)))
139 |         const float cdf = 0.5f * (1.0f + tanhf(0.797885f * (x + 0.044715f * x * x * x)));
140 |         return x * cdf;
141 |     }
142 | };
143 | 
144 | // ReLU activation functor
145 | struct ReLU {
146 |     __device__ static float forward(float x) {
147 |         return x > 0.0f ? x : 0.0f;
148 |     }
149 | };
150 | 
151 | // Kernel for tensor addition
152 | template<typename T>
153 | __global__ void tensorAddKernel(
154 |     const T* __restrict__ input1,
155 |     const T* __restrict__ input2,
156 |     T* __restrict__ output,
157 |     const float alpha,
158 |     const float beta,
159 |     const int size
160 | ) {
161 |     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
162 |     if (idx >= size) return;
163 |     
164 |     const float val1 = static_cast<float>(input1[idx]);
165 |     const float val2 = static_cast<float>(input2[idx]);
166 |     output[idx] = static_cast<T>(alpha * val1 + beta * val2);
167 | }
168 | 
169 | // Kernel for element-wise multiplication
170 | template<typename T>
171 | __global__ void elementwiseMulKernel(
172 |     const T* __restrict__ input1,
173 |     const T* __restrict__ input2,
174 |     T* __restrict__ output,
175 |     const int size
176 | ) {
177 |     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
178 |     if (idx >= size) return;
179 |     
180 |     const float val1 = static_cast<float>(input1[idx]);
181 |     const float val2 = static_cast<float>(input2[idx]);
182 |     output[idx] = static_cast<T>(val1 * val2);
183 | }
184 | 
185 | } // anonymous namespace
186 | 
187 | template<typename T>
188 | void tensorAdd(
189 |     const Tensor<T>& input1,
190 |     const Tensor<T>& input2,
191 |     Tensor<T>& output,
192 |     float alpha,
193 |     float beta,
194 |     cudaStream_t stream
195 | ) {
196 |     const int size = input1.numel();
197 |     const int block_size = 256;
198 |     const int num_blocks = (size + block_size - 1) / block_size;
199 |     
200 |     tensorAddKernel<T><<<num_blocks, block_size, 0, stream>>>(
201 |         input1.data(),
202 |         input2.data(),
203 |         output.data(),
204 |         alpha,
205 |         beta,
206 |         size
207 |     );
208 |     
209 |     CUDA_CHECK(cudaGetLastError());
210 | }
211 | 
212 | template<typename T>
213 | void elementwiseMul(
214 |     const Tensor<T>& input1,
215 |     const Tensor<T>& input2,
216 |     Tensor<T>& output,
217 |     cudaStream_t stream
218 | ) {
219 |     const int size = input1.numel();
220 |     const int block_size = 256;
221 |     const int num_blocks = (size + block_size - 1) / block_size;
222 |     
223 |     elementwiseMulKernel<T><<<num_blocks, block_size, 0, stream>>>(
224 |         input1.data(),
225 |         input2.data(),
226 |         output.data(),
227 |         size
228 |     );
229 |     
230 |     CUDA_CHECK(cudaGetLastError());
231 | }
232 | 
233 | template<typename T>
234 | void layerNormResidual(
235 |     const Tensor<T>& input,
236 |     const Tensor<T>& residual,
237 |     const Tensor<T>& gamma,
238 |     const Tensor<T>& beta,
239 |     Tensor<T>& output,
240 |     cudaStream_t stream
241 | ) {
242 |     const int batch_size = input.shape()[0];
243 |     const int hidden_dim = input.shape()[1];
244 |     
245 |     const int block_size = 256;
246 |     const int shared_mem_size = 2 * block_size * sizeof(float);
247 |     
248 |     layerNormResidualKernel<T><<<batch_size, block_size, shared_mem_size, stream>>>(
249 |         input.data(),
250 |         residual.data(),
251 |         gamma.data(),
252 |         beta.data(),
253 |         output.data(),
254 |         batch_size,
255 |         hidden_dim
256 |     );
257 |     
258 |     CUDA_CHECK(cudaGetLastError());
259 | }
260 | 
261 | template<typename T>
262 | void dropoutResidual(
263 |     const Tensor<T>& input,
264 |     const Tensor<T>& residual,
265 |     Tensor<T>& output,
266 |     float dropout_prob,
267 |     unsigned long long seed,
268 |     cudaStream_t stream
269 | ) {
270 |     const int size = input.numel();
271 |     const int block_size = 256;
272 |     const int num_blocks = (size + block_size - 1) / block_size;
273 |     
274 |     dropoutResidualKernel<T><<<num_blocks, block_size, 0, stream>>>(
275 |         input.data(),
276 |         residual.data(),
277 |         output.data(),
278 |         dropout_prob,
279 |         seed,
280 |         size
281 |     );
282 |     
283 |     CUDA_CHECK(cudaGetLastError());
284 | }
285 | 
286 | template<typename T>
287 | void biasGeluFused(
288 |     const Tensor<T>& input,
289 |     const Tensor<T>& bias,
290 |     Tensor<T>& output,
291 |     cudaStream_t stream
292 | ) {
293 |     const int batch_size = input.shape()[0];
294 |     const int hidden_dim = input.shape()[1];
295 |     const int total_size = batch_size * hidden_dim;
296 |     
297 |     const int block_size = 256;
298 |     const int num_blocks = (total_size + block_size - 1) / block_size;
299 |     
300 |     biasActivationKernel<T, GELU><<<num_blocks, block_size, 0, stream>>>(
301 |         input.data(),
302 |         bias.data(),
303 |         output.data(),
304 |         batch_size,
305 |         hidden_dim
306 |     );
307 |     
308 |     CUDA_CHECK(cudaGetLastError());
309 | }
310 | 
311 | template<typename T>
312 | void biasReluFused(
313 |     const Tensor<T>& input,
314 |     const Tensor<T>& bias,
315 |     Tensor<T>& output,
316 |     cudaStream_t stream
317 | ) {
318 |     const int batch_size = input.shape()[0];
319 |     const int hidden_dim = input.shape()[1];
320 |     const int total_size = batch_size * hidden_dim;
321 |     
322 |     const int block_size = 256;
323 |     const int num_blocks = (total_size + block_size - 1) / block_size;
324 |     
325 |     biasActivationKernel<T, ReLU><<<num_blocks, block_size, 0, stream>>>(
326 |         input.data(),
327 |         bias.data(),
328 |         output.data(),
329 |         batch_size,
330 |         hidden_dim
331 |     );
332 |     
333 |     CUDA_CHECK(cudaGetLastError());
334 | }
335 | 
336 | // Explicit instantiations
337 | template void tensorAdd<float>(
338 |     const Tensor<float>&, const Tensor<float>&,
339 |     Tensor<float>&, float, float, cudaStream_t
340 | );
341 | template void tensorAdd<half>(
342 |     const Tensor<half>&, const Tensor<half>&,
343 |     Tensor<half>&, float, float, cudaStream_t
344 | );
345 | 
346 | template void elementwiseMul<float>(
347 |     const Tensor<float>&, const Tensor<float>&,
348 |     Tensor<float>&, cudaStream_t
349 | );
350 | template void elementwiseMul<half>(
351 |     const Tensor<half>&, const Tensor<half>&,
352 |     Tensor<half>&, cudaStream_t
353 | );
354 | 
355 | template void layerNormResidual<float>(
356 |     const Tensor<float>&, const Tensor<float>&,
357 |     const Tensor<float>&, const Tensor<float>&,
358 |     Tensor<float>&, cudaStream_t
359 | );
360 | template void layerNormResidual<half>(
361 |     const Tensor<half>&, const Tensor<half>&,
362 |     const Tensor<half>&, const Tensor<half>&,
363 |     Tensor<half>&, cudaStream_t
364 | );
365 | 
366 | template void dropoutResidual<float>(
367 |     const Tensor<float>&, const Tensor<float>&,
368 |     Tensor<float>&, float, unsigned long long, cudaStream_t
369 | );
370 | template void dropoutResidual<half>(
371 |     const Tensor<half>&, const Tensor<half>&,
372 |     Tensor<half>&, float, unsigned long long, cudaStream_t
373 | );
374 | 
375 | template void biasGeluFused<float>(
376 |     const Tensor<float>&, const Tensor<float>&,
377 |     Tensor<float>&, cudaStream_t
378 | );
379 | template void biasGeluFused<half>(
380 |     const Tensor<half>&, const Tensor<half>&,
381 |     Tensor<half>&, cudaStream_t
382 | );
383 | 
384 | template void biasReluFused<float>(
385 |     const Tensor<float>&, const Tensor<float>&,
386 |     Tensor<float>&, cudaStream_t
387 | );
388 | template void biasReluFused<half>(
389 |     const Tensor<half>&, const Tensor<half>&,
390 |     Tensor<half>&, cudaStream_t
391 | );
392 | 
393 | } // namespace ops
394 | } // namespace ltm
395 | 


--------------------------------------------------------------------------------