├── .clang-format ├── .gitattributes ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── README.md ├── bench ├── CMakeLists.txt ├── bench.cpp └── timer.h ├── clang-format.cmake ├── fasttensor ├── Assign.hpp ├── CMakeLists.txt ├── CWiseBinaryOp.hpp ├── DefaultDevice.hpp ├── Device.hpp ├── DeviceFactory.hpp ├── DeviceProperties.hpp ├── GpuDevice.hpp ├── GpuDeviceFunction.hpp ├── Memory.hpp ├── RefSelector.hpp ├── Simd │ ├── Avx2.hpp │ ├── Generic.hpp │ ├── Simd.hpp │ └── SimdMacros.hpp ├── StorageUnwrapper.hpp ├── Tensor.hpp ├── TensorExpression.fwd.hpp ├── TensorExpression.hpp ├── TensorStorage.hpp ├── TensorStorageRef.hpp └── UnrollUtils.hpp └── tests ├── CMakeLists.txt ├── cwiseops.cpp └── test.cpp /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -2 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: Right 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: All 15 | AllowShortIfStatementsOnASingleLine: false 16 | AllowShortLoopsOnASingleLine: false 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: false 20 | AlwaysBreakTemplateDeclarations: Yes 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | AfterExternBlock: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: false 36 | SplitEmptyFunction: true 37 | SplitEmptyRecord: true 38 | SplitEmptyNamespace: true 39 | BreakBeforeBinaryOperators: None 40 | BreakBeforeBraces: Attach 41 | BreakBeforeInheritanceComma: false 42 | BreakInheritanceList: BeforeColon 43 | BreakBeforeTernaryOperators: true 44 | BreakConstructorInitializersBeforeComma: false 45 | BreakConstructorInitializers: BeforeColon 46 | BreakAfterJavaFieldAnnotations: false 47 | BreakStringLiterals: true 48 | ColumnLimit: 100 49 | CommentPragmas: '^ IWYU pragma:' 50 | CompactNamespaces: false 51 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 52 | ConstructorInitializerIndentWidth: 4 53 | ContinuationIndentWidth: 4 54 | Cpp11BracedListStyle: true 55 | DerivePointerAlignment: false 56 | DisableFormat: false 57 | ExperimentalAutoDetectBinPacking: false 58 | FixNamespaceComments: true 59 | ForEachMacros: 60 | - foreach 61 | - Q_FOREACH 62 | - BOOST_FOREACH 63 | IncludeBlocks: Preserve 64 | IncludeCategories: 65 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 66 | Priority: 2 67 | - Regex: '^(<|"(gtest|gmock|isl|json)/)' 68 | Priority: 3 69 | - Regex: '.*' 70 | Priority: 1 71 | IncludeIsMainRegex: '(Test)?$' 72 | IndentCaseLabels: false 73 | IndentPPDirectives: AfterHash 74 | IndentWidth: 2 75 | IndentWrappedFunctionNames: false 76 | JavaScriptQuotes: Leave 77 | JavaScriptWrapImports: true 78 | KeepEmptyLinesAtTheStartOfBlocks: true 79 | MacroBlockBegin: '' 80 | MacroBlockEnd: '' 81 | MaxEmptyLinesToKeep: 1 82 | NamespaceIndentation: None 83 | ObjCBinPackProtocolList: Auto 84 | ObjCBlockIndentWidth: 2 85 | ObjCSpaceAfterProperty: false 86 | 
ObjCSpaceBeforeProtocolList: true 87 | PenaltyBreakAssignment: 2 88 | PenaltyBreakBeforeFirstCallParameter: 19 89 | PenaltyBreakComment: 300 90 | PenaltyBreakFirstLessLess: 120 91 | PenaltyBreakString: 1000 92 | PenaltyBreakTemplateDeclaration: 10 93 | PenaltyExcessCharacter: 1000000 94 | PenaltyReturnTypeOnItsOwnLine: 60 95 | PointerAlignment: Right 96 | ReflowComments: true 97 | SortIncludes: true 98 | SortUsingDeclarations: true 99 | SpaceAfterCStyleCast: false 100 | SpaceAfterTemplateKeyword: true 101 | SpaceBeforeAssignmentOperators: true 102 | SpaceBeforeCpp11BracedList: false 103 | SpaceBeforeCtorInitializerColon: true 104 | SpaceBeforeInheritanceColon: true 105 | SpaceBeforeParens: ControlStatements 106 | SpaceBeforeRangeBasedForLoopColon: true 107 | SpaceInEmptyParentheses: false 108 | SpacesBeforeTrailingComments: 1 109 | SpacesInAngles: false 110 | SpacesInContainerLiterals: true 111 | SpacesInCStyleCastParentheses: false 112 | SpacesInParentheses: false 113 | SpacesInSquareBrackets: false 114 | Standard: Cpp11 115 | StatementMacros: 116 | - Q_UNUSED 117 | - QT_REQUIRE_VERSION 118 | TabWidth: 8 119 | UseTab: Never 120 | ... 121 | 122 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | 4 | # Explicitly declare text files you want to always be normalized and converted 5 | # to native line endings on checkout. 6 | *.c text 7 | *.cc text 8 | *.cxx text 9 | *.cpp text 10 | *.c++ text 11 | *.hpp text 12 | *.h text 13 | *.h++ text 14 | *.hh text 15 | *.txt text 16 | *.md text 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | 3 | fasttensor.sublime-project 4 | fasttensor.sublime-workspace 5 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ext/googletest"] 2 | path = ext/googletest 3 | url = https://github.com/google/googletest 4 | branch = master 5 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14.6) 2 | project(fasttensor LANGUAGES CXX) 3 | 4 | set(CMAKE_CXX_FLAGS "-Wall -Wextra") 5 | set(CMAKE_CXX_FLAGS_DEBUG "-g") 6 | set(CMAKE_CXX_FLAGS_RELEASE "-O3") 7 | 8 | set(USE_LIBCPP ON CACHE BOOL "Use libc++ if compiling with clang") 9 | if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" AND USE_LIBCPP) 10 | set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -stdlib=libc++") 11 | endif() 12 | 13 | option(FORCE_COLORED_OUTPUT "Always produce ANSI-colored output (GNU/Clang only)." 
TRUE) 14 | if(${FORCE_COLORED_OUTPUT}) 15 | if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") 16 | add_compile_options(-fdiagnostics-color=always) 17 | elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 18 | add_compile_options(-fcolor-diagnostics) 19 | endif() 20 | endif() 21 | 22 | include(clang-format.cmake) 23 | 24 | option(BUILD_TESTS "Set to ON to build tests" ON) 25 | option(BUILD_BENCHMARKS "Set to ON to build benchmarks" ON) 26 | 27 | set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) 28 | 29 | add_subdirectory(fasttensor) 30 | 31 | if(BUILD_TESTS) 32 | add_subdirectory(ext/googletest) 33 | add_subdirectory(tests) 34 | endif() 35 | 36 | if(BUILD_BENCHMARKS) 37 | add_subdirectory(bench) 38 | endif() 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fasttensor 2 | 3 | C++ library for tensor arithmetic. 4 | 5 | Uses SIMD for CPU acceleration and CUDA for GPU acceleration. Supports multiple GPUs if more than one is available. [Kernel fusion](https://stackoverflow.com/a/53311373) with [expression templates](https://en.wikipedia.org/wiki/Expression_templates) allows efficient computation of long arithmetic expressions. 6 | 7 | ## Usage 8 | 9 | fasttensor is header-only; simply add the location of the header files to your include path while compiling. 10 | 11 | Example code: 12 | 13 | ```cpp 14 | using namespace fasttensor; 15 | 16 | int main() { 17 | int num_rows = 4; 18 | int num_cols = 2; 19 | // Create integer tensor of rank 2 20 | // Dimensions: 4 rows, 2 columns (4x2) 21 | Tensor<int, 2> a(array{num_rows, num_cols}); 22 | Tensor<int, 2> b(array{num_rows, num_cols}); 23 | 24 | for (int i = 0; i < num_rows; ++i) { 25 | for (int j = 0; j < num_cols; ++j) { 26 | // This is how you set/get elements 27 | a(i, j) = j + num_cols * i; 28 | b(i, j) = j + num_cols * i; 29 | } 30 | } 31 | 32 | Tensor<int, 2> results(array{num_rows, num_cols}); 33 | 34 | // Element-wise addition of the two tensors 35 | // This will auto-magically use GPU/SIMD instructions 36 | // Need to compile with appropriate compiler flags and hardware 37 | results = a + b; 38 | 39 | for (int i = 0; i < num_rows; ++i) { 40 | for (int j = 0; j < num_cols; ++j) { 41 | // Just checking if we got the right answer 42 | assert(results(i, j) == 2 * (j + num_cols * i)); 43 | } 44 | } 45 | 46 | return 0; 47 | } 48 | ``` 49 | 50 | ## Benchmarks 51 | 52 | **Eager mode** is equivalent to a naive implementation of arithmetic expressions, creating a temporary variable after each operation. This behaviour was simulated with a helper function that forces eager evaluation of a given arithmetic expression. 53 | 54 | **Lazy mode** constructs an expression at compile time using expression templates and only evaluates the expression when it is assigned to a tensor. A short code sketch contrasting the two modes follows the results table below. 55 | 56 | ### Config: 57 | 58 | **CPU**: Intel Xeon E5-2690 v3 @ 2.60 GHz 59 | **GPU**: NVidia Tesla P4 60 | **Compiler**: Clang 9.0.1 61 | **CUDA Toolkit Version**: 10.0 62 | 63 | ### Results: 64 | 65 | - The variables are 3-dimensional float tensors of size 10^4 × 10^2 × 10^2 filled with random values. 66 | - The results were obtained by running 10 trials. 67 | - Each trial consisted of evaluating the expression 100 times. 68 | 69 | 70 | #### X = A + B + C + D 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 |
| Devices | Eager: Time | Eager: GFlops | Lazy: Time | Lazy: GFlops |
| --- | --- | --- | --- | --- |
| AVX2 on CPU | 28.26 ± 0.21s | 0.99 | 17.73 ± 0.05s | 1.58 |
| 1 Tesla P4 GPU | 2.65 ± 0.00s | 10.56 | 1.51 ± 0.12s | 18.52 |
| 2 Tesla P4 GPUs | 1.56 ± 0.20s | 17.92 | 0.89 ± 0.08s | 31.25 |
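The two modes correspond to the two cases timed in `bench/bench.cpp`: `result = a + b + c + d` for lazy mode and the `eager_eval` helper for eager mode. The snippet below is a minimal sketch of the difference, not part of the benchmark itself; the function name `lazy_vs_eager` and the `Tensor<float, 3>` signature are illustrative assumptions based on how the tensors are used in the tests and benchmark.

```cpp
#include "Assign.hpp"
#include "CWiseBinaryOp.hpp"
#include "Tensor.hpp"

using namespace fasttensor;

// X = A + B + C + D written in the two styles measured above.
void lazy_vs_eager(Tensor<float, 3> &a, Tensor<float, 3> &b, Tensor<float, 3> &c,
                   Tensor<float, 3> &d, Tensor<float, 3> &x) {
  // Lazy: a + b + c + d only builds a CWiseBinaryOp expression tree;
  // the assignment then evaluates it in one fused pass over the data.
  x = a + b + c + d;

  // Eager: force an intermediate result after every '+', which is what the
  // eager_eval helper in bench/bench.cpp simulates -- three separate passes.
  x = a + b;
  x = x + c;
  x = x + d;
}
```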
106 | 107 | ## Development 108 | 109 | To run the tests and benchmarks on Linux: 110 | 111 | (Dependencies: CMake >= 3.14.6, clang++ >= 8, CUDA >= 9) 112 | 113 | 1. Clone [this repo](https://github.com/JHurricane96/fasttensor) 114 | 115 | 2. `mkdir build && cd build` 116 | 117 | 3. Run CMake to generate build files (detailed instructions below). Add `-DBUILD_TESTS=OFF` to skip building the tests, `-DBUILD_BENCHMARKS=OFF` to skip building the benchmarks. 118 | 119 | 4. `cmake --build .` 120 | 121 | 5. `./tests` to run the tests and `./bench/bench` to run the benchmarks 122 | 123 | ### Running CMake 124 | 125 | The build can be configured with various build options. The full command to run is: 126 | 127 | ``` 128 | CXX=<path to clang++> CC=<path to clang> cmake .. \ 129 | -DDEVICE_TYPE=<NORMAL|SIMD|GPU> -DCMAKE_BUILD_TYPE=<Release|Debug> \ 130 | -DCUDA_PATH=<path to CUDA toolkit> -DGPU_ARCH=<GPU compute capability> 131 | ``` 132 | 133 | - Use `CXX` and `CC` to set the C++ and C compilers to clang. 134 | - Set `DEVICE_TYPE` to `NORMAL` for normal CPU mode, `SIMD` to use SIMD vectorized instructions, and `GPU` to use the GPU. 135 | - Set `CMAKE_BUILD_TYPE` to `Release` or `Debug` depending on your need. 136 | - Set `CUDA_PATH` to the location of the CUDA toolkit, and `GPU_ARCH` to the GPU's CUDA compute capability (3.7 means you should set it to 37, that is, simply remove the decimal). These options are only required if `DEVICE_TYPE` is `GPU`. 137 | -------------------------------------------------------------------------------- /bench/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14.6) 2 | 3 | add_executable(bench bench.cpp) 4 | target_link_libraries(bench fasttensor rt) 5 | target_compile_options(bench PRIVATE -O3) 6 | target_compile_options(bench PRIVATE -DNDEBUG) 7 | target_compile_options(bench PRIVATE -march=native) 8 | -------------------------------------------------------------------------------- /bench/bench.cpp: -------------------------------------------------------------------------------- 1 | #include "Assign.hpp" 2 | #include "CWiseBinaryOp.hpp" 3 | #include "Tensor.hpp" 4 | #include "timer.h" 5 | #include <array> 6 | #include <cmath> 7 | #include <iostream> 8 | #include <random> 9 | 10 | using namespace std; 11 | using namespace fasttensor; 12 | 13 | const int TRIES = 10; 14 | const int REPEAT = 100; 15 | 16 | template 17 | Tensor make_rand_tensor(array dimensions) { 18 | static_assert(is_floating_point::value); 19 | static std::random_device rd; 20 | static std::mt19937 gen(rd()); 21 | static std::uniform_real_distribution dis(-100.0, 100.0); 22 | 23 | Tensor t(dimensions); 24 | auto num_elts = t.num_elements(); 25 | auto &_storage = t.storage(); 26 | for (int i = 0; i < num_elts; ++i) { 27 | _storage.storeCoeff(dis(gen), i); 28 | } 29 | return t; 30 | } 31 | 32 | double square(const double num) { return num * num; } 33 | 34 | double timer_mean(const BenchTimer &timer) { return timer.total() / TRIES; } 35 | 36 | double timer_sd(const BenchTimer &timer) { 37 | double mean_squares = timer.squared_total() / TRIES; 38 | double squared_mean = square(timer_mean(timer)); 39 | return sqrt(mean_squares - squared_mean); 40 | } 41 | 42 | void print_results(string test_name, const BenchTimer &timer, double flops_factor) { 43 | auto mean = timer_mean(timer); 44 | auto sd = timer_sd(timer); 45 | auto flops = flops_factor * REPEAT * TRIES / (pow(1024., 3) * timer.total()); 46 | std::cout << test_name << " :: Mean: " << mean << "s, SD: " << sd << "s; " << flops 47 | << " GFlops\n"; 48 | } 49 | 50 | template 51 | inline void _eager_eval(Result &result, Operand
&operand, OtherOperands &... other_operands) { 52 | result = result + operand; 53 | if constexpr (sizeof...(other_operands) > 0) { 54 | _eager_eval(result, other_operands...); 55 | } 56 | } 57 | 58 | template 59 | inline void eager_eval(Result &result, Operand1 &operand1, Operand2 &operand2, 60 | OtherOperands &... other_operands) { 61 | result = operand1 + operand2; 62 | _eager_eval(result, other_operands...); 63 | } 64 | 65 | int main() { 66 | ptrdiff_t row = 1E4, col = 1E2, dep = 1E2; 67 | array dimensions{row, col, dep}; 68 | auto a = make_rand_tensor(dimensions); 69 | auto b = make_rand_tensor(dimensions); 70 | auto c = make_rand_tensor(dimensions); 71 | auto d = make_rand_tensor(dimensions); 72 | Tensor result(dimensions); 73 | 74 | BenchTimer timer; 75 | 76 | BENCH(timer, TRIES, REPEAT, result = a + b + c + d); 77 | print_results("a+b+c+d (Lazy)", timer, double(row * col * dep * 3)); 78 | std::cout << result.getCoeff(0) << "\n"; 79 | 80 | BENCH(timer, TRIES, REPEAT, eager_eval(result, a, b, c, d)); 81 | print_results("a+b+c+d (Eager)", timer, double(row * col * dep * 3)); 82 | std::cout << result.getCoeff(0) << "\n"; 83 | 84 | return 0; 85 | } 86 | -------------------------------------------------------------------------------- /bench/timer.h: -------------------------------------------------------------------------------- 1 | // The original file is part of Eigen, a lightweight C++ template library 2 | // for linear algebra. 3 | // 4 | // Original work Copyright (C) 2008-2010 Gael Guennebaud 5 | // Original work Copyright (C) 2009 Benoit Jacob 6 | // Modified work Copyright (C) 2019 Arun Ramachandran 7 | // 8 | // This Source Code Form is subject to the terms of the Mozilla 9 | // Public License v. 2.0. If a copy of the MPL was not distributed 10 | // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 11 | 12 | #pragma once 13 | 14 | #include 15 | 16 | #if defined(_WIN32) || defined(__CYGWIN__) 17 | # ifndef NOMINMAX 18 | # define NOMINMAX 19 | # define FASTTENSOR_BT_UNDEF_NOMINMAX 20 | # endif 21 | # ifndef WIN32_LEAN_AND_MEAN 22 | # define WIN32_LEAN_AND_MEAN 23 | # define FASTTENSOR_BT_UNDEF_WIN32_LEAN_AND_MEAN 24 | # endif 25 | # include 26 | #elif defined(__APPLE__) 27 | # include 28 | #else 29 | # include 30 | # include 31 | #endif 32 | 33 | static void escape(void *p) { asm volatile("" : : "g"(p) : "memory"); } 34 | 35 | static void clobber() { asm volatile("" : : : "memory"); } 36 | 37 | enum { CPU_TIMER = 0, REAL_TIMER = 1 }; 38 | 39 | /** Elapsed time timer keeping the best try. 40 | * 41 | * On POSIX platforms we use clock_gettime with CLOCK_PROCESS_CPUTIME_ID. 
42 | * On Windows we use QueryPerformanceCounter 43 | * 44 | * Important: on linux, you must link with -lrt 45 | */ 46 | class BenchTimer { 47 | public: 48 | BenchTimer() { 49 | #if defined(_WIN32) || defined(__CYGWIN__) 50 | LARGE_INTEGER freq; 51 | QueryPerformanceFrequency(&freq); 52 | m_frequency = (double)freq.QuadPart; 53 | #endif 54 | reset(); 55 | } 56 | 57 | ~BenchTimer() {} 58 | 59 | inline void reset() { 60 | m_bests.fill(1e9); 61 | m_worsts.fill(0); 62 | m_totals.fill(0); 63 | m_squared_totals.fill(0); 64 | } 65 | inline void start() { 66 | m_starts[CPU_TIMER] = getCpuTime(); 67 | m_starts[REAL_TIMER] = getRealTime(); 68 | } 69 | inline void stop() { 70 | m_times[CPU_TIMER] = getCpuTime() - m_starts[CPU_TIMER]; 71 | m_times[REAL_TIMER] = getRealTime() - m_starts[REAL_TIMER]; 72 | m_bests[0] = std::min(m_bests[0], m_times[0]); 73 | m_bests[1] = std::min(m_bests[1], m_times[1]); 74 | m_worsts[0] = std::max(m_worsts[0], m_times[0]); 75 | m_worsts[1] = std::max(m_worsts[1], m_times[1]); 76 | m_totals[0] += m_times[0]; 77 | m_totals[1] += m_times[1]; 78 | m_squared_totals[0] += m_times[0] * m_times[0]; 79 | m_squared_totals[1] += m_times[1] * m_times[1]; 80 | } 81 | 82 | /** Return the elapsed time in seconds between the last start/stop pair 83 | */ 84 | inline double value(int TIMER = CPU_TIMER) const { return m_times[TIMER]; } 85 | 86 | /** Return the best elapsed time in seconds 87 | */ 88 | inline double best(int TIMER = CPU_TIMER) const { return m_bests[TIMER]; } 89 | 90 | /** Return the worst elapsed time in seconds 91 | */ 92 | inline double worst(int TIMER = CPU_TIMER) const { return m_worsts[TIMER]; } 93 | 94 | /** Return the total elapsed time in seconds. 95 | */ 96 | inline double total(int TIMER = CPU_TIMER) const { return m_totals[TIMER]; } 97 | 98 | /** Return the total of squares of elapsed time in seconds. 
99 | */ 100 | inline double squared_total(int TIMER = CPU_TIMER) const { return m_squared_totals[TIMER]; } 101 | 102 | inline double getCpuTime() const { 103 | #ifdef _WIN32 104 | LARGE_INTEGER query_ticks; 105 | QueryPerformanceCounter(&query_ticks); 106 | return query_ticks.QuadPart / m_frequency; 107 | #elif __APPLE__ 108 | return double(mach_absolute_time()) * 1e-9; 109 | #else 110 | timespec ts; 111 | clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); 112 | return double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec); 113 | #endif 114 | } 115 | 116 | inline double getRealTime() const { 117 | #ifdef _WIN32 118 | SYSTEMTIME st; 119 | GetSystemTime(&st); 120 | return (double)st.wSecond + 1.e-3 * (double)st.wMilliseconds; 121 | #elif __APPLE__ 122 | return double(mach_absolute_time()) * 1e-9; 123 | #else 124 | timespec ts; 125 | clock_gettime(CLOCK_REALTIME, &ts); 126 | return double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec); 127 | #endif 128 | } 129 | 130 | protected: 131 | #if defined(_WIN32) || defined(__CYGWIN__) 132 | double m_frequency; 133 | #endif 134 | std::array m_starts; 135 | std::array m_times; 136 | std::array m_bests; 137 | std::array m_worsts; 138 | std::array m_totals; 139 | std::array m_squared_totals; 140 | }; 141 | 142 | #define BENCH(TIMER, TRIES, REP, CODE) \ 143 | { \ 144 | TIMER.reset(); \ 145 | for (int uglyvarname1 = 0; uglyvarname1 < TRIES; ++uglyvarname1) { \ 146 | TIMER.start(); \ 147 | for (int uglyvarname2 = 0; uglyvarname2 < REP; ++uglyvarname2) { \ 148 | CODE; \ 149 | } \ 150 | TIMER.stop(); \ 151 | clobber(); \ 152 | } \ 153 | } 154 | 155 | // clean #defined tokens 156 | #ifdef FASTTENSOR_BT_UNDEF_NOMINMAX 157 | # undef FASTTENSOR_BT_UNDEF_NOMINMAX 158 | # undef NOMINMAX 159 | #endif 160 | 161 | #ifdef FASTTENSOR_BT_UNDEF_WIN32_LEAN_AND_MEAN 162 | # undef FASTTENSOR_BT_UNDEF_WIN32_LEAN_AND_MEAN 163 | # undef WIN32_LEAN_AND_MEAN 164 | #endif 165 | -------------------------------------------------------------------------------- /clang-format.cmake: -------------------------------------------------------------------------------- 1 | # Creates additional target to perform clang-format run, requires clang-format 2 | 3 | # Find clang format 4 | find_program(CLANG_FORMAT_EXECUTABLE "clang-format") 5 | if(NOT CLANG_FORMAT_EXECUTABLE) 6 | return() 7 | endif() 8 | 9 | # Get all project files 10 | file(GLOB_RECURSE ALL_SOURCE_FILES fasttensor/*.hpp tests/*.cpp bench/*.h bench/*.cpp) 11 | 12 | # Add target to build 13 | add_custom_target( 14 | clangformat 15 | COMMAND ${CLANG_FORMAT_EXECUTABLE} 16 | -style=file 17 | -i 18 | ${ALL_SOURCE_FILES} 19 | ) -------------------------------------------------------------------------------- /fasttensor/Assign.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Device.hpp" 4 | #include "StorageUnwrapper.hpp" 5 | #include "Tensor.hpp" 6 | 7 | #if defined FASTTENSOR_GPU 8 | # include "GpuDevice.hpp" 9 | #endif 10 | 11 | namespace fasttensor { 12 | 13 | #if defined FASTTENSOR_SIMD || defined FASTTENSOR_NORMAL 14 | 15 | template > 17 | inline void Assign(Tensor &lhs, OtherExpr const &rhs) { 18 | auto &storage = lhs.storage(); 19 | if constexpr (device_type == DeviceType::Simd && 20 | simd::PacketTraits::is_vectorizable) { 21 | auto packet_size = simd::PacketTraits::size; 22 | auto num_packets = storage.num_elements() / packet_size; 23 | for (std::ptrdiff_t i = 0; i < num_packets; ++i) { 24 | storage.storePacket(i, rhs.getPacket(i)); 25 | } 26 | for (std::ptrdiff_t i = num_packets 
* packet_size; i < storage.num_elements(); ++i) { 27 | storage.storeCoeff(rhs.getCoeff(i), i); 28 | } 29 | } else { 30 | for (std::ptrdiff_t i = 0; i < storage.num_elements(); ++i) { 31 | storage.storeCoeff(rhs.getCoeff(i), i); 32 | } 33 | } 34 | } 35 | 36 | #elif defined FASTTENSOR_GPU 37 | 38 | template > 39 | __global__ void Kernel(ElementType *lhs_storage, OtherExpr rhs, int start_offset, int end_offset) { 40 | int index = blockIdx.x * blockDim.x + threadIdx.x + start_offset; 41 | int stride = blockDim.x * gridDim.x; 42 | for (int i = index; i < end_offset; i += stride) { 43 | lhs_storage[i] = rhs.getCoeff(i); 44 | } 45 | } 46 | 47 | template > 49 | inline void Assign(Tensor &lhs, OtherExpr const &rhs) { 50 | auto unwrapped_rhs = UnwrapStorage(rhs); 51 | 52 | auto &device_props = lhs.device().deviceProps(); 53 | auto num_devices = device_props.size(); 54 | auto num_elements = lhs.num_elements(); 55 | auto num_elts_per_device_floored = num_elements / num_devices; 56 | auto partition_point = num_devices - (num_elements % num_devices); 57 | 58 | auto num_calculted_elts = 0; 59 | for (int i = 0; i < num_devices; ++i) { 60 | cudaSetDevice(i); 61 | 62 | auto &device = device_props[i]; 63 | auto block_size = device.blockSize(); 64 | auto max_blocks = device.maxBlocks(); 65 | decltype(num_elements) num_elts_current_device = 0; 66 | 67 | if (i >= partition_point) { 68 | num_elts_current_device = num_elts_per_device_floored + 1; 69 | } else { 70 | num_elts_current_device = num_elts_per_device_floored; 71 | } 72 | 73 | int num_blocks = std::min((decltype(num_elements))max_blocks, 74 | (num_elts_current_device + block_size - 1) / block_size); 75 | auto end_offset = num_calculted_elts + num_elts_current_device; 76 | 77 | Kernel<<>>(lhs.storage().elements(), unwrapped_rhs, num_calculted_elts, 78 | end_offset); 79 | 80 | num_calculted_elts = end_offset; 81 | } 82 | 83 | for (int i = 0; i < device_props.size(); ++i) { 84 | cudaSetDevice(i); 85 | cudaDeviceSynchronize(); 86 | } 87 | } 88 | 89 | #endif 90 | 91 | template 92 | template 93 | inline Tensor & 94 | Tensor::operator=(OtherExpr const &other) { 95 | Assign(*this, other); 96 | return *this; 97 | } 98 | 99 | } // namespace fasttensor 100 | -------------------------------------------------------------------------------- /fasttensor/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14.6) 2 | 3 | add_library(fasttensor INTERFACE) 4 | 5 | target_include_directories(fasttensor INTERFACE 6 | $ 7 | $ 8 | ) 9 | 10 | set(DEVICE_TYPE "SIMD" CACHE STRING "Device type") 11 | message("** Device type is set to ${DEVICE_TYPE}") 12 | 13 | set(CUDA_PATH "" CACHE PATH "Path to CUDA toolkit") 14 | set(GPU_ARCH "" CACHE STRING "CUDA GPU compute architecture") 15 | 16 | if(DEVICE_TYPE STREQUAL "SIMD") 17 | target_compile_definitions(fasttensor INTERFACE FASTTENSOR_SIMD) 18 | elseif(DEVICE_TYPE STREQUAL "GPU") 19 | target_compile_definitions(fasttensor INTERFACE FASTTENSOR_GPU) 20 | target_compile_options(fasttensor INTERFACE -x cuda -pthread --cuda-gpu-arch=sm_${GPU_ARCH} --cuda-path=${CUDA_PATH}) 21 | target_link_options(fasttensor INTERFACE -lcudart -ldl -lrt -L${CUDA_PATH}/lib64) 22 | endif() 23 | 24 | target_compile_features(fasttensor INTERFACE cxx_std_17) 25 | 26 | install(TARGETS fasttensor EXPORT fasttensor_config 27 | ARCHIVE DESTINATION lib 28 | LIBRARY DESTINATION lib 29 | RUNTIME DESTINATION bin 30 | ) 31 | 32 | install(EXPORT fasttensor_config DESTINATION lib) 33 | 
install(DIRECTORY ./ DESTINATION fasttensor) 34 | -------------------------------------------------------------------------------- /fasttensor/CWiseBinaryOp.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "GpuDeviceFunction.hpp" 4 | #include "RefSelector.hpp" 5 | #include "Simd/Simd.hpp" 6 | #include "Tensor.hpp" 7 | #include "TensorExpression.hpp" 8 | 9 | namespace fasttensor { 10 | 11 | enum class BinaryOp { Plus, Minus, Multiplies, Divides }; 12 | 13 | template 14 | class CWiseBinaryOp; 15 | 16 | template 17 | class CWiseBinaryOp : public TensorExpression { 18 | static_assert(are_tensor_exprs, 19 | "Expressions in CWiseBinaryOp must inherit from TensorExpression"); 20 | 21 | public: 22 | using LeftExprRefType = ref_selector_t; 23 | using RightExprRefType = ref_selector_t; 24 | 25 | CWiseBinaryOp(const LeftExpr &left_expr, const RightExpr &right_expr) 26 | : _left_expr(left_expr), _right_expr(right_expr) {} 27 | 28 | inline auto getPacket(std::ptrdiff_t n) const { 29 | if constexpr (Op == BinaryOp::Plus) { 30 | return simd::Add(_left_expr.getPacket(n), _right_expr.getPacket(n)); 31 | } else if constexpr (Op == BinaryOp::Minus) { 32 | return simd::Sub(_left_expr.getPacket(n), _right_expr.getPacket(n)); 33 | } else if constexpr (Op == BinaryOp::Multiplies) { 34 | return simd::Mult(_left_expr.getPacket(n), _right_expr.getPacket(n)); 35 | } else if constexpr (Op == BinaryOp::Divides) { 36 | return simd::Div(_left_expr.getPacket(n), _right_expr.getPacket(n)); 37 | } 38 | } 39 | 40 | GPU_DEVICE_FUNC inline auto getCoeff(std::ptrdiff_t n) const { 41 | if constexpr (Op == BinaryOp::Plus) { 42 | return _left_expr.getCoeff(n) + _right_expr.getCoeff(n); 43 | } else if constexpr (Op == BinaryOp::Minus) { 44 | return _left_expr.getCoeff(n) - _right_expr.getCoeff(n); 45 | } else if constexpr (Op == BinaryOp::Multiplies) { 46 | return _left_expr.getCoeff(n) * _right_expr.getCoeff(n); 47 | } else if constexpr (Op == BinaryOp::Divides) { 48 | return _left_expr.getCoeff(n) / _right_expr.getCoeff(n); 49 | } 50 | } 51 | 52 | inline LeftExprRefType leftExpr() const { return _left_expr; } 53 | 54 | inline RightExprRefType rightExpr() const { return _right_expr; } 55 | 56 | private: 57 | LeftExprRefType _left_expr; 58 | RightExprRefType _right_expr; 59 | }; 60 | 61 | template > 63 | inline auto operator+(LeftExpr const &left_expr, RightExpr const &right_expr) { 64 | return CWiseBinaryOp(left_expr, right_expr); 65 | } 66 | 67 | template > 69 | inline auto operator-(LeftExpr const &left_expr, RightExpr const &right_expr) { 70 | return CWiseBinaryOp(left_expr, right_expr); 71 | } 72 | 73 | template > 75 | inline auto operator*(LeftExpr const &left_expr, RightExpr const &right_expr) { 76 | return CWiseBinaryOp(left_expr, 77 | right_expr); 78 | } 79 | 80 | template > 82 | inline auto operator/(LeftExpr const &left_expr, RightExpr const &right_expr) { 83 | return CWiseBinaryOp(left_expr, right_expr); 84 | } 85 | 86 | } // namespace fasttensor 87 | -------------------------------------------------------------------------------- /fasttensor/DefaultDevice.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "DeviceProperties.hpp" 4 | #include 5 | 6 | namespace fasttensor { 7 | 8 | struct DefaultDevice { 9 | DefaultDevice() { device_props.emplace_back(1, 1); } 10 | 11 | private: 12 | std::vector device_props; 13 | }; 14 | 15 | } // namespace fasttensor 16 | 
-------------------------------------------------------------------------------- /fasttensor/Device.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fasttensor { 4 | 5 | enum class DeviceType { Normal, Simd, Gpu }; 6 | 7 | #if defined FASTTENSOR_GPU 8 | constexpr DeviceType device_type = DeviceType::Gpu; 9 | #elif defined FASTTENSOR_SIMD 10 | constexpr DeviceType device_type = DeviceType::Simd; 11 | #else 12 | # define FASTTENSOR_NORMAL 13 | constexpr DeviceType device_type = DeviceType::Normal; 14 | #endif 15 | 16 | } // namespace fasttensor 17 | -------------------------------------------------------------------------------- /fasttensor/DeviceFactory.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace fasttensor { 6 | 7 | template 8 | class DeviceFactory { 9 | private: 10 | static std::optional device; 11 | 12 | public: 13 | static DeviceType GetDevice() { 14 | if (!device) { 15 | device = std::make_optional(); 16 | } 17 | return *device; 18 | } 19 | }; 20 | 21 | template 22 | std::optional DeviceFactory::device = std::nullopt; 23 | 24 | } // namespace fasttensor 25 | -------------------------------------------------------------------------------- /fasttensor/DeviceProperties.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fasttensor { 4 | 5 | struct DeviceProperties { 6 | DeviceProperties(int block_size, int max_blocks) 7 | : block_size(block_size), max_blocks(max_blocks) {} 8 | inline int blockSize() const { return block_size; } 9 | 10 | inline int maxBlocks() const { return max_blocks; } 11 | 12 | private: 13 | int block_size; 14 | int max_blocks; 15 | }; 16 | 17 | } // namespace fasttensor 18 | -------------------------------------------------------------------------------- /fasttensor/GpuDevice.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "DeviceProperties.hpp" 4 | #include 5 | 6 | namespace fasttensor { 7 | 8 | struct GpuDevice { 9 | GpuDevice() { 10 | int num_devices; 11 | cudaGetDeviceCount(&num_devices); 12 | for (int i = 0; i < num_devices; i++) { 13 | cudaDeviceProp prop; 14 | cudaGetDeviceProperties(&prop, i); 15 | auto block_size = prop.maxThreadsPerBlock; 16 | auto max_blocks = prop.multiProcessorCount * prop.maxThreadsPerMultiProcessor / block_size; 17 | device_props.emplace_back(block_size, max_blocks); 18 | } 19 | } 20 | 21 | inline auto &deviceProps() const { return device_props; } 22 | 23 | private: 24 | std::vector device_props; 25 | }; 26 | 27 | } // namespace fasttensor 28 | -------------------------------------------------------------------------------- /fasttensor/GpuDeviceFunction.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Device.hpp" 4 | 5 | namespace fasttensor { 6 | 7 | #ifdef FASTTENSOR_GPU 8 | # define GPU_DEVICE_FUNC __device__ __host__ 9 | #else 10 | # define GPU_DEVICE_FUNC 11 | #endif 12 | 13 | } // namespace fasttensor 14 | -------------------------------------------------------------------------------- /fasttensor/Memory.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Simd/Simd.hpp" 4 | #include 5 | 6 | namespace fasttensor { 7 | 8 | #if defined FASTTENSOR_SIMD || defined FASTTENSOR_NORMAL 9 | 10 | 
template 11 | inline ElementType *AllocateMemory(std::ptrdiff_t num_elements) { 12 | if constexpr (simd::PacketTraits::is_vectorizable) { 13 | return reinterpret_cast( 14 | std::aligned_alloc(simd::PacketSize, sizeof(ElementType) * num_elements)); 15 | } else { 16 | return new ElementType[num_elements]; 17 | } 18 | } 19 | 20 | template 21 | inline void DeallocateMemory(ElementType *memory) { 22 | if constexpr (simd::PacketTraits::is_vectorizable) { 23 | std::free(memory); 24 | } else { 25 | delete[] memory; 26 | } 27 | } 28 | 29 | #elif defined FASTTENSOR_GPU 30 | 31 | template 32 | inline ElementType *AllocateMemory(std::ptrdiff_t num_elements) { 33 | ElementType *memory; 34 | cudaMallocManaged(&memory, num_elements * sizeof(ElementType)); 35 | return memory; 36 | } 37 | 38 | template 39 | inline void DeallocateMemory(ElementType *memory) { 40 | cudaFree(memory); 41 | } 42 | 43 | #endif 44 | 45 | } // namespace fasttensor 46 | -------------------------------------------------------------------------------- /fasttensor/RefSelector.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fasttensor { 4 | 5 | template 6 | struct ref_selector { 7 | using type = T; 8 | }; 9 | 10 | template 11 | using ref_selector_t = typename ref_selector::type; 12 | 13 | } // namespace fasttensor 14 | -------------------------------------------------------------------------------- /fasttensor/Simd/Avx2.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace fasttensor::simd { 6 | 7 | constexpr int PacketSize = 32; 8 | 9 | template 10 | constexpr int NumElementsInPacket = PacketSize / sizeof(T); 11 | 12 | template 13 | struct PacketTraits { 14 | using type = T; 15 | static constexpr bool is_vectorizable = false; 16 | static constexpr int size = 1; 17 | }; 18 | 19 | template <> 20 | struct PacketTraits { 21 | using type = __m256i; 22 | static constexpr bool is_vectorizable = true; 23 | static constexpr int size = NumElementsInPacket; 24 | }; 25 | 26 | template <> 27 | struct PacketTraits { 28 | using type = __m256; 29 | static constexpr bool is_vectorizable = true; 30 | static constexpr int size = NumElementsInPacket; 31 | }; 32 | 33 | template <> 34 | struct PacketTraits { 35 | using type = __m256d; 36 | static constexpr bool is_vectorizable = true; 37 | static constexpr int size = NumElementsInPacket; 38 | }; 39 | 40 | template 41 | inline typename PacketTraits::type Load(ScalarType *source) { 42 | if constexpr (std::is_same_v) { 43 | return _mm256_load_si256(reinterpret_cast::type *>(source)); 44 | } else if constexpr (std::is_same_v) { 45 | return _mm256_load_ps(reinterpret_cast(source)); 46 | } else if constexpr (std::is_same_v) { 47 | return _mm256_load_pd(reinterpret_cast(source)); 48 | } 49 | } 50 | 51 | template 52 | inline void Store(ScalarType *dest, typename PacketTraits::type source) { 53 | if constexpr (std::is_same_v) { 54 | _mm256_store_si256(reinterpret_cast::type *>(dest), source); 55 | } else if constexpr (std::is_same_v) { 56 | _mm256_store_ps(dest, source); 57 | } else if constexpr (std::is_same_v) { 58 | _mm256_store_pd(dest, source); 59 | } 60 | } 61 | 62 | template 63 | inline PacketType Add(PacketType left, PacketType right) { 64 | if constexpr (std::is_same_v) { 65 | return _mm256_add_epi32(left, right); 66 | } else if constexpr (std::is_same_v) { 67 | return _mm256_add_ps(left, right); 68 | } else if constexpr (std::is_same_v) { 69 | return 
_mm256_add_pd(left, right); 70 | } 71 | } 72 | 73 | template 74 | inline PacketType Sub(PacketType left, PacketType right) { 75 | if constexpr (std::is_same_v) { 76 | return _mm256_sub_epi32(left, right); 77 | } else if constexpr (std::is_same_v) { 78 | return _mm256_sub_ps(left, right); 79 | } else if constexpr (std::is_same_v) { 80 | return _mm256_sub_pd(left, right); 81 | } 82 | } 83 | 84 | template 85 | inline PacketType Mult(PacketType left, PacketType right) { 86 | if constexpr (std::is_same_v) { 87 | return _mm256_mullo_epi32(left, right); 88 | } else if constexpr (std::is_same_v) { 89 | return _mm256_mul_ps(left, right); 90 | } else if constexpr (std::is_same_v) { 91 | return _mm256_mul_pd(left, right); 92 | } 93 | } 94 | 95 | template 96 | inline PacketType Div(PacketType dividend, PacketType divisor) { 97 | if constexpr (std::is_same_v) { 98 | return _mm256_div_ps(dividend, divisor); 99 | } else if constexpr (std::is_same_v) { 100 | return _mm256_div_pd(dividend, divisor); 101 | } 102 | } 103 | 104 | } // namespace fasttensor::simd 105 | -------------------------------------------------------------------------------- /fasttensor/Simd/Generic.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fasttensor::simd { 4 | 5 | constexpr int PacketSize = 1; 6 | 7 | template 8 | constexpr int NumElementsInPacket = PacketSize / sizeof(T); 9 | 10 | template 11 | struct PacketTraits { 12 | using type = T; 13 | static constexpr bool is_vectorizable = false; 14 | static constexpr int size = 1; 15 | }; 16 | 17 | template 18 | inline T Load(T *source) { 19 | return source; 20 | } 21 | 22 | template 23 | inline void Store(T *dest, T *source) { 24 | dest = source; 25 | } 26 | 27 | template 28 | inline T Add(T *left, T *right) { 29 | return *left + *right; 30 | } 31 | 32 | template 33 | inline T Sub(T *left, T *right) { 34 | return *left - *right; 35 | } 36 | 37 | template 38 | inline T Mult(T *left, T *right) { 39 | return *left * *right; 40 | } 41 | 42 | template 43 | inline T Div(T *left, T *right) { 44 | return *left / *right; 45 | } 46 | 47 | } // namespace fasttensor::simd 48 | -------------------------------------------------------------------------------- /fasttensor/Simd/Simd.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "SimdMacros.hpp" 4 | 5 | #if SSE_INSTR_SET > 7 6 | # include "Avx2.hpp" 7 | #else 8 | # include "Generic.hpp" 9 | #endif 10 | -------------------------------------------------------------------------------- /fasttensor/Simd/SimdMacros.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fasttensor { 4 | 5 | namespace simd { 6 | 7 | #if (defined(_M_AMD64) || defined(_M_X64) || defined(__amd64)) && !defined(__x86_64__) 8 | # define __x86_64__ 1 9 | #endif 10 | 11 | // Find sse instruction set from compiler macros if SSE_INSTR_SET not defined 12 | // Note: Not all compilers define these macros automatically 13 | #ifndef SSE_INSTR_SET 14 | # if defined(__AVX2__) 15 | # define SSE_INSTR_SET 8 16 | # elif defined(__AVX__) 17 | # define SSE_INSTR_SET 7 18 | # elif defined(__SSE4_2__) 19 | # define SSE_INSTR_SET 6 20 | # elif defined(__SSE4_1__) 21 | # define SSE_INSTR_SET 5 22 | # elif defined(__SSSE3__) 23 | # define SSE_INSTR_SET 4 24 | # elif defined(__SSE3__) 25 | # define SSE_INSTR_SET 3 26 | # elif defined(__SSE2__) || defined(__x86_64__) 27 | # define SSE_INSTR_SET 2 
28 | # elif defined(__SSE__) 29 | # define SSE_INSTR_SET 1 30 | # elif defined(_M_IX86_FP) // Defined in MS compiler on 32bits system. 1: SSE, 2: SSE2 31 | # define SSE_INSTR_SET _M_IX86_FP 32 | # else 33 | # define SSE_INSTR_SET 0 34 | # endif // instruction set defines 35 | #endif // SSE_INSTR_SET 36 | 37 | } // namespace simd 38 | 39 | } // namespace fasttensor 40 | 41 | // Include the appropriate header file for intrinsic functions 42 | #if SSE_INSTR_SET > 7 // AVX2 and later 43 | # ifdef __GNUC__ 44 | # include // x86intrin.h includes header files for whatever instruction 45 | // sets are specified on the compiler command line, such as: 46 | // xopintrin.h, fma4intrin.h 47 | # else 48 | # include // MS version of immintrin.h covers AVX, AVX2 and FMA3 49 | # endif // __GNUC__ 50 | #elif SSE_INSTR_SET == 7 51 | # include // AVX 52 | #elif SSE_INSTR_SET == 6 53 | # include // SSE4.2 54 | #elif SSE_INSTR_SET == 5 55 | # include // SSE4.1 56 | #elif SSE_INSTR_SET == 4 57 | # include // SSSE3 58 | #elif SSE_INSTR_SET == 3 59 | # include // SSE3 60 | #elif SSE_INSTR_SET == 2 61 | # include // SSE2 62 | #elif SSE_INSTR_SET == 1 63 | # include // SSE 64 | #endif 65 | -------------------------------------------------------------------------------- /fasttensor/StorageUnwrapper.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "CWiseBinaryOp.hpp" 4 | #include "Tensor.hpp" 5 | #include "TensorStorageRef.hpp" 6 | 7 | namespace fasttensor { 8 | 9 | template 10 | auto inline UnwrapStorage(CWiseBinaryOp const &cwise_binary_op) { 11 | auto left_expr = UnwrapStorage(cwise_binary_op.leftExpr()); 12 | auto right_expr = UnwrapStorage(cwise_binary_op.rightExpr()); 13 | return CWiseBinaryOp(left_expr, 14 | right_expr); 15 | } 16 | 17 | template 18 | auto inline UnwrapStorage(Tensor const &tensor) { 19 | return TensorStorageRef(tensor.storage().elements()); 20 | } 21 | 22 | } // namespace fasttensor 23 | -------------------------------------------------------------------------------- /fasttensor/Tensor.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "DefaultDevice.hpp" 4 | #include "Device.hpp" 5 | #include "DeviceFactory.hpp" 6 | #include "GpuDeviceFunction.hpp" 7 | #include "RefSelector.hpp" 8 | #include "Simd/Simd.hpp" 9 | #include "TensorExpression.hpp" 10 | #include "TensorStorage.hpp" 11 | #include "UnrollUtils.hpp" 12 | 13 | #if defined FASTTENSOR_GPU 14 | # include "GpuDevice.hpp" 15 | #endif 16 | 17 | namespace fasttensor { 18 | 19 | #if defined FASTTENSOR_GPU 20 | using DefaultDeviceType = GpuDevice; 21 | #else 22 | using DefaultDeviceType = DefaultDevice; 23 | #endif 24 | 25 | template 26 | class Tensor; 27 | 28 | template 29 | constexpr bool is_tensor = false; 30 | 31 | template 32 | constexpr bool is_tensor> = true; 33 | 34 | template 35 | struct ref_selector>>> { 36 | using type = T &; 37 | }; 38 | 39 | template 40 | class Tensor : public TensorExpression { 41 | public: 42 | using TStorage = TensorStorage; 43 | using Self = Tensor; 44 | 45 | Tensor(std::array dimensions) 46 | : _storage(dimensions), _device(DeviceFactory::GetDevice()) {} 47 | 48 | Tensor(const Self &other) 49 | : _storage(other._storage), _device(DeviceFactory::GetDevice()) {} 50 | 51 | inline auto &storage() { return _storage; } 52 | 53 | inline const auto &storage() const { return _storage; } 54 | 55 | template 56 | inline ElementType &operator()(Index... 
indices) { 57 | return _storage(std::array{indices...}); 58 | } 59 | 60 | inline auto num_elements() { return _storage.num_elements(); } 61 | 62 | inline const auto &dimensions() { return _storage.dimensions(); } 63 | 64 | inline auto getPacket(std::ptrdiff_t n) const { return _storage.getPacket(n); } 65 | 66 | GPU_DEVICE_FUNC inline auto getCoeff(std::ptrdiff_t n) const { return _storage.getCoeff(n); } 67 | 68 | inline auto &device() const { return _device; } 69 | 70 | template > 71 | inline Tensor &operator=(OtherExpr const &); 72 | 73 | private: 74 | TStorage _storage; 75 | DeviceType _device; 76 | }; 77 | 78 | } // namespace fasttensor 79 | -------------------------------------------------------------------------------- /fasttensor/TensorExpression.fwd.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fasttensor { 4 | 5 | class TensorExpression; 6 | 7 | } // namespace fasttensor 8 | -------------------------------------------------------------------------------- /fasttensor/TensorExpression.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "TensorExpression.fwd.hpp" 4 | #include 5 | 6 | namespace fasttensor { 7 | 8 | class TensorExpression {}; 9 | 10 | template 11 | constexpr bool is_tensor_expr = std::is_base_of_v; 12 | 13 | template 14 | constexpr bool are_tensor_exprs = (... && is_tensor_expr); 15 | 16 | template 17 | using enable_if_tensor_exprs = std::enable_if_t>; 18 | 19 | } // namespace fasttensor 20 | -------------------------------------------------------------------------------- /fasttensor/TensorStorage.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "GpuDeviceFunction.hpp" 4 | #include "Memory.hpp" 5 | #include "Simd/Simd.hpp" 6 | #include "UnrollUtils.hpp" 7 | #include 8 | #include 9 | 10 | namespace fasttensor { 11 | 12 | template 13 | class TensorStorage { 14 | public: 15 | using PacketType = typename simd::PacketTraits::type; 16 | int PacketSize = simd::PacketTraits::size; 17 | 18 | TensorStorage() : _dimensions(), _num_elements(0), _elements(nullptr) {} 19 | 20 | TensorStorage(std::array dimensions) : _dimensions(dimensions) { 21 | _num_elements = utils::fold(_dimensions, std::multiplies()); 22 | _elements = AllocateMemory(_num_elements); 23 | } 24 | 25 | TensorStorage(const TensorStorage &other) : TensorStorage(other._dimensions) { 26 | std::copy(other._elements, other._elements + _num_elements, _elements); 27 | } 28 | 29 | inline friend void swap(TensorStorage &first, TensorStorage &second) noexcept { 30 | using std::swap; 31 | 32 | swap(first._dimensions, second._dimensions); 33 | swap(first._num_elements, second._num_elements); 34 | swap(first._elements, second._elements); 35 | } 36 | 37 | TensorStorage(TensorStorage &&other) noexcept : TensorStorage() { swap(*this, other); } 38 | 39 | inline TensorStorage &operator=(TensorStorage other) { 40 | swap(*this, other); 41 | return *this; 42 | } 43 | 44 | inline auto elements() { return _elements; } 45 | 46 | inline auto elements() const { return _elements; } 47 | 48 | inline auto num_elements() { return _num_elements; } 49 | 50 | inline const auto &dimensions() { return _dimensions; } 51 | 52 | inline PacketType getPacket(std::ptrdiff_t index) const { 53 | return simd::Load(&_elements[index * PacketSize]); 54 | } 55 | 56 | inline void storePacket(std::ptrdiff_t index, PacketType packet) { 57 | 
simd::Store(&_elements[index * PacketSize], packet); 58 | } 59 | 60 | inline const ElementType &getCoeff(std::ptrdiff_t index) const { return _elements[index]; } 61 | 62 | inline const ElementType &getCoeff(std::array indices) const { 63 | return _elements[utils::getIndex(_dimensions, indices)]; 64 | } 65 | 66 | inline ElementType &operator()(std::array indices) { 67 | return _elements[utils::getIndex(_dimensions, indices)]; 68 | } 69 | 70 | inline void storeCoeff(ElementType element, std::ptrdiff_t index) { _elements[index] = element; } 71 | 72 | ~TensorStorage() { DeallocateMemory(_elements); } 73 | 74 | private: 75 | std::array _dimensions; 76 | std::ptrdiff_t _num_elements; 77 | ElementType *_elements; 78 | }; 79 | 80 | } // namespace fasttensor 81 | -------------------------------------------------------------------------------- /fasttensor/TensorStorageRef.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "GpuDeviceFunction.hpp" 4 | #include "TensorExpression.hpp" 5 | 6 | namespace fasttensor { 7 | 8 | template 9 | class TensorStorageRef : TensorExpression { 10 | public: 11 | TensorStorageRef(ElementType *elements) : _elements(elements) {} 12 | 13 | GPU_DEVICE_FUNC inline const auto &getCoeff(std::ptrdiff_t index) const { 14 | return _elements[index]; 15 | } 16 | 17 | inline auto elements() { return _elements; } 18 | 19 | inline auto elements() const { return _elements; } 20 | 21 | private: 22 | ElementType *_elements; 23 | }; 24 | 25 | } // namespace fasttensor 26 | -------------------------------------------------------------------------------- /fasttensor/UnrollUtils.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace fasttensor::utils { 6 | 7 | template 8 | inline Scalar fold(const std::array &list, const Op &op) { 9 | if constexpr (iter == 0) { 10 | return list[0]; 11 | } else { 12 | return op(list[iter], fold(list, op)); 13 | } 14 | } 15 | 16 | template 17 | inline std::ptrdiff_t getIndex(const std::array &dimensions, 18 | const std::array &indices) { 19 | if constexpr (iter == 0) { 20 | return indices[0]; 21 | } else { 22 | return indices[iter] + dimensions[iter] * getIndex(dimensions, indices); 23 | } 24 | } 25 | 26 | } // namespace fasttensor::utils 27 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14.6) 2 | 3 | SET(SOURCE_FILES 4 | cwiseops.cpp 5 | test.cpp) 6 | 7 | add_executable(tests ${SOURCE_FILES}) 8 | target_link_libraries(tests fasttensor gtest) 9 | 10 | target_compile_features(tests PRIVATE cxx_std_17) 11 | 12 | target_compile_options(tests PRIVATE -march=native) 13 | 14 | install(TARGETS tests DESTINATION bin) -------------------------------------------------------------------------------- /tests/cwiseops.cpp: -------------------------------------------------------------------------------- 1 | #include "Assign.hpp" 2 | #include "CWiseBinaryOp.hpp" 3 | #include "Tensor.hpp" 4 | #include "gtest/gtest.h" 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | using namespace fasttensor; 10 | 11 | template 12 | Tensor CreateTensor(array dimensions) { 13 | Tensor t(dimensions); 14 | auto num_elts = t.num_elements(); 15 | auto &_storage = t.storage(); 16 | for (int i = 0; i < num_elts; ++i) { 17 | _storage.storeCoeff(i, i); 18 | } 19 | 
return t; 20 | } 21 | 22 | TEST(CWiseOps, AddInt) { 23 | int n_i = 10, n_j = 11, n_k = 13; 24 | array dimensions{n_i, n_j, n_k}; 25 | auto a = CreateTensor(dimensions); 26 | auto b = CreateTensor(dimensions); 27 | Tensor result(dimensions); 28 | result = a + b; 29 | for (int i = 0; i < n_i; ++i) { 30 | for (int j = 0; j < n_j; ++j) { 31 | for (int k = 0; k < n_k; ++k) { 32 | EXPECT_EQ(result(i, j, k), 2 * (k + n_k * (j + (n_j * i)))); 33 | } 34 | } 35 | } 36 | } 37 | 38 | TEST(CWiseOps, SubInt) { 39 | int n_i = 10, n_j = 11, n_k = 13; 40 | array dimensions{n_i, n_j, n_k}; 41 | auto a = CreateTensor(dimensions); 42 | auto b = CreateTensor(dimensions); 43 | auto result = CreateTensor(dimensions); 44 | result = a - b; 45 | for (int i = 0; i < n_i; ++i) { 46 | for (int j = 0; j < n_j; ++j) { 47 | for (int k = 0; k < n_k; ++k) { 48 | EXPECT_EQ(result(i, j, k), 0); 49 | } 50 | } 51 | } 52 | } 53 | 54 | TEST(CWiseOps, MultInt) { 55 | int n_i = 10, n_j = 11, n_k = 13; 56 | array dimensions{n_i, n_j, n_k}; 57 | auto a = CreateTensor(dimensions); 58 | auto b = CreateTensor(dimensions); 59 | Tensor result(dimensions); 60 | result = a * b; 61 | for (int i = 0; i < n_i; ++i) { 62 | for (int j = 0; j < n_j; ++j) { 63 | for (int k = 0; k < n_k; ++k) { 64 | auto elt = (k + n_k * (j + (n_j * i))); 65 | EXPECT_EQ(result(i, j, k), elt * elt); 66 | } 67 | } 68 | } 69 | } 70 | 71 | TEST(CWiseOps, AddFloat) { 72 | int n_i = 10, n_j = 11, n_k = 13; 73 | array dimensions{n_i, n_j, n_k}; 74 | auto a = CreateTensor(dimensions); 75 | auto b = CreateTensor(dimensions); 76 | Tensor result(dimensions); 77 | result = a + b; 78 | for (int i = 0; i < n_i; ++i) { 79 | for (int j = 0; j < n_j; ++j) { 80 | for (int k = 0; k < n_k; ++k) { 81 | EXPECT_FLOAT_EQ(result(i, j, k), 2 * (k + n_k * (j + (n_j * i)))); 82 | } 83 | } 84 | } 85 | } 86 | 87 | TEST(CWiseOps, SubFloat) { 88 | int n_i = 10, n_j = 11, n_k = 13; 89 | array dimensions{n_i, n_j, n_k}; 90 | auto a = CreateTensor(dimensions); 91 | auto b = CreateTensor(dimensions); 92 | auto result = CreateTensor(dimensions); 93 | result = a - b; 94 | for (int i = 0; i < n_i; ++i) { 95 | for (int j = 0; j < n_j; ++j) { 96 | for (int k = 0; k < n_k; ++k) { 97 | EXPECT_FLOAT_EQ(result(i, j, k), 0); 98 | } 99 | } 100 | } 101 | } 102 | 103 | TEST(CWiseOps, MultFloat) { 104 | int n_i = 10, n_j = 11, n_k = 13; 105 | array dimensions{n_i, n_j, n_k}; 106 | auto a = CreateTensor(dimensions); 107 | auto b = CreateTensor(dimensions); 108 | Tensor result(dimensions); 109 | result = a * b; 110 | for (int i = 0; i < n_i; ++i) { 111 | for (int j = 0; j < n_j; ++j) { 112 | for (int k = 0; k < n_k; ++k) { 113 | auto elt = (k + n_k * (j + (n_j * i))); 114 | EXPECT_FLOAT_EQ(result(i, j, k), elt * elt); 115 | } 116 | } 117 | } 118 | } 119 | 120 | TEST(CWiseOps, DivFloat) { 121 | int n_i = 10, n_j = 11, n_k = 13; 122 | array dimensions{n_i, n_j, n_k}; 123 | auto a = CreateTensor(dimensions); 124 | auto b = CreateTensor(dimensions); 125 | a(0, 0, 0) = 1; 126 | b(0, 0, 0) = 1; 127 | Tensor result(dimensions); 128 | result = a / b; 129 | for (int i = 0; i < n_i; ++i) { 130 | for (int j = 0; j < n_j; ++j) { 131 | for (int k = 0; k < n_k; ++k) { 132 | EXPECT_FLOAT_EQ(result(i, j, k), 1); 133 | } 134 | } 135 | } 136 | } 137 | 138 | TEST(CWiseOps, MultipleOpsFloat) { 139 | int n_i = 10, n_j = 11, n_k = 13; 140 | array dimensions{n_i, n_j, n_k}; 141 | auto a = CreateTensor(dimensions); 142 | auto b = CreateTensor(dimensions); 143 | auto c = CreateTensor(dimensions); 144 | auto result = 
CreateTensor(dimensions); 145 | 146 | result = a + b - c; 147 | for (int i = 0; i < n_i; ++i) { 148 | for (int j = 0; j < n_j; ++j) { 149 | for (int k = 0; k < n_k; ++k) { 150 | EXPECT_FLOAT_EQ(result(i, j, k), k + n_k * (j + (n_j * i))); 151 | } 152 | } 153 | } 154 | 155 | result = a + b + c; 156 | for (int i = 0; i < n_i; ++i) { 157 | for (int j = 0; j < n_j; ++j) { 158 | for (int k = 0; k < n_k; ++k) { 159 | EXPECT_FLOAT_EQ(result(i, j, k), 3 * (k + n_k * (j + (n_j * i)))); 160 | } 161 | } 162 | } 163 | } 164 | 165 | TEST(CWiseOps, AddDouble) { 166 | int n_i = 10, n_j = 11, n_k = 13; 167 | array dimensions{n_i, n_j, n_k}; 168 | auto a = CreateTensor(dimensions); 169 | auto b = CreateTensor(dimensions); 170 | Tensor result(dimensions); 171 | result = a + b; 172 | for (int i = 0; i < n_i; ++i) { 173 | for (int j = 0; j < n_j; ++j) { 174 | for (int k = 0; k < n_k; ++k) { 175 | EXPECT_DOUBLE_EQ(result(i, j, k), 2 * (k + n_k * (j + (n_j * i)))); 176 | } 177 | } 178 | } 179 | } 180 | 181 | TEST(CWiseOps, SubDouble) { 182 | int n_i = 10, n_j = 11, n_k = 13; 183 | array dimensions{n_i, n_j, n_k}; 184 | auto a = CreateTensor(dimensions); 185 | auto b = CreateTensor(dimensions); 186 | auto result = CreateTensor(dimensions); 187 | result = a - b; 188 | for (int i = 0; i < n_i; ++i) { 189 | for (int j = 0; j < n_j; ++j) { 190 | for (int k = 0; k < n_k; ++k) { 191 | EXPECT_DOUBLE_EQ(result(i, j, k), 0); 192 | } 193 | } 194 | } 195 | } 196 | 197 | TEST(CWiseOps, MultDouble) { 198 | int n_i = 10, n_j = 11, n_k = 13; 199 | array dimensions{n_i, n_j, n_k}; 200 | auto a = CreateTensor(dimensions); 201 | auto b = CreateTensor(dimensions); 202 | Tensor result(dimensions); 203 | result = a * b; 204 | for (int i = 0; i < n_i; ++i) { 205 | for (int j = 0; j < n_j; ++j) { 206 | for (int k = 0; k < n_k; ++k) { 207 | auto elt = (k + n_k * (j + (n_j * i))); 208 | EXPECT_DOUBLE_EQ(result(i, j, k), elt * elt); 209 | } 210 | } 211 | } 212 | } 213 | 214 | TEST(CWiseOps, DivDouble) { 215 | int n_i = 10, n_j = 11, n_k = 13; 216 | array dimensions{n_i, n_j, n_k}; 217 | auto a = CreateTensor(dimensions); 218 | auto b = CreateTensor(dimensions); 219 | a(0, 0, 0) = 1; 220 | b(0, 0, 0) = 1; 221 | Tensor result(dimensions); 222 | result = a / b; 223 | for (int i = 0; i < n_i; ++i) { 224 | for (int j = 0; j < n_j; ++j) { 225 | for (int k = 0; k < n_k; ++k) { 226 | EXPECT_DOUBLE_EQ(result(i, j, k), 1); 227 | } 228 | } 229 | } 230 | } 231 | -------------------------------------------------------------------------------- /tests/test.cpp: -------------------------------------------------------------------------------- 1 | #include "CWiseBinaryOp.hpp" 2 | #include "Simd/Simd.hpp" 3 | #include "Tensor.hpp" 4 | #include "TensorExpression.hpp" 5 | #include "gtest/gtest.h" 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | using namespace fasttensor; 11 | 12 | int main(int argc, char **argv) { 13 | ::testing::InitGoogleTest(&argc, argv); 14 | 15 | cout << simd::PacketTraits::size << endl; 16 | cout << simd::PacketTraits::size << endl; 17 | 18 | cout << "Device Type: " << static_cast(device_type) << endl; 19 | 20 | Tensor t(array{4, 2}); 21 | Tensor q(array{4, 2}); 22 | auto result = q + t; 23 | return RUN_ALL_TESTS(); 24 | } 25 | --------------------------------------------------------------------------------