├── .clang-format ├── .gitattributes ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── README.md ├── bench ├── CMakeLists.txt ├── bench.cpp └── timer.h ├── clang-format.cmake ├── fasttensor ├── Assign.hpp ├── CMakeLists.txt ├── CWiseBinaryOp.hpp ├── DefaultDevice.hpp ├── Device.hpp ├── DeviceFactory.hpp ├── DeviceProperties.hpp ├── GpuDevice.hpp ├── GpuDeviceFunction.hpp ├── Memory.hpp ├── RefSelector.hpp ├── Simd │ ├── Avx2.hpp │ ├── Generic.hpp │ ├── Simd.hpp │ └── SimdMacros.hpp ├── StorageUnwrapper.hpp ├── Tensor.hpp ├── TensorExpression.fwd.hpp ├── TensorExpression.hpp ├── TensorStorage.hpp ├── TensorStorageRef.hpp └── UnrollUtils.hpp └── tests ├── CMakeLists.txt ├── cwiseops.cpp └── test.cpp /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -2 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: Right 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: All 15 | AllowShortIfStatementsOnASingleLine: false 16 | AllowShortLoopsOnASingleLine: false 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: false 20 | AlwaysBreakTemplateDeclarations: Yes 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | AfterExternBlock: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: false 36 | SplitEmptyFunction: true 37 | SplitEmptyRecord: true 38 | SplitEmptyNamespace: true 39 | BreakBeforeBinaryOperators: None 40 | BreakBeforeBraces: Attach 41 | BreakBeforeInheritanceComma: false 42 | BreakInheritanceList: BeforeColon 43 | BreakBeforeTernaryOperators: true 44 | BreakConstructorInitializersBeforeComma: false 45 | BreakConstructorInitializers: BeforeColon 46 | BreakAfterJavaFieldAnnotations: false 47 | BreakStringLiterals: true 48 | ColumnLimit: 100 49 | CommentPragmas: '^ IWYU pragma:' 50 | CompactNamespaces: false 51 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 52 | ConstructorInitializerIndentWidth: 4 53 | ContinuationIndentWidth: 4 54 | Cpp11BracedListStyle: true 55 | DerivePointerAlignment: false 56 | DisableFormat: false 57 | ExperimentalAutoDetectBinPacking: false 58 | FixNamespaceComments: true 59 | ForEachMacros: 60 | - foreach 61 | - Q_FOREACH 62 | - BOOST_FOREACH 63 | IncludeBlocks: Preserve 64 | IncludeCategories: 65 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 66 | Priority: 2 67 | - Regex: '^(<|"(gtest|gmock|isl|json)/)' 68 | Priority: 3 69 | - Regex: '.*' 70 | Priority: 1 71 | IncludeIsMainRegex: '(Test)?$' 72 | IndentCaseLabels: false 73 | IndentPPDirectives: AfterHash 74 | IndentWidth: 2 75 | IndentWrappedFunctionNames: false 76 | JavaScriptQuotes: Leave 77 | JavaScriptWrapImports: true 78 | KeepEmptyLinesAtTheStartOfBlocks: true 79 | MacroBlockBegin: '' 80 | MacroBlockEnd: '' 81 | MaxEmptyLinesToKeep: 1 82 | NamespaceIndentation: None 83 | ObjCBinPackProtocolList: Auto 84 | ObjCBlockIndentWidth: 2 85 | ObjCSpaceAfterProperty: false 86 | 
ObjCSpaceBeforeProtocolList: true 87 | PenaltyBreakAssignment: 2 88 | PenaltyBreakBeforeFirstCallParameter: 19 89 | PenaltyBreakComment: 300 90 | PenaltyBreakFirstLessLess: 120 91 | PenaltyBreakString: 1000 92 | PenaltyBreakTemplateDeclaration: 10 93 | PenaltyExcessCharacter: 1000000 94 | PenaltyReturnTypeOnItsOwnLine: 60 95 | PointerAlignment: Right 96 | ReflowComments: true 97 | SortIncludes: true 98 | SortUsingDeclarations: true 99 | SpaceAfterCStyleCast: false 100 | SpaceAfterTemplateKeyword: true 101 | SpaceBeforeAssignmentOperators: true 102 | SpaceBeforeCpp11BracedList: false 103 | SpaceBeforeCtorInitializerColon: true 104 | SpaceBeforeInheritanceColon: true 105 | SpaceBeforeParens: ControlStatements 106 | SpaceBeforeRangeBasedForLoopColon: true 107 | SpaceInEmptyParentheses: false 108 | SpacesBeforeTrailingComments: 1 109 | SpacesInAngles: false 110 | SpacesInContainerLiterals: true 111 | SpacesInCStyleCastParentheses: false 112 | SpacesInParentheses: false 113 | SpacesInSquareBrackets: false 114 | Standard: Cpp11 115 | StatementMacros: 116 | - Q_UNUSED 117 | - QT_REQUIRE_VERSION 118 | TabWidth: 8 119 | UseTab: Never 120 | ... 121 | 122 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | 4 | # Explicitly declare text files you want to always be normalized and converted 5 | # to native line endings on checkout. 6 | *.c text 7 | *.cc text 8 | *.cxx text 9 | *.cpp text 10 | *.c++ text 11 | *.hpp text 12 | *.h text 13 | *.h++ text 14 | *.hh text 15 | *.txt text 16 | *.md text 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | 3 | fasttensor.sublime-project 4 | fasttensor.sublime-workspace 5 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ext/googletest"] 2 | path = ext/googletest 3 | url = https://github.com/google/googletest 4 | branch = master 5 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14.6) 2 | project(fasttensor LANGUAGES CXX) 3 | 4 | set(CMAKE_CXX_FLAGS "-Wall -Wextra") 5 | set(CMAKE_CXX_FLAGS_DEBUG "-g") 6 | set(CMAKE_CXX_FLAGS_RELEASE "-O3") 7 | 8 | set(USE_LIBCPP ON CACHE BOOL "Use libc++ if compiling with clang") 9 | if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" AND USE_LIBCPP) 10 | set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -stdlib=libc++") 11 | endif() 12 | 13 | option(FORCE_COLORED_OUTPUT "Always produce ANSI-colored output (GNU/Clang only)." 
TRUE) 14 | if(${FORCE_COLORED_OUTPUT}) 15 | if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") 16 | add_compile_options(-fdiagnostics-color=always) 17 | elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 18 | add_compile_options(-fcolor-diagnostics) 19 | endif() 20 | endif() 21 | 22 | include(clang-format.cmake) 23 | 24 | option(BUILD_TESTS "Set to ON to build tests" ON) 25 | option(BUILD_BENCHMARKS "Set to ON to build benchmarks" ON) 26 | 27 | set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) 28 | 29 | add_subdirectory(fasttensor) 30 | 31 | if(BUILD_TESTS) 32 | add_subdirectory(ext/googletest) 33 | add_subdirectory(tests) 34 | endif() 35 | 36 | if(BUILD_BENCHMARKS) 37 | add_subdirectory(bench) 38 | endif() 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fasttensor 2 | 3 | C++ library for tensor arithmetic. 4 | 5 | Uses SIMD for CPU acceleration and CUDA for GPU acceleration. Supports multiple GPUs if more than one is available. [Kernel fusion](https://stackoverflow.com/a/53311373) with [expression templates](https://en.wikipedia.org/wiki/Expression_templates) allows efficient computation of long arithmetic expressions. 6 | 7 | ## Usage 8 | 9 | fasttensor is header-only; simply add the location of the header files to your include path while compiling. 10 | 11 | Example code: 12 | 13 | ```cpp 14 | using namespace fasttensor; 15 | 16 | int main() { 17 | int num_rows = 4; 18 | int num_cols = 2; 19 | // Create integer tensor of rank 2 20 | // Dimensions: 4 rows, 2 columns (4x2) 21 | Tensor<int, 2> a(array{num_rows, num_cols}); 22 | Tensor<int, 2> b(array{num_rows, num_cols}); 23 | 24 | for (int i = 0; i < num_rows; ++i) { 25 | for (int j = 0; j < num_cols; ++j) { 26 | // This is how you set/get elements 27 | a(i, j) = j + num_cols * i; 28 | b(i, j) = j + num_cols * i; 29 | } 30 | } 31 | 32 | Tensor<int, 2> results(array{num_rows, num_cols}); 33 | 34 | // Element-wise addition of the two tensors 35 | // This will auto-magically use GPU/SIMD instructions 36 | // Need to compile with appropriate compiler flags and hardware 37 | results = a + b; 38 | 39 | for (int i = 0; i < num_rows; ++i) { 40 | for (int j = 0; j < num_cols; ++j) { 41 | // Just checking if we got the right answer 42 | assert(results(i, j) == 2 * (j + num_cols * i)); 43 | } 44 | } 45 | 46 | return 0; 47 | } 48 | ``` 49 | 50 | ## Benchmarks 51 | 52 | **Eager mode** is equivalent to a naive implementation of arithmetic expressions, creating a temporary variable after each operation. This behaviour was simulated with a helper function that forces eager evaluation of a given arithmetic expression. 53 | 54 | **Lazy mode** constructs an expression at compile time using expression templates and only evaluates the expression when it is assigned to a tensor. A short code sketch contrasting the two modes follows the results table below. 55 | 56 | ### Config: 57 | 58 | **CPU**: Intel Xeon E5-2690 v3 @ 2.60 GHz 59 | **GPU**: NVidia Tesla P4 60 | **Compiler**: Clang 9.0.1 61 | **CUDA Toolkit Version**: 10.0 62 | 63 | ### Results: 64 | 65 | - The variables are 3-dimensional float tensors of size 10^4 × 10^2 × 10^2 filled with random values. 66 | - The results were obtained by running 10 trials. 67 | - Each trial consisted of evaluating the expression 100 times. 68 | 69 | 70 | #### X = A + B + C + D 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 |
| Devices | Eager: Time | Eager: GFlops | Lazy: Time | Lazy: GFlops |
| --- | --- | --- | --- | --- |
| AVX2 on CPU | 28.26 ± 0.21s | 0.99 | 17.73 ± 0.05s | 1.58 |
| 1 Tesla P4 GPU | 2.65 ± 0.00s | 10.56 | 1.51 ± 0.12s | 18.52 |
| 2 Tesla P4 GPUs | 1.56 ± 0.20s | 17.92 | 0.89 ± 0.08s | 31.25 |
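The two modes correspond to the two cases timed in `bench/bench.cpp`: `result = a + b + c + d` for lazy mode and the `eager_eval` helper for eager mode. The snippet below is a minimal sketch of the difference, not part of the benchmark itself; the function name `lazy_vs_eager` and the `Tensor<float, 3>` signature are illustrative assumptions based on how the tensors are used in the tests and benchmark.

```cpp
#include "Assign.hpp"
#include "CWiseBinaryOp.hpp"
#include "Tensor.hpp"

using namespace fasttensor;

// X = A + B + C + D written in the two styles measured above.
void lazy_vs_eager(Tensor<float, 3> &a, Tensor<float, 3> &b, Tensor<float, 3> &c,
                   Tensor<float, 3> &d, Tensor<float, 3> &x) {
  // Lazy: a + b + c + d only builds a CWiseBinaryOp expression tree;
  // the assignment then evaluates it in one fused pass over the data.
  x = a + b + c + d;

  // Eager: force an intermediate result after every '+', which is what the
  // eager_eval helper in bench/bench.cpp simulates -- three separate passes.
  x = a + b;
  x = x + c;
  x = x + d;
}
```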
106 | 107 | ## Development 108 | 109 | To run the tests and benchmarks on Linux: 110 | 111 | (Dependencies: CMake >= 3.14.6, clang++ >= 8, CUDA >= 9) 112 | 113 | 1. Clone [this repo](https://github.com/JHurricane96/fasttensor) 114 | 115 | 2. `mkdir build && cd build` 116 | 117 | 3. Run CMake to generate build files (detailed instructions below). Add `-DBUILD_TESTS=OFF` to skip building the tests, `-DBUILD_BENCHMARKS=OFF` to skip building the benchmarks. 118 | 119 | 4. `cmake --build .` 120 | 121 | 5. `./tests` to run the tests and `./bench/bench` to run the benchmarks 122 | 123 | ### Running CMake 124 | 125 | The build can be configured with various build options. The full command to run is: 126 | 127 | ``` 128 | CXX=<path to clang++> CC=<path to clang> cmake .. \ 129 | -DDEVICE_TYPE=<NORMAL|SIMD|GPU> -DCMAKE_BUILD_TYPE=<Release|Debug> \ 130 | -DCUDA_PATH=<path to CUDA toolkit> -DGPU_ARCH=<GPU compute capability> 131 | ``` 132 | 133 | - Use `CXX` and `CC` to set the C++ and C compilers to clang. 134 | - Set `DEVICE_TYPE` to `NORMAL` for normal CPU mode, `SIMD` to use SIMD vectorized instructions, and `GPU` to use the GPU. 135 | - Set `CMAKE_BUILD_TYPE` to `Release` or `Debug` depending on your need. 136 | - Set `CUDA_PATH` to the location of the CUDA toolkit, and `GPU_ARCH` to the GPU's CUDA compute capability (3.7 means you should set it to 37, that is, simply remove the decimal). These options are only required if `DEVICE_TYPE` is `GPU`. 137 | -------------------------------------------------------------------------------- /bench/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14.6) 2 | 3 | add_executable(bench bench.cpp) 4 | target_link_libraries(bench fasttensor rt) 5 | target_compile_options(bench PRIVATE -O3) 6 | target_compile_options(bench PRIVATE -DNDEBUG) 7 | target_compile_options(bench PRIVATE -march=native) 8 | -------------------------------------------------------------------------------- /bench/bench.cpp: -------------------------------------------------------------------------------- 1 | #include "Assign.hpp" 2 | #include "CWiseBinaryOp.hpp" 3 | #include "Tensor.hpp" 4 | #include "timer.h" 5 | #include <array> 6 | #include <cmath> 7 | #include <iostream> 8 | #include <random> 9 | 10 | using namespace std; 11 | using namespace fasttensor; 12 | 13 | const int TRIES = 10; 14 | const int REPEAT = 100; 15 | 16 | template 17 | Tensor make_rand_tensor(array dimensions) { 18 | static_assert(is_floating_point::value); 19 | static std::random_device rd; 20 | static std::mt19937 gen(rd()); 21 | static std::uniform_real_distribution dis(-100.0, 100.0); 22 | 23 | Tensor t(dimensions); 24 | auto num_elts = t.num_elements(); 25 | auto &_storage = t.storage(); 26 | for (int i = 0; i < num_elts; ++i) { 27 | _storage.storeCoeff(dis(gen), i); 28 | } 29 | return t; 30 | } 31 | 32 | double square(const double num) { return num * num; } 33 | 34 | double timer_mean(const BenchTimer &timer) { return timer.total() / TRIES; } 35 | 36 | double timer_sd(const BenchTimer &timer) { 37 | double mean_squares = timer.squared_total() / TRIES; 38 | double squared_mean = square(timer_mean(timer)); 39 | return sqrt(mean_squares - squared_mean); 40 | } 41 | 42 | void print_results(string test_name, const BenchTimer &timer, double flops_factor) { 43 | auto mean = timer_mean(timer); 44 | auto sd = timer_sd(timer); 45 | auto flops = flops_factor * REPEAT * TRIES / (pow(1024., 3) * timer.total()); 46 | std::cout << test_name << " :: Mean: " << mean << "s, SD: " << sd << "s; " << flops 47 | << " GFlops\n"; 48 | } 49 | 50 | template 51 | inline void _eager_eval(Result &result, Operand
&operand, OtherOperands &... other_operands) { 52 | result = result + operand; 53 | if constexpr (sizeof...(other_operands) > 0) { 54 | _eager_eval(result, other_operands...); 55 | } 56 | } 57 | 58 | template 59 | inline void eager_eval(Result &result, Operand1 &operand1, Operand2 &operand2, 60 | OtherOperands &... other_operands) { 61 | result = operand1 + operand2; 62 | _eager_eval(result, other_operands...); 63 | } 64 | 65 | int main() { 66 | ptrdiff_t row = 1E4, col = 1E2, dep = 1E2; 67 | array dimensions{row, col, dep}; 68 | auto a = make_rand_tensor(dimensions); 69 | auto b = make_rand_tensor(dimensions); 70 | auto c = make_rand_tensor(dimensions); 71 | auto d = make_rand_tensor(dimensions); 72 | Tensor result(dimensions); 73 | 74 | BenchTimer timer; 75 | 76 | BENCH(timer, TRIES, REPEAT, result = a + b + c + d); 77 | print_results("a+b+c+d (Lazy)", timer, double(row * col * dep * 3)); 78 | std::cout << result.getCoeff(0) << "\n"; 79 | 80 | BENCH(timer, TRIES, REPEAT, eager_eval(result, a, b, c, d)); 81 | print_results("a+b+c+d (Eager)", timer, double(row * col * dep * 3)); 82 | std::cout << result.getCoeff(0) << "\n"; 83 | 84 | return 0; 85 | } 86 | -------------------------------------------------------------------------------- /bench/timer.h: -------------------------------------------------------------------------------- 1 | // The original file is part of Eigen, a lightweight C++ template library 2 | // for linear algebra. 3 | // 4 | // Original work Copyright (C) 2008-2010 Gael Guennebaud 5 | // Original work Copyright (C) 2009 Benoit Jacob 6 | // Modified work Copyright (C) 2019 Arun Ramachandran 7 | // 8 | // This Source Code Form is subject to the terms of the Mozilla 9 | // Public License v. 2.0. If a copy of the MPL was not distributed 10 | // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 11 | 12 | #pragma once 13 | 14 | #include 15 | 16 | #if defined(_WIN32) || defined(__CYGWIN__) 17 | # ifndef NOMINMAX 18 | # define NOMINMAX 19 | # define FASTTENSOR_BT_UNDEF_NOMINMAX 20 | # endif 21 | # ifndef WIN32_LEAN_AND_MEAN 22 | # define WIN32_LEAN_AND_MEAN 23 | # define FASTTENSOR_BT_UNDEF_WIN32_LEAN_AND_MEAN 24 | # endif 25 | # include 26 | #elif defined(__APPLE__) 27 | # include 28 | #else 29 | # include 30 | # include 31 | #endif 32 | 33 | static void escape(void *p) { asm volatile("" : : "g"(p) : "memory"); } 34 | 35 | static void clobber() { asm volatile("" : : : "memory"); } 36 | 37 | enum { CPU_TIMER = 0, REAL_TIMER = 1 }; 38 | 39 | /** Elapsed time timer keeping the best try. 40 | * 41 | * On POSIX platforms we use clock_gettime with CLOCK_PROCESS_CPUTIME_ID. 
42 | * On Windows we use QueryPerformanceCounter 43 | * 44 | * Important: on linux, you must link with -lrt 45 | */ 46 | class BenchTimer { 47 | public: 48 | BenchTimer() { 49 | #if defined(_WIN32) || defined(__CYGWIN__) 50 | LARGE_INTEGER freq; 51 | QueryPerformanceFrequency(&freq); 52 | m_frequency = (double)freq.QuadPart; 53 | #endif 54 | reset(); 55 | } 56 | 57 | ~BenchTimer() {} 58 | 59 | inline void reset() { 60 | m_bests.fill(1e9); 61 | m_worsts.fill(0); 62 | m_totals.fill(0); 63 | m_squared_totals.fill(0); 64 | } 65 | inline void start() { 66 | m_starts[CPU_TIMER] = getCpuTime(); 67 | m_starts[REAL_TIMER] = getRealTime(); 68 | } 69 | inline void stop() { 70 | m_times[CPU_TIMER] = getCpuTime() - m_starts[CPU_TIMER]; 71 | m_times[REAL_TIMER] = getRealTime() - m_starts[REAL_TIMER]; 72 | m_bests[0] = std::min(m_bests[0], m_times[0]); 73 | m_bests[1] = std::min(m_bests[1], m_times[1]); 74 | m_worsts[0] = std::max(m_worsts[0], m_times[0]); 75 | m_worsts[1] = std::max(m_worsts[1], m_times[1]); 76 | m_totals[0] += m_times[0]; 77 | m_totals[1] += m_times[1]; 78 | m_squared_totals[0] += m_times[0] * m_times[0]; 79 | m_squared_totals[1] += m_times[1] * m_times[1]; 80 | } 81 | 82 | /** Return the elapsed time in seconds between the last start/stop pair 83 | */ 84 | inline double value(int TIMER = CPU_TIMER) const { return m_times[TIMER]; } 85 | 86 | /** Return the best elapsed time in seconds 87 | */ 88 | inline double best(int TIMER = CPU_TIMER) const { return m_bests[TIMER]; } 89 | 90 | /** Return the worst elapsed time in seconds 91 | */ 92 | inline double worst(int TIMER = CPU_TIMER) const { return m_worsts[TIMER]; } 93 | 94 | /** Return the total elapsed time in seconds. 95 | */ 96 | inline double total(int TIMER = CPU_TIMER) const { return m_totals[TIMER]; } 97 | 98 | /** Return the total of squares of elapsed time in seconds. 
99 | */ 100 | inline double squared_total(int TIMER = CPU_TIMER) const { return m_squared_totals[TIMER]; } 101 | 102 | inline double getCpuTime() const { 103 | #ifdef _WIN32 104 | LARGE_INTEGER query_ticks; 105 | QueryPerformanceCounter(&query_ticks); 106 | return query_ticks.QuadPart / m_frequency; 107 | #elif __APPLE__ 108 | return double(mach_absolute_time()) * 1e-9; 109 | #else 110 | timespec ts; 111 | clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); 112 | return double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec); 113 | #endif 114 | } 115 | 116 | inline double getRealTime() const { 117 | #ifdef _WIN32 118 | SYSTEMTIME st; 119 | GetSystemTime(&st); 120 | return (double)st.wSecond + 1.e-3 * (double)st.wMilliseconds; 121 | #elif __APPLE__ 122 | return double(mach_absolute_time()) * 1e-9; 123 | #else 124 | timespec ts; 125 | clock_gettime(CLOCK_REALTIME, &ts); 126 | return double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec); 127 | #endif 128 | } 129 | 130 | protected: 131 | #if defined(_WIN32) || defined(__CYGWIN__) 132 | double m_frequency; 133 | #endif 134 | std::array m_starts; 135 | std::array m_times; 136 | std::array m_bests; 137 | std::array m_worsts; 138 | std::array m_totals; 139 | std::array m_squared_totals; 140 | }; 141 | 142 | #define BENCH(TIMER, TRIES, REP, CODE) \ 143 | { \ 144 | TIMER.reset(); \ 145 | for (int uglyvarname1 = 0; uglyvarname1 < TRIES; ++uglyvarname1) { \ 146 | TIMER.start(); \ 147 | for (int uglyvarname2 = 0; uglyvarname2 < REP; ++uglyvarname2) { \ 148 | CODE; \ 149 | } \ 150 | TIMER.stop(); \ 151 | clobber(); \ 152 | } \ 153 | } 154 | 155 | // clean #defined tokens 156 | #ifdef FASTTENSOR_BT_UNDEF_NOMINMAX 157 | # undef FASTTENSOR_BT_UNDEF_NOMINMAX 158 | # undef NOMINMAX 159 | #endif 160 | 161 | #ifdef FASTTENSOR_BT_UNDEF_WIN32_LEAN_AND_MEAN 162 | # undef FASTTENSOR_BT_UNDEF_WIN32_LEAN_AND_MEAN 163 | # undef WIN32_LEAN_AND_MEAN 164 | #endif 165 | -------------------------------------------------------------------------------- /clang-format.cmake: -------------------------------------------------------------------------------- 1 | # Creates additional target to perform clang-format run, requires clang-format 2 | 3 | # Find clang format 4 | find_program(CLANG_FORMAT_EXECUTABLE "clang-format") 5 | if(NOT CLANG_FORMAT_EXECUTABLE) 6 | return() 7 | endif() 8 | 9 | # Get all project files 10 | file(GLOB_RECURSE ALL_SOURCE_FILES fasttensor/*.hpp tests/*.cpp bench/*.h bench/*.cpp) 11 | 12 | # Add target to build 13 | add_custom_target( 14 | clangformat 15 | COMMAND ${CLANG_FORMAT_EXECUTABLE} 16 | -style=file 17 | -i 18 | ${ALL_SOURCE_FILES} 19 | ) -------------------------------------------------------------------------------- /fasttensor/Assign.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Device.hpp" 4 | #include "StorageUnwrapper.hpp" 5 | #include "Tensor.hpp" 6 | 7 | #if defined FASTTENSOR_GPU 8 | # include "GpuDevice.hpp" 9 | #endif 10 | 11 | namespace fasttensor { 12 | 13 | #if defined FASTTENSOR_SIMD || defined FASTTENSOR_NORMAL 14 | 15 | template > 17 | inline void Assign(Tensor &lhs, OtherExpr const &rhs) { 18 | auto &storage = lhs.storage(); 19 | if constexpr (device_type == DeviceType::Simd && 20 | simd::PacketTraits::is_vectorizable) { 21 | auto packet_size = simd::PacketTraits::size; 22 | auto num_packets = storage.num_elements() / packet_size; 23 | for (std::ptrdiff_t i = 0; i < num_packets; ++i) { 24 | storage.storePacket(i, rhs.getPacket(i)); 25 | } 26 | for (std::ptrdiff_t i = num_packets 
* packet_size; i < storage.num_elements(); ++i) { 27 | storage.storeCoeff(rhs.getCoeff(i), i); 28 | } 29 | } else { 30 | for (std::ptrdiff_t i = 0; i < storage.num_elements(); ++i) { 31 | storage.storeCoeff(rhs.getCoeff(i), i); 32 | } 33 | } 34 | } 35 | 36 | #elif defined FASTTENSOR_GPU 37 | 38 | template > 39 | __global__ void Kernel(ElementType *lhs_storage, OtherExpr rhs, int start_offset, int end_offset) { 40 | int index = blockIdx.x * blockDim.x + threadIdx.x + start_offset; 41 | int stride = blockDim.x * gridDim.x; 42 | for (int i = index; i < end_offset; i += stride) { 43 | lhs_storage[i] = rhs.getCoeff(i); 44 | } 45 | } 46 | 47 | template > 49 | inline void Assign(Tensor &lhs, OtherExpr const &rhs) { 50 | auto unwrapped_rhs = UnwrapStorage(rhs); 51 | 52 | auto &device_props = lhs.device().deviceProps(); 53 | auto num_devices = device_props.size(); 54 | auto num_elements = lhs.num_elements(); 55 | auto num_elts_per_device_floored = num_elements / num_devices; 56 | auto partition_point = num_devices - (num_elements % num_devices); 57 | 58 | auto num_calculted_elts = 0; 59 | for (int i = 0; i < num_devices; ++i) { 60 | cudaSetDevice(i); 61 | 62 | auto &device = device_props[i]; 63 | auto block_size = device.blockSize(); 64 | auto max_blocks = device.maxBlocks(); 65 | decltype(num_elements) num_elts_current_device = 0; 66 | 67 | if (i >= partition_point) { 68 | num_elts_current_device = num_elts_per_device_floored + 1; 69 | } else { 70 | num_elts_current_device = num_elts_per_device_floored; 71 | } 72 | 73 | int num_blocks = std::min((decltype(num_elements))max_blocks, 74 | (num_elts_current_device + block_size - 1) / block_size); 75 | auto end_offset = num_calculted_elts + num_elts_current_device; 76 | 77 | Kernel<<>>(lhs.storage().elements(), unwrapped_rhs, num_calculted_elts, 78 | end_offset); 79 | 80 | num_calculted_elts = end_offset; 81 | } 82 | 83 | for (int i = 0; i < device_props.size(); ++i) { 84 | cudaSetDevice(i); 85 | cudaDeviceSynchronize(); 86 | } 87 | } 88 | 89 | #endif 90 | 91 | template 92 | template 93 | inline Tensor & 94 | Tensor::operator=(OtherExpr const &other) { 95 | Assign(*this, other); 96 | return *this; 97 | } 98 | 99 | } // namespace fasttensor 100 | -------------------------------------------------------------------------------- /fasttensor/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14.6) 2 | 3 | add_library(fasttensor INTERFACE) 4 | 5 | target_include_directories(fasttensor INTERFACE 6 | $ 7 | $ 8 | ) 9 | 10 | set(DEVICE_TYPE "SIMD" CACHE STRING "Device type") 11 | message("** Device type is set to ${DEVICE_TYPE}") 12 | 13 | set(CUDA_PATH "" CACHE PATH "Path to CUDA toolkit") 14 | set(GPU_ARCH "" CACHE STRING "CUDA GPU compute architecture") 15 | 16 | if(DEVICE_TYPE STREQUAL "SIMD") 17 | target_compile_definitions(fasttensor INTERFACE FASTTENSOR_SIMD) 18 | elseif(DEVICE_TYPE STREQUAL "GPU") 19 | target_compile_definitions(fasttensor INTERFACE FASTTENSOR_GPU) 20 | target_compile_options(fasttensor INTERFACE -x cuda -pthread --cuda-gpu-arch=sm_${GPU_ARCH} --cuda-path=${CUDA_PATH}) 21 | target_link_options(fasttensor INTERFACE -lcudart -ldl -lrt -L${CUDA_PATH}/lib64) 22 | endif() 23 | 24 | target_compile_features(fasttensor INTERFACE cxx_std_17) 25 | 26 | install(TARGETS fasttensor EXPORT fasttensor_config 27 | ARCHIVE DESTINATION lib 28 | LIBRARY DESTINATION lib 29 | RUNTIME DESTINATION bin 30 | ) 31 | 32 | install(EXPORT fasttensor_config DESTINATION lib) 33 | 
install(DIRECTORY ./ DESTINATION fasttensor) 34 | -------------------------------------------------------------------------------- /fasttensor/CWiseBinaryOp.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "GpuDeviceFunction.hpp" 4 | #include "RefSelector.hpp" 5 | #include "Simd/Simd.hpp" 6 | #include "Tensor.hpp" 7 | #include "TensorExpression.hpp" 8 | 9 | namespace fasttensor { 10 | 11 | enum class BinaryOp { Plus, Minus, Multiplies, Divides }; 12 | 13 | template 14 | class CWiseBinaryOp; 15 | 16 | template 17 | class CWiseBinaryOp : public TensorExpression { 18 | static_assert(are_tensor_exprs, 19 | "Expressions in CWiseBinaryOp must inherit from TensorExpression"); 20 | 21 | public: 22 | using LeftExprRefType = ref_selector_t; 23 | using RightExprRefType = ref_selector_t; 24 | 25 | CWiseBinaryOp(const LeftExpr &left_expr, const RightExpr &right_expr) 26 | : _left_expr(left_expr), _right_expr(right_expr) {} 27 | 28 | inline auto getPacket(std::ptrdiff_t n) const { 29 | if constexpr (Op == BinaryOp::Plus) { 30 | return simd::Add(_left_expr.getPacket(n), _right_expr.getPacket(n)); 31 | } else if constexpr (Op == BinaryOp::Minus) { 32 | return simd::Sub(_left_expr.getPacket(n), _right_expr.getPacket(n)); 33 | } else if constexpr (Op == BinaryOp::Multiplies) { 34 | return simd::Mult(_left_expr.getPacket(n), _right_expr.getPacket(n)); 35 | } else if constexpr (Op == BinaryOp::Divides) { 36 | return simd::Div(_left_expr.getPacket(n), _right_expr.getPacket(n)); 37 | } 38 | } 39 | 40 | GPU_DEVICE_FUNC inline auto getCoeff(std::ptrdiff_t n) const { 41 | if constexpr (Op == BinaryOp::Plus) { 42 | return _left_expr.getCoeff(n) + _right_expr.getCoeff(n); 43 | } else if constexpr (Op == BinaryOp::Minus) { 44 | return _left_expr.getCoeff(n) - _right_expr.getCoeff(n); 45 | } else if constexpr (Op == BinaryOp::Multiplies) { 46 | return _left_expr.getCoeff(n) * _right_expr.getCoeff(n); 47 | } else if constexpr (Op == BinaryOp::Divides) { 48 | return _left_expr.getCoeff(n) / _right_expr.getCoeff(n); 49 | } 50 | } 51 | 52 | inline LeftExprRefType leftExpr() const { return _left_expr; } 53 | 54 | inline RightExprRefType rightExpr() const { return _right_expr; } 55 | 56 | private: 57 | LeftExprRefType _left_expr; 58 | RightExprRefType _right_expr; 59 | }; 60 | 61 | template > 63 | inline auto operator+(LeftExpr const &left_expr, RightExpr const &right_expr) { 64 | return CWiseBinaryOp(left_expr, right_expr); 65 | } 66 | 67 | template > 69 | inline auto operator-(LeftExpr const &left_expr, RightExpr const &right_expr) { 70 | return CWiseBinaryOp(left_expr, right_expr); 71 | } 72 | 73 | template > 75 | inline auto operator*(LeftExpr const &left_expr, RightExpr const &right_expr) { 76 | return CWiseBinaryOp(left_expr, 77 | right_expr); 78 | } 79 | 80 | template > 82 | inline auto operator/(LeftExpr const &left_expr, RightExpr const &right_expr) { 83 | return CWiseBinaryOp(left_expr, right_expr); 84 | } 85 | 86 | } // namespace fasttensor 87 | -------------------------------------------------------------------------------- /fasttensor/DefaultDevice.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "DeviceProperties.hpp" 4 | #include 5 | 6 | namespace fasttensor { 7 | 8 | struct DefaultDevice { 9 | DefaultDevice() { device_props.emplace_back(1, 1); } 10 | 11 | private: 12 | std::vector device_props; 13 | }; 14 | 15 | } // namespace fasttensor 16 | 
-------------------------------------------------------------------------------- /fasttensor/Device.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fasttensor { 4 | 5 | enum class DeviceType { Normal, Simd, Gpu }; 6 | 7 | #if defined FASTTENSOR_GPU 8 | constexpr DeviceType device_type = DeviceType::Gpu; 9 | #elif defined FASTTENSOR_SIMD 10 | constexpr DeviceType device_type = DeviceType::Simd; 11 | #else 12 | # define FASTTENSOR_NORMAL 13 | constexpr DeviceType device_type = DeviceType::Normal; 14 | #endif 15 | 16 | } // namespace fasttensor 17 | -------------------------------------------------------------------------------- /fasttensor/DeviceFactory.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace fasttensor { 6 | 7 | template 8 | class DeviceFactory { 9 | private: 10 | static std::optional device; 11 | 12 | public: 13 | static DeviceType GetDevice() { 14 | if (!device) { 15 | device = std::make_optional(); 16 | } 17 | return *device; 18 | } 19 | }; 20 | 21 | template 22 | std::optional DeviceFactory::device = std::nullopt; 23 | 24 | } // namespace fasttensor 25 | -------------------------------------------------------------------------------- /fasttensor/DeviceProperties.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fasttensor { 4 | 5 | struct DeviceProperties { 6 | DeviceProperties(int block_size, int max_blocks) 7 | : block_size(block_size), max_blocks(max_blocks) {} 8 | inline int blockSize() const { return block_size; } 9 | 10 | inline int maxBlocks() const { return max_blocks; } 11 | 12 | private: 13 | int block_size; 14 | int max_blocks; 15 | }; 16 | 17 | } // namespace fasttensor 18 | -------------------------------------------------------------------------------- /fasttensor/GpuDevice.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "DeviceProperties.hpp" 4 | #include 5 | 6 | namespace fasttensor { 7 | 8 | struct GpuDevice { 9 | GpuDevice() { 10 | int num_devices; 11 | cudaGetDeviceCount(&num_devices); 12 | for (int i = 0; i < num_devices; i++) { 13 | cudaDeviceProp prop; 14 | cudaGetDeviceProperties(&prop, i); 15 | auto block_size = prop.maxThreadsPerBlock; 16 | auto max_blocks = prop.multiProcessorCount * prop.maxThreadsPerMultiProcessor / block_size; 17 | device_props.emplace_back(block_size, max_blocks); 18 | } 19 | } 20 | 21 | inline auto &deviceProps() const { return device_props; } 22 | 23 | private: 24 | std::vector device_props; 25 | }; 26 | 27 | } // namespace fasttensor 28 | -------------------------------------------------------------------------------- /fasttensor/GpuDeviceFunction.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Device.hpp" 4 | 5 | namespace fasttensor { 6 | 7 | #ifdef FASTTENSOR_GPU 8 | # define GPU_DEVICE_FUNC __device__ __host__ 9 | #else 10 | # define GPU_DEVICE_FUNC 11 | #endif 12 | 13 | } // namespace fasttensor 14 | -------------------------------------------------------------------------------- /fasttensor/Memory.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Simd/Simd.hpp" 4 | #include 5 | 6 | namespace fasttensor { 7 | 8 | #if defined FASTTENSOR_SIMD || defined FASTTENSOR_NORMAL 9 | 10 | 
template 11 | inline ElementType *AllocateMemory(std::ptrdiff_t num_elements) { 12 | if constexpr (simd::PacketTraits::is_vectorizable) { 13 | return reinterpret_cast( 14 | std::aligned_alloc(simd::PacketSize, sizeof(ElementType) * num_elements)); 15 | } else { 16 | return new ElementType[num_elements]; 17 | } 18 | } 19 | 20 | template 21 | inline void DeallocateMemory(ElementType *memory) { 22 | if constexpr (simd::PacketTraits::is_vectorizable) { 23 | std::free(memory); 24 | } else { 25 | delete[] memory; 26 | } 27 | } 28 | 29 | #elif defined FASTTENSOR_GPU 30 | 31 | template 32 | inline ElementType *AllocateMemory(std::ptrdiff_t num_elements) { 33 | ElementType *memory; 34 | cudaMallocManaged(&memory, num_elements * sizeof(ElementType)); 35 | return memory; 36 | } 37 | 38 | template 39 | inline void DeallocateMemory(ElementType *memory) { 40 | cudaFree(memory); 41 | } 42 | 43 | #endif 44 | 45 | } // namespace fasttensor 46 | -------------------------------------------------------------------------------- /fasttensor/RefSelector.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fasttensor { 4 | 5 | template 6 | struct ref_selector { 7 | using type = T; 8 | }; 9 | 10 | template 11 | using ref_selector_t = typename ref_selector::type; 12 | 13 | } // namespace fasttensor 14 | -------------------------------------------------------------------------------- /fasttensor/Simd/Avx2.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace fasttensor::simd { 6 | 7 | constexpr int PacketSize = 32; 8 | 9 | template 10 | constexpr int NumElementsInPacket = PacketSize / sizeof(T); 11 | 12 | template 13 | struct PacketTraits { 14 | using type = T; 15 | static constexpr bool is_vectorizable = false; 16 | static constexpr int size = 1; 17 | }; 18 | 19 | template <> 20 | struct PacketTraits { 21 | using type = __m256i; 22 | static constexpr bool is_vectorizable = true; 23 | static constexpr int size = NumElementsInPacket; 24 | }; 25 | 26 | template <> 27 | struct PacketTraits { 28 | using type = __m256; 29 | static constexpr bool is_vectorizable = true; 30 | static constexpr int size = NumElementsInPacket; 31 | }; 32 | 33 | template <> 34 | struct PacketTraits { 35 | using type = __m256d; 36 | static constexpr bool is_vectorizable = true; 37 | static constexpr int size = NumElementsInPacket; 38 | }; 39 | 40 | template 41 | inline typename PacketTraits::type Load(ScalarType *source) { 42 | if constexpr (std::is_same_v) { 43 | return _mm256_load_si256(reinterpret_cast::type *>(source)); 44 | } else if constexpr (std::is_same_v) { 45 | return _mm256_load_ps(reinterpret_cast(source)); 46 | } else if constexpr (std::is_same_v) { 47 | return _mm256_load_pd(reinterpret_cast(source)); 48 | } 49 | } 50 | 51 | template 52 | inline void Store(ScalarType *dest, typename PacketTraits::type source) { 53 | if constexpr (std::is_same_v) { 54 | _mm256_store_si256(reinterpret_cast::type *>(dest), source); 55 | } else if constexpr (std::is_same_v) { 56 | _mm256_store_ps(dest, source); 57 | } else if constexpr (std::is_same_v) { 58 | _mm256_store_pd(dest, source); 59 | } 60 | } 61 | 62 | template 63 | inline PacketType Add(PacketType left, PacketType right) { 64 | if constexpr (std::is_same_v) { 65 | return _mm256_add_epi32(left, right); 66 | } else if constexpr (std::is_same_v) { 67 | return _mm256_add_ps(left, right); 68 | } else if constexpr (std::is_same_v) { 69 | return 
_mm256_add_pd(left, right); 70 | } 71 | } 72 | 73 | template 74 | inline PacketType Sub(PacketType left, PacketType right) { 75 | if constexpr (std::is_same_v) { 76 | return _mm256_sub_epi32(left, right); 77 | } else if constexpr (std::is_same_v) { 78 | return _mm256_sub_ps(left, right); 79 | } else if constexpr (std::is_same_v) { 80 | return _mm256_sub_pd(left, right); 81 | } 82 | } 83 | 84 | template 85 | inline PacketType Mult(PacketType left, PacketType right) { 86 | if constexpr (std::is_same_v) { 87 | return _mm256_mullo_epi32(left, right); 88 | } else if constexpr (std::is_same_v) { 89 | return _mm256_mul_ps(left, right); 90 | } else if constexpr (std::is_same_v) { 91 | return _mm256_mul_pd(left, right); 92 | } 93 | } 94 | 95 | template 96 | inline PacketType Div(PacketType dividend, PacketType divisor) { 97 | if constexpr (std::is_same_v) { 98 | return _mm256_div_ps(dividend, divisor); 99 | } else if constexpr (std::is_same_v) { 100 | return _mm256_div_pd(dividend, divisor); 101 | } 102 | } 103 | 104 | } // namespace fasttensor::simd 105 | -------------------------------------------------------------------------------- /fasttensor/Simd/Generic.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fasttensor::simd { 4 | 5 | constexpr int PacketSize = 1; 6 | 7 | template 8 | constexpr int NumElementsInPacket = PacketSize / sizeof(T); 9 | 10 | template 11 | struct PacketTraits { 12 | using type = T; 13 | static constexpr bool is_vectorizable = false; 14 | static constexpr int size = 1; 15 | }; 16 | 17 | template 18 | inline T Load(T *source) { 19 | return source; 20 | } 21 | 22 | template 23 | inline void Store(T *dest, T *source) { 24 | dest = source; 25 | } 26 | 27 | template 28 | inline T Add(T *left, T *right) { 29 | return *left + *right; 30 | } 31 | 32 | template 33 | inline T Sub(T *left, T *right) { 34 | return *left - *right; 35 | } 36 | 37 | template 38 | inline T Mult(T *left, T *right) { 39 | return *left * *right; 40 | } 41 | 42 | template 43 | inline T Div(T *left, T *right) { 44 | return *left / *right; 45 | } 46 | 47 | } // namespace fasttensor::simd 48 | -------------------------------------------------------------------------------- /fasttensor/Simd/Simd.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "SimdMacros.hpp" 4 | 5 | #if SSE_INSTR_SET > 7 6 | # include "Avx2.hpp" 7 | #else 8 | # include "Generic.hpp" 9 | #endif 10 | -------------------------------------------------------------------------------- /fasttensor/Simd/SimdMacros.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fasttensor { 4 | 5 | namespace simd { 6 | 7 | #if (defined(_M_AMD64) || defined(_M_X64) || defined(__amd64)) && !defined(__x86_64__) 8 | # define __x86_64__ 1 9 | #endif 10 | 11 | // Find sse instruction set from compiler macros if SSE_INSTR_SET not defined 12 | // Note: Not all compilers define these macros automatically 13 | #ifndef SSE_INSTR_SET 14 | # if defined(__AVX2__) 15 | # define SSE_INSTR_SET 8 16 | # elif defined(__AVX__) 17 | # define SSE_INSTR_SET 7 18 | # elif defined(__SSE4_2__) 19 | # define SSE_INSTR_SET 6 20 | # elif defined(__SSE4_1__) 21 | # define SSE_INSTR_SET 5 22 | # elif defined(__SSSE3__) 23 | # define SSE_INSTR_SET 4 24 | # elif defined(__SSE3__) 25 | # define SSE_INSTR_SET 3 26 | # elif defined(__SSE2__) || defined(__x86_64__) 27 | # define SSE_INSTR_SET 2 
28 | # elif defined(__SSE__) 29 | # define SSE_INSTR_SET 1 30 | # elif defined(_M_IX86_FP) // Defined in MS compiler on 32bits system. 1: SSE, 2: SSE2 31 | # define SSE_INSTR_SET _M_IX86_FP 32 | # else 33 | # define SSE_INSTR_SET 0 34 | # endif // instruction set defines 35 | #endif // SSE_INSTR_SET 36 | 37 | } // namespace simd 38 | 39 | } // namespace fasttensor 40 | 41 | // Include the appropriate header file for intrinsic functions 42 | #if SSE_INSTR_SET > 7 // AVX2 and later 43 | # ifdef __GNUC__ 44 | # include // x86intrin.h includes header files for whatever instruction 45 | // sets are specified on the compiler command line, such as: 46 | // xopintrin.h, fma4intrin.h 47 | # else 48 | # include // MS version of immintrin.h covers AVX, AVX2 and FMA3 49 | # endif // __GNUC__ 50 | #elif SSE_INSTR_SET == 7 51 | # include // AVX 52 | #elif SSE_INSTR_SET == 6 53 | # include // SSE4.2 54 | #elif SSE_INSTR_SET == 5 55 | # include // SSE4.1 56 | #elif SSE_INSTR_SET == 4 57 | # include // SSSE3 58 | #elif SSE_INSTR_SET == 3 59 | # include // SSE3 60 | #elif SSE_INSTR_SET == 2 61 | # include // SSE2 62 | #elif SSE_INSTR_SET == 1 63 | # include // SSE 64 | #endif 65 | -------------------------------------------------------------------------------- /fasttensor/StorageUnwrapper.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "CWiseBinaryOp.hpp" 4 | #include "Tensor.hpp" 5 | #include "TensorStorageRef.hpp" 6 | 7 | namespace fasttensor { 8 | 9 | template 10 | auto inline UnwrapStorage(CWiseBinaryOp const &cwise_binary_op) { 11 | auto left_expr = UnwrapStorage(cwise_binary_op.leftExpr()); 12 | auto right_expr = UnwrapStorage(cwise_binary_op.rightExpr()); 13 | return CWiseBinaryOp(left_expr, 14 | right_expr); 15 | } 16 | 17 | template 18 | auto inline UnwrapStorage(Tensor const &tensor) { 19 | return TensorStorageRef(tensor.storage().elements()); 20 | } 21 | 22 | } // namespace fasttensor 23 | -------------------------------------------------------------------------------- /fasttensor/Tensor.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "DefaultDevice.hpp" 4 | #include "Device.hpp" 5 | #include "DeviceFactory.hpp" 6 | #include "GpuDeviceFunction.hpp" 7 | #include "RefSelector.hpp" 8 | #include "Simd/Simd.hpp" 9 | #include "TensorExpression.hpp" 10 | #include "TensorStorage.hpp" 11 | #include "UnrollUtils.hpp" 12 | 13 | #if defined FASTTENSOR_GPU 14 | # include "GpuDevice.hpp" 15 | #endif 16 | 17 | namespace fasttensor { 18 | 19 | #if defined FASTTENSOR_GPU 20 | using DefaultDeviceType = GpuDevice; 21 | #else 22 | using DefaultDeviceType = DefaultDevice; 23 | #endif 24 | 25 | template 26 | class Tensor; 27 | 28 | template 29 | constexpr bool is_tensor = false; 30 | 31 | template 32 | constexpr bool is_tensor> = true; 33 | 34 | template 35 | struct ref_selector>>> { 36 | using type = T &; 37 | }; 38 | 39 | template 40 | class Tensor : public TensorExpression { 41 | public: 42 | using TStorage = TensorStorage; 43 | using Self = Tensor; 44 | 45 | Tensor(std::array dimensions) 46 | : _storage(dimensions), _device(DeviceFactory::GetDevice()) {} 47 | 48 | Tensor(const Self &other) 49 | : _storage(other._storage), _device(DeviceFactory::GetDevice()) {} 50 | 51 | inline auto &storage() { return _storage; } 52 | 53 | inline const auto &storage() const { return _storage; } 54 | 55 | template 56 | inline ElementType &operator()(Index... 
indices) { 57 | return _storage(std::array{indices...}); 58 | } 59 | 60 | inline auto num_elements() { return _storage.num_elements(); } 61 | 62 | inline const auto &dimensions() { return _storage.dimensions(); } 63 | 64 | inline auto getPacket(std::ptrdiff_t n) const { return _storage.getPacket(n); } 65 | 66 | GPU_DEVICE_FUNC inline auto getCoeff(std::ptrdiff_t n) const { return _storage.getCoeff(n); } 67 | 68 | inline auto &device() const { return _device; } 69 | 70 | template > 71 | inline Tensor &operator=(OtherExpr const &); 72 | 73 | private: 74 | TStorage _storage; 75 | DeviceType _device; 76 | }; 77 | 78 | } // namespace fasttensor 79 | -------------------------------------------------------------------------------- /fasttensor/TensorExpression.fwd.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace fasttensor { 4 | 5 | class TensorExpression; 6 | 7 | } // namespace fasttensor 8 | -------------------------------------------------------------------------------- /fasttensor/TensorExpression.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "TensorExpression.fwd.hpp" 4 | #include 5 | 6 | namespace fasttensor { 7 | 8 | class TensorExpression {}; 9 | 10 | template 11 | constexpr bool is_tensor_expr = std::is_base_of_v; 12 | 13 | template 14 | constexpr bool are_tensor_exprs = (... && is_tensor_expr); 15 | 16 | template 17 | using enable_if_tensor_exprs = std::enable_if_t>; 18 | 19 | } // namespace fasttensor 20 | -------------------------------------------------------------------------------- /fasttensor/TensorStorage.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "GpuDeviceFunction.hpp" 4 | #include "Memory.hpp" 5 | #include "Simd/Simd.hpp" 6 | #include "UnrollUtils.hpp" 7 | #include 8 | #include 9 | 10 | namespace fasttensor { 11 | 12 | template 13 | class TensorStorage { 14 | public: 15 | using PacketType = typename simd::PacketTraits::type; 16 | int PacketSize = simd::PacketTraits::size; 17 | 18 | TensorStorage() : _dimensions(), _num_elements(0), _elements(nullptr) {} 19 | 20 | TensorStorage(std::array dimensions) : _dimensions(dimensions) { 21 | _num_elements = utils::fold(_dimensions, std::multiplies()); 22 | _elements = AllocateMemory(_num_elements); 23 | } 24 | 25 | TensorStorage(const TensorStorage &other) : TensorStorage(other._dimensions) { 26 | std::copy(other._elements, other._elements + _num_elements, _elements); 27 | } 28 | 29 | inline friend void swap(TensorStorage &first, TensorStorage &second) noexcept { 30 | using std::swap; 31 | 32 | swap(first._dimensions, second._dimensions); 33 | swap(first._num_elements, second._num_elements); 34 | swap(first._elements, second._elements); 35 | } 36 | 37 | TensorStorage(TensorStorage &&other) noexcept : TensorStorage() { swap(*this, other); } 38 | 39 | inline TensorStorage &operator=(TensorStorage other) { 40 | swap(*this, other); 41 | return *this; 42 | } 43 | 44 | inline auto elements() { return _elements; } 45 | 46 | inline auto elements() const { return _elements; } 47 | 48 | inline auto num_elements() { return _num_elements; } 49 | 50 | inline const auto &dimensions() { return _dimensions; } 51 | 52 | inline PacketType getPacket(std::ptrdiff_t index) const { 53 | return simd::Load(&_elements[index * PacketSize]); 54 | } 55 | 56 | inline void storePacket(std::ptrdiff_t index, PacketType packet) { 57 | 
simd::Store(&_elements[index * PacketSize], packet); 58 | } 59 | 60 | inline const ElementType &getCoeff(std::ptrdiff_t index) const { return _elements[index]; } 61 | 62 | inline const ElementType &getCoeff(std::array indices) const { 63 | return _elements[utils::getIndex(_dimensions, indices)]; 64 | } 65 | 66 | inline ElementType &operator()(std::array indices) { 67 | return _elements[utils::getIndex(_dimensions, indices)]; 68 | } 69 | 70 | inline void storeCoeff(ElementType element, std::ptrdiff_t index) { _elements[index] = element; } 71 | 72 | ~TensorStorage() { DeallocateMemory(_elements); } 73 | 74 | private: 75 | std::array _dimensions; 76 | std::ptrdiff_t _num_elements; 77 | ElementType *_elements; 78 | }; 79 | 80 | } // namespace fasttensor 81 | -------------------------------------------------------------------------------- /fasttensor/TensorStorageRef.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "GpuDeviceFunction.hpp" 4 | #include "TensorExpression.hpp" 5 | 6 | namespace fasttensor { 7 | 8 | template 9 | class TensorStorageRef : TensorExpression { 10 | public: 11 | TensorStorageRef(ElementType *elements) : _elements(elements) {} 12 | 13 | GPU_DEVICE_FUNC inline const auto &getCoeff(std::ptrdiff_t index) const { 14 | return _elements[index]; 15 | } 16 | 17 | inline auto elements() { return _elements; } 18 | 19 | inline auto elements() const { return _elements; } 20 | 21 | private: 22 | ElementType *_elements; 23 | }; 24 | 25 | } // namespace fasttensor 26 | -------------------------------------------------------------------------------- /fasttensor/UnrollUtils.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace fasttensor::utils { 6 | 7 | template 8 | inline Scalar fold(const std::array &list, const Op &op) { 9 | if constexpr (iter == 0) { 10 | return list[0]; 11 | } else { 12 | return op(list[iter], fold(list, op)); 13 | } 14 | } 15 | 16 | template 17 | inline std::ptrdiff_t getIndex(const std::array &dimensions, 18 | const std::array &indices) { 19 | if constexpr (iter == 0) { 20 | return indices[0]; 21 | } else { 22 | return indices[iter] + dimensions[iter] * getIndex(dimensions, indices); 23 | } 24 | } 25 | 26 | } // namespace fasttensor::utils 27 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14.6) 2 | 3 | SET(SOURCE_FILES 4 | cwiseops.cpp 5 | test.cpp) 6 | 7 | add_executable(tests ${SOURCE_FILES}) 8 | target_link_libraries(tests fasttensor gtest) 9 | 10 | target_compile_features(tests PRIVATE cxx_std_17) 11 | 12 | target_compile_options(tests PRIVATE -march=native) 13 | 14 | install(TARGETS tests DESTINATION bin) -------------------------------------------------------------------------------- /tests/cwiseops.cpp: -------------------------------------------------------------------------------- 1 | #include "Assign.hpp" 2 | #include "CWiseBinaryOp.hpp" 3 | #include "Tensor.hpp" 4 | #include "gtest/gtest.h" 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | using namespace fasttensor; 10 | 11 | template 12 | Tensor CreateTensor(array dimensions) { 13 | Tensor t(dimensions); 14 | auto num_elts = t.num_elements(); 15 | auto &_storage = t.storage(); 16 | for (int i = 0; i < num_elts; ++i) { 17 | _storage.storeCoeff(i, i); 18 | } 19 | 
return t; 20 | } 21 | 22 | TEST(CWiseOps, AddInt) { 23 | int n_i = 10, n_j = 11, n_k = 13; 24 | array dimensions{n_i, n_j, n_k}; 25 | auto a = CreateTensor(dimensions); 26 | auto b = CreateTensor(dimensions); 27 | Tensor result(dimensions); 28 | result = a + b; 29 | for (int i = 0; i < n_i; ++i) { 30 | for (int j = 0; j < n_j; ++j) { 31 | for (int k = 0; k < n_k; ++k) { 32 | EXPECT_EQ(result(i, j, k), 2 * (k + n_k * (j + (n_j * i)))); 33 | } 34 | } 35 | } 36 | } 37 | 38 | TEST(CWiseOps, SubInt) { 39 | int n_i = 10, n_j = 11, n_k = 13; 40 | array dimensions{n_i, n_j, n_k}; 41 | auto a = CreateTensor(dimensions); 42 | auto b = CreateTensor(dimensions); 43 | auto result = CreateTensor(dimensions); 44 | result = a - b; 45 | for (int i = 0; i < n_i; ++i) { 46 | for (int j = 0; j < n_j; ++j) { 47 | for (int k = 0; k < n_k; ++k) { 48 | EXPECT_EQ(result(i, j, k), 0); 49 | } 50 | } 51 | } 52 | } 53 | 54 | TEST(CWiseOps, MultInt) { 55 | int n_i = 10, n_j = 11, n_k = 13; 56 | array dimensions{n_i, n_j, n_k}; 57 | auto a = CreateTensor(dimensions); 58 | auto b = CreateTensor(dimensions); 59 | Tensor result(dimensions); 60 | result = a * b; 61 | for (int i = 0; i < n_i; ++i) { 62 | for (int j = 0; j < n_j; ++j) { 63 | for (int k = 0; k < n_k; ++k) { 64 | auto elt = (k + n_k * (j + (n_j * i))); 65 | EXPECT_EQ(result(i, j, k), elt * elt); 66 | } 67 | } 68 | } 69 | } 70 | 71 | TEST(CWiseOps, AddFloat) { 72 | int n_i = 10, n_j = 11, n_k = 13; 73 | array dimensions{n_i, n_j, n_k}; 74 | auto a = CreateTensor(dimensions); 75 | auto b = CreateTensor(dimensions); 76 | Tensor result(dimensions); 77 | result = a + b; 78 | for (int i = 0; i < n_i; ++i) { 79 | for (int j = 0; j < n_j; ++j) { 80 | for (int k = 0; k < n_k; ++k) { 81 | EXPECT_FLOAT_EQ(result(i, j, k), 2 * (k + n_k * (j + (n_j * i)))); 82 | } 83 | } 84 | } 85 | } 86 | 87 | TEST(CWiseOps, SubFloat) { 88 | int n_i = 10, n_j = 11, n_k = 13; 89 | array dimensions{n_i, n_j, n_k}; 90 | auto a = CreateTensor(dimensions); 91 | auto b = CreateTensor(dimensions); 92 | auto result = CreateTensor(dimensions); 93 | result = a - b; 94 | for (int i = 0; i < n_i; ++i) { 95 | for (int j = 0; j < n_j; ++j) { 96 | for (int k = 0; k < n_k; ++k) { 97 | EXPECT_FLOAT_EQ(result(i, j, k), 0); 98 | } 99 | } 100 | } 101 | } 102 | 103 | TEST(CWiseOps, MultFloat) { 104 | int n_i = 10, n_j = 11, n_k = 13; 105 | array dimensions{n_i, n_j, n_k}; 106 | auto a = CreateTensor(dimensions); 107 | auto b = CreateTensor(dimensions); 108 | Tensor result(dimensions); 109 | result = a * b; 110 | for (int i = 0; i < n_i; ++i) { 111 | for (int j = 0; j < n_j; ++j) { 112 | for (int k = 0; k < n_k; ++k) { 113 | auto elt = (k + n_k * (j + (n_j * i))); 114 | EXPECT_FLOAT_EQ(result(i, j, k), elt * elt); 115 | } 116 | } 117 | } 118 | } 119 | 120 | TEST(CWiseOps, DivFloat) { 121 | int n_i = 10, n_j = 11, n_k = 13; 122 | array dimensions{n_i, n_j, n_k}; 123 | auto a = CreateTensor(dimensions); 124 | auto b = CreateTensor(dimensions); 125 | a(0, 0, 0) = 1; 126 | b(0, 0, 0) = 1; 127 | Tensor result(dimensions); 128 | result = a / b; 129 | for (int i = 0; i < n_i; ++i) { 130 | for (int j = 0; j < n_j; ++j) { 131 | for (int k = 0; k < n_k; ++k) { 132 | EXPECT_FLOAT_EQ(result(i, j, k), 1); 133 | } 134 | } 135 | } 136 | } 137 | 138 | TEST(CWiseOps, MultipleOpsFloat) { 139 | int n_i = 10, n_j = 11, n_k = 13; 140 | array dimensions{n_i, n_j, n_k}; 141 | auto a = CreateTensor(dimensions); 142 | auto b = CreateTensor(dimensions); 143 | auto c = CreateTensor(dimensions); 144 | auto result = 
CreateTensor(dimensions); 145 | 146 | result = a + b - c; 147 | for (int i = 0; i < n_i; ++i) { 148 | for (int j = 0; j < n_j; ++j) { 149 | for (int k = 0; k < n_k; ++k) { 150 | EXPECT_FLOAT_EQ(result(i, j, k), k + n_k * (j + (n_j * i))); 151 | } 152 | } 153 | } 154 | 155 | result = a + b + c; 156 | for (int i = 0; i < n_i; ++i) { 157 | for (int j = 0; j < n_j; ++j) { 158 | for (int k = 0; k < n_k; ++k) { 159 | EXPECT_FLOAT_EQ(result(i, j, k), 3 * (k + n_k * (j + (n_j * i)))); 160 | } 161 | } 162 | } 163 | } 164 | 165 | TEST(CWiseOps, AddDouble) { 166 | int n_i = 10, n_j = 11, n_k = 13; 167 | array dimensions{n_i, n_j, n_k}; 168 | auto a = CreateTensor(dimensions); 169 | auto b = CreateTensor(dimensions); 170 | Tensor result(dimensions); 171 | result = a + b; 172 | for (int i = 0; i < n_i; ++i) { 173 | for (int j = 0; j < n_j; ++j) { 174 | for (int k = 0; k < n_k; ++k) { 175 | EXPECT_DOUBLE_EQ(result(i, j, k), 2 * (k + n_k * (j + (n_j * i)))); 176 | } 177 | } 178 | } 179 | } 180 | 181 | TEST(CWiseOps, SubDouble) { 182 | int n_i = 10, n_j = 11, n_k = 13; 183 | array dimensions{n_i, n_j, n_k}; 184 | auto a = CreateTensor(dimensions); 185 | auto b = CreateTensor(dimensions); 186 | auto result = CreateTensor(dimensions); 187 | result = a - b; 188 | for (int i = 0; i < n_i; ++i) { 189 | for (int j = 0; j < n_j; ++j) { 190 | for (int k = 0; k < n_k; ++k) { 191 | EXPECT_DOUBLE_EQ(result(i, j, k), 0); 192 | } 193 | } 194 | } 195 | } 196 | 197 | TEST(CWiseOps, MultDouble) { 198 | int n_i = 10, n_j = 11, n_k = 13; 199 | array dimensions{n_i, n_j, n_k}; 200 | auto a = CreateTensor(dimensions); 201 | auto b = CreateTensor(dimensions); 202 | Tensor result(dimensions); 203 | result = a * b; 204 | for (int i = 0; i < n_i; ++i) { 205 | for (int j = 0; j < n_j; ++j) { 206 | for (int k = 0; k < n_k; ++k) { 207 | auto elt = (k + n_k * (j + (n_j * i))); 208 | EXPECT_DOUBLE_EQ(result(i, j, k), elt * elt); 209 | } 210 | } 211 | } 212 | } 213 | 214 | TEST(CWiseOps, DivDouble) { 215 | int n_i = 10, n_j = 11, n_k = 13; 216 | array dimensions{n_i, n_j, n_k}; 217 | auto a = CreateTensor(dimensions); 218 | auto b = CreateTensor(dimensions); 219 | a(0, 0, 0) = 1; 220 | b(0, 0, 0) = 1; 221 | Tensor result(dimensions); 222 | result = a / b; 223 | for (int i = 0; i < n_i; ++i) { 224 | for (int j = 0; j < n_j; ++j) { 225 | for (int k = 0; k < n_k; ++k) { 226 | EXPECT_DOUBLE_EQ(result(i, j, k), 1); 227 | } 228 | } 229 | } 230 | } 231 | -------------------------------------------------------------------------------- /tests/test.cpp: -------------------------------------------------------------------------------- 1 | #include "CWiseBinaryOp.hpp" 2 | #include "Simd/Simd.hpp" 3 | #include "Tensor.hpp" 4 | #include "TensorExpression.hpp" 5 | #include "gtest/gtest.h" 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | using namespace fasttensor; 11 | 12 | int main(int argc, char **argv) { 13 | ::testing::InitGoogleTest(&argc, argv); 14 | 15 | cout << simd::PacketTraits::size << endl; 16 | cout << simd::PacketTraits::size << endl; 17 | 18 | cout << "Device Type: " << static_cast(device_type) << endl; 19 | 20 | Tensor t(array{4, 2}); 21 | Tensor q(array{4, 2}); 22 | auto result = q + t; 23 | return RUN_ALL_TESTS(); 24 | } 25 | --------------------------------------------------------------------------------