├── .clang-format ├── .clang_complete ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── apps ├── CMakeLists.txt ├── address_packer_search.cpp ├── baselines.hpp ├── gflops.cpp ├── loop_nest.cpp ├── loop_nest_baseline.hpp ├── loop_nest_bench.cpp ├── loop_nest_bench.hpp ├── loop_nest_fp16.cpp ├── loop_nest_stress_test.cpp ├── loop_nest_tensillica.cpp ├── loop_nest_test.hpp ├── loop_nest_tests.cpp ├── loop_tree.cpp ├── serialization.cpp ├── tensillica_play.cpp ├── transposer.cpp ├── transposer_baseline.hpp ├── transposer_bench.hpp └── transposer_stress_test.cpp ├── assets └── logo │ ├── fulllogo.jpg │ ├── fulllogo.png │ ├── fulllogo_nobuffer.jpg │ ├── fulllogo_nobuffer.png │ ├── fulllogo_transparent.png │ ├── fulllogo_transparent_nobuffer.png │ ├── grayscale.png │ ├── grayscale_nobuffer.png │ ├── grayscale_transparent.png │ ├── grayscale_transparent_nobuffer.png │ ├── icononly.png │ ├── icononly_nobuffer.png │ ├── icononly_transparent.png │ ├── icononly_transparent_nobuffer.png │ ├── icononly_transparent_nobuffer_padded.png │ ├── print.eps │ ├── print.pdf │ ├── print.svg │ ├── print_transparent.eps │ ├── print_transparent.pdf │ ├── print_transparent.svg │ ├── textonly.png │ └── textonly_nobuffer.png ├── cmake ├── aarch64 │ └── dabun.cmake └── x86_64 │ └── dabun.cmake ├── dabun_config.hpp.in ├── include └── dabun │ ├── aligned_vector.hpp │ ├── arithmetic_operation.hpp │ ├── arm │ ├── arithmetic_operation.hpp │ ├── configuration.hpp │ ├── elementwise_operation.hpp │ ├── loop_nest.hpp │ ├── loop_nest_fp16.hpp │ ├── meta_mnemonics.hpp │ ├── multi_vreg.hpp │ ├── peak_gflops.hpp │ ├── transposer.hpp │ └── xbyak.hpp │ ├── check.hpp │ ├── code_generator.hpp │ ├── code_generator │ ├── aot_fn.hpp │ ├── code_generator.hpp │ ├── memory_resource.hpp │ └── xbyak.hpp │ ├── common.hpp │ ├── configuration.hpp │ ├── core.hpp │ ├── elementwise_operation.hpp │ ├── hask │ └── apple.hpp │ ├── isa.hpp │ ├── 
loop_nest.hpp │ ├── loop_nest_descriptor.hpp │ ├── loop_tree │ ├── all_nodes.hpp │ ├── compiled_loop_nest_node.hpp │ ├── compiled_transpose_node.hpp │ ├── compute_node.hpp │ ├── for_loop_node.hpp │ ├── nested_for_loops_node.hpp │ ├── node.hpp │ ├── program.hpp │ ├── report.hpp │ ├── transpose_node.hpp │ ├── types.hpp │ └── utility.hpp │ ├── math.hpp │ ├── numeric.hpp │ ├── one_constant.hpp │ ├── peak_gflops.hpp │ ├── predef.hpp │ ├── random_vector.hpp │ ├── serialization.hpp │ ├── tensillica │ ├── cpp_intrinsics_code_generator.hpp │ ├── dl_compiled_fn.hpp │ ├── loop_nest.hpp │ ├── multi_vmm.hpp │ └── peak_gflops.hpp │ ├── transposer.hpp │ ├── utility │ ├── log.hpp │ ├── most_frequent_queue.hpp │ └── tmp_file_name.hpp │ └── x86 │ ├── address_packer.hpp │ ├── arithmetic_operation.hpp │ ├── configuration.hpp │ ├── denormals.hpp │ ├── elementwise_operation.hpp │ ├── loop_nest.hpp │ ├── multi_vmm.hpp │ ├── peak_gflops.hpp │ ├── transposer.hpp │ └── xbyak.hpp ├── src ├── loop_nest.cpp ├── peak_gflops.cpp ├── transposer.cpp └── x86 │ └── multi_vmm.cpp ├── tests ├── CMakeLists.txt ├── baseline │ ├── loop_nest_baseline.hpp │ └── matrix_transpose_baseline.hpp ├── catch2_main.cpp ├── handpicked_loop_nest_test.cpp ├── sentinel.cpp └── transpose_meta_mnemonics.cpp └── utils.h /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | IndentWidth: 4 3 | --- 4 | Language: Cpp 5 | # Force pointers to the type for C++. 
6 | DerivePointerAlignment: false 7 | PointerAlignment: Left 8 | # QualifierAlignmentStyle: Right 9 | # ReferenceAlignmentStyle: Left 10 | 11 | UseTab: Never 12 | IndentWidth: 4 13 | BreakBeforeBraces: Allman 14 | AllowShortIfStatementsOnASingleLine: false 15 | IndentCaseLabels: false 16 | ColumnLimit: 80 17 | AccessModifierOffset: -4 18 | AlignConsecutiveAssignments: true 19 | AlignConsecutiveDeclarations: true 20 | AlignOperands: true 21 | BreakBeforeBraces: Allman 22 | AlwaysBreakTemplateDeclarations: true 23 | BreakConstructorInitializersBeforeComma: true 24 | IndentPPDirectives: AfterHash -------------------------------------------------------------------------------- /.clang_complete: -------------------------------------------------------------------------------- 1 | -I. 2 | -I./xbyak -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | loop_nest 34 | 35 | build 36 | cmake-build-debug 37 | cmake-build-release 38 | *.asm 39 | tmp 40 | .idea 41 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "xbyak"] 2 | path = extern/xbyak 3 | url = https://github.com/herumi/xbyak 4 | [submodule "xbyak_aarch64"] 5 | path = extern/xbyak_aarch64 6 | url = https://github.com/zlateski/xbyak_aarch64 7 | branch = fjmaster 8 | [submodule "extern/fmt"] 9 | path = extern/fmt 10 | url = 
https://github.com/fmtlib/fmt 11 | [submodule "extern/Catch2"] 12 | path = extern/Catch2 13 | url = https://github.com/catchorg/Catch2 14 | branch = v2.x 15 | [submodule "extern/cpuinfo"] 16 | path = extern/cpuinfo 17 | url = https://github.com/pytorch/cpuinfo 18 | [submodule "extern/libsysml"] 19 | path = extern/libsysml 20 | url = git@github.com:facebookresearch/libsysml.git 21 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.15.0) 2 | 3 | add_subdirectory(extern/Catch2) 4 | 5 | add_subdirectory(extern/libsysml/cpp) 6 | 7 | 8 | # TODO: This is probably a very bad and non-standard solution, figure 9 | # out what's standard and/or better 10 | 11 | get_target_property(LIBSYSML_INCLUDES sysmlcpp INCLUDE_DIRECTORIES) 12 | 13 | # foreach(dir ${LIBSYSML_INCLUDES}) 14 | # message(STATUS "Including libsysml include dir: ${dir}") 15 | # include_directories(${dir}) 16 | #endforeach() 17 | 18 | 19 | message(STATUS "${libsysml_INCLUDE_DIRS} ????") 20 | include_directories(${libsysml_INCLUDE_DIRS}) 21 | 22 | # SET (CMAKE_CXX_COMPILER "/usr/bin/clang++") 23 | 24 | project(dabun 25 | LANGUAGES CXX 26 | VERSION 0.0.0) 27 | 28 | configure_file(dabun_config.hpp.in config/dabun_config.hpp) 29 | 30 | set(CMAKE_CXX_STANDARD 20) 31 | 32 | option(DABUN_DEBUG "Set to ON to build debug version" OFF) 33 | option(DABUN_DEBUG_WERROR "Set to ON to enable all warnings in debug mode" ON) 34 | option(DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX "Set to ON to build apps for all supported extensions" ON) 35 | option(DABUN_BUILD_TESTS_FOR_ALL_ARCH_VEX "Set to ON to build tests for all supported extensions" ON) 36 | 37 | if (DABUN_DEBUG) 38 | message(STATUS "Will compile libdabun in debug mode.") 39 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") 40 | if (DABUN_DEBUG_WERROR) 41 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror 
-Wno-sign-compare") 42 | endif() 43 | else() 44 | message(STATUS "Will compile libdabun in release mode.") 45 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG -O3 -Wall -Wextra -Werror -Wno-sign-compare") 46 | endif() 47 | 48 | set(DABUN_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) 49 | set(DABUN_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) 50 | set(DABUN_THIRDPARTY_DIR ${DABUN_BINARY_DIR}/extern) 51 | 52 | ## 53 | ## Find Boost 54 | ## 55 | set(Boost_USE_STATIC_LIBS ON) # only find static libs 56 | set(Boost_USE_DEBUG_LIBS ${DABUN_DEBUG}) # use debug libs only when DABUN_DEBUG is ON 57 | set(Boost_USE_RELEASE_LIBS ON) # always allow release libs 58 | set(Boost_USE_MULTITHREADED ON) 59 | set(Boost_USE_STATIC_RUNTIME OFF) 60 | find_package(Boost 1.66.0) 61 | if(Boost_FOUND) 62 | include_directories(${Boost_INCLUDE_DIRS}) 63 | else() 64 | message(FATAL_ERROR "Can't find boost libraries") 65 | endif() 66 | 67 | ## 68 | ## Find Cpuinfo 69 | ## 70 | if(NOT TARGET cpuinfo) 71 | # Use the bundled cpuinfo submodule if CPUINFO_SOURCE_DIR is not specified. 
72 | if(NOT DEFINED CPUINFO_SOURCE_DIR) 73 | set(CPUINFO_SOURCE_DIR "${DABUN_SOURCE_DIR}/extern/cpuinfo" 74 | CACHE STRING "cpuinfo source directory from submodules") 75 | endif() 76 | 77 | #build cpuinfo 78 | set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL "Do not build cpuinfo unit tests") 79 | set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL "Do not build cpuinfo mock tests") 80 | set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL "Do not build cpuinfo benchmarks") 81 | set(CPUINFO_LIBRARY_TYPE static CACHE STRING "Set lib type to static") 82 | #Select static runtime, needed for static build for MSVC 83 | set(CPUINFO_RUNTIME_TYPE static CACHE STRING "Set runtime to static") 84 | 85 | add_subdirectory("${CPUINFO_SOURCE_DIR}" "${DABUN_BINARY_DIR}/cpuinfo") 86 | set_property(TARGET cpuinfo PROPERTY POSITION_INDEPENDENT_CODE ON) 87 | endif() 88 | 89 | # include_directories(${CPUINFO_SOURCE_DIR}/include) 90 | 91 | include_directories(extern/libsysml/cpp/include) 92 | 93 | ## 94 | ## Detect host architecture 95 | ## 96 | set(DABUN_HOST_ARCHITECTURE_SUPPORTED x86_64 aarch64 arm64) 97 | 98 | execute_process(COMMAND uname -m 99 | COMMAND tr -d '\n' 100 | OUTPUT_VARIABLE DABUN_HOST_ARCHITECTURE) 101 | 102 | if("${DABUN_HOST_ARCHITECTURE}" STREQUAL "arm64") 103 | set(DABUN_HOST_ARCHITECTURE aarch64) 104 | endif() 105 | 106 | 107 | message(STATUS "Host architecture detected: ${DABUN_HOST_ARCHITECTURE}") 108 | 109 | set(DABUN_COMMON_SRC_CPP_FILES 110 | src/loop_nest.cpp 111 | src/transposer.cpp 112 | src/peak_gflops.cpp) 113 | 114 | if("${DABUN_HOST_ARCHITECTURE}" STREQUAL "x86_64") 115 | include(cmake/x86_64/dabun.cmake) 116 | elseif("${DABUN_HOST_ARCHITECTURE}" STREQUAL "aarch64") 117 | include(cmake/aarch64/dabun.cmake) 118 | else() 119 | message(FATAL_ERROR "Host architecture ${DABUN_HOST_ARCHITECTURE} not supported.") 120 | endif() 121 | 122 | add_subdirectory(apps) 123 | add_subdirectory(tests) 124 | -------------------------------------------------------------------------------- 
/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 
39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at <opensource-conduct@fb.com>. All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to loop_nest 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: <https://code.facebook.com/cla> 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to loop_nest, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Meta Platforms, Inc. and affiliates. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![dabun logo](/assets/logo/icononly_transparent_nobuffer.png) 2 | -------------------------------------------------------------------------------- /apps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(DABUN_APPS_DIR ${CMAKE_CURRENT_SOURCE_DIR}) 2 | 3 | function(dabun_extension_specific_app name vex float isa) 4 | message(STATUS "dabun_extension_specific_app ${name}.${vex}.${float} ${name}.cpp") 5 | add_executable(${name}.${vex}.${float} ${name}.cpp) 6 | target_link_libraries(${name}.${vex}.${float} 7 | PUBLIC dabun 8 | PUBLIC -ldl 9 | PUBLIC -lpthread) 10 | target_compile_options(${name}.${vex}.${float} 11 | PRIVATE "-DDABUN_ISA=${isa}" 12 | PRIVATE "-DDABUN_VEX=extension::${vex}" 13 | PRIVATE "-DDABUN_ARITHMETIC=dabun::${float}") 14 | endfunction(dabun_extension_specific_app) 15 | 16 | function(dabun_app name) 17 | message(STATUS "dabun_app ${name}.cpp ${name}.cpp") 18 | add_executable(${name} ${name}.cpp) 19 | target_link_libraries(${name} 20 | PUBLIC dabun 21 | PUBLIC -ldl 22 | PUBLIC -lpthread) 23 | endfunction(dabun_app) 24 | 25 | function(dabun_x86_apps vex float isa) 26 | dabun_extension_specific_app(address_packer_search ${vex} ${float} ${isa}) 27 | dabun_extension_specific_app(gflops ${vex} ${float} ${isa}) 28 | dabun_extension_specific_app(loop_nest ${vex} ${float} ${isa}) 29 | dabun_extension_specific_app(loop_nest_bench ${vex} ${float} ${isa}) 30 | dabun_extension_specific_app(loop_nest_stress_test ${vex} ${float} ${isa}) 31 | dabun_extension_specific_app(loop_nest_tests ${vex} ${float} ${isa}) 32 | dabun_extension_specific_app(loop_tree ${vex} ${float} ${isa}) 33 | dabun_extension_specific_app(serialization ${vex} ${float} ${isa}) 34 | dabun_extension_specific_app(transposer ${vex} ${float} ${isa}) 35 | 
dabun_extension_specific_app(transposer_stress_test ${vex} ${float} ${isa}) 36 | endfunction(dabun_x86_apps) 37 | 38 | function(dabun_arm_apps vex float isa) 39 | dabun_extension_specific_app(gflops ${vex} ${float} ${isa}) 40 | dabun_extension_specific_app(loop_nest ${vex} ${float} ${isa}) 41 | dabun_extension_specific_app(loop_nest_bench ${vex} ${float} ${isa}) 42 | dabun_extension_specific_app(loop_nest_stress_test ${vex} ${float} ${isa}) 43 | dabun_extension_specific_app(loop_nest_tests ${vex} ${float} ${isa}) 44 | # dabun_extension_specific_app(loop_tree ${vex} ${float} ${isa}) 45 | # dabun_extension_specific_app(serialization ${vex} ${float} ${isa}) 46 | # dabun_extension_specific_app(transposer ${vex} ${float} ${isa}) 47 | # dabun_extension_specific_app(transposer_stress_test ${vex} ${float} ${isa}) 48 | endfunction(dabun_arm_apps) 49 | 50 | dabun_app(tensillica_play) 51 | dabun_app(loop_nest_tensillica) 52 | 53 | if("${DABUN_HOST_ARCHITECTURE}" STREQUAL "x86_64") 54 | if (DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX OR DABUN_BUILD_APPS_FOR_AVX2) 55 | dabun_x86_apps(avx2 fp32_t avx2) 56 | endif() 57 | if (DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX OR DABUN_BUILD_APPS_FOR_AVX512) 58 | dabun_x86_apps(avx512 fp32_t avx512) 59 | dabun_extension_specific_app(gflops avx512_ymm fp32_t avx2_plus) 60 | endif() 61 | elseif("${DABUN_HOST_ARCHITECTURE}" STREQUAL "aarch64") 62 | if (DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX OR DABUN_BUILD_APPS_FOR_NEON) 63 | dabun_arm_apps(neon fp32_t aarch64) 64 | endif() 65 | if (DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX OR DABUN_BUILD_APPS_FOR_NEON_FP16) 66 | dabun_arm_apps(neon_fp16 fp16_t aarch64) 67 | endif() 68 | elseif("${DABUN_HOST_ARCHITECTURE}" STREQUAL "arm64") 69 | if (DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX OR DABUN_BUILD_APPS_FOR_NEON) 70 | dabun_arm_apps(neon fp32_t aarch64) 71 | endif() 72 | if (DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX OR DABUN_BUILD_APPS_FOR_NEON_FP16) 73 | dabun_arm_apps(neon_fp16 fp16_t aarch64) 74 | endif() 75 | else() 
76 | endif() 77 | -------------------------------------------------------------------------------- /apps/address_packer_search.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #include 7 | 8 | #include "dabun/x86/xbyak.hpp" 9 | /* Builds an encoding-size probe: the returned lambda assembles a single vfmadd231ps zmm0, zmm1, [Reg64(reg_idx) + disp] into a fresh Xbyak::CodeGenerator and returns cg.getSize(). With is_broadcast the memory operand is cg.ptr_b and disp = off * 0x4; otherwise it is cg.ptr and disp = off * 0x40. */ 10 | auto get_size(bool is_broadcast, int reg_idx) 11 | { 12 | return [=](int off) 13 | { 14 | Xbyak::CodeGenerator cg; 15 | if (is_broadcast) 16 | { 17 | cg.vfmadd231ps(cg.zmm0, cg.zmm1, 18 | cg.ptr_b[Xbyak::Reg64(reg_idx) + off * 0x4]); 19 | } 20 | else 21 | { 22 | cg.vfmadd231ps(cg.zmm0, cg.zmm1, 23 | cg.ptr[Xbyak::Reg64(reg_idx) + off * 0x40]); 24 | } 25 | return cg.getSize(); 26 | }; 27 | } 28 | /* Encoding-size probe for a single vmovups load into zmm0: the returned lambda uses cg.ptr_b with off * 0x4 when is_broadcast, cg.ptr with off * 0x40 otherwise, and returns cg.getSize(). */ 29 | auto get_size2(bool is_broadcast, int reg_idx) 30 | { 31 | return [=](int off) 32 | { 33 | Xbyak::CodeGenerator cg; 34 | if (is_broadcast) 35 | { 36 | cg.vmovups(cg.zmm0, cg.ptr_b[Xbyak::Reg64(reg_idx) + off * 0x4]); 37 | } 38 | else 39 | { 40 | cg.vmovups(cg.zmm0, cg.ptr[Xbyak::Reg64(reg_idx) + off * 0x40]); 41 | } 42 | return cg.getSize(); 43 | }; 44 | } 45 | /* Recursive bisection over the half-open range [begin, end). A probe with f(mid) == s continues in [mid + 1, end); any other value continues in [begin, mid). Returns the index at which the range becomes empty. Presumably callers rely on f equalling s on a leading run of the range, so the result is the first index where f differs from s -- confirm. */ 46 | template 47 | int binary_search(F const& f, int begin, int end, int s) 48 | { 49 | if (begin == end) 50 | { 51 | return begin; 52 | } 53 | 54 | int mid = begin + (end - begin) / 2; 55 | 56 | if (f(mid) == s) 57 | { 58 | return binary_search(f, mid + 1, end, s); 59 | } 60 | else 61 | { 62 | return binary_search(f, begin, mid, s); 63 | } 64 | } 65 | /* Prints f(0) and f(1) to std::cout, then bisects [1, 0xFFFFFF) for the index where f stops returning f(1) and returns that index. */ 66 | template 67 | int do_search(F const& f) 68 | { 69 | std::cout << "F(0) = " << f(0) << "; F(1) = " << f(1) << "\n"; 70 | int s = f(1); 71 | auto ret = binary_search(f, 1, 0xFFFFFF, s); 72 | return ret; 73 | } 74 | /* Prints "line ", the probe result at offsets 0 and 10, and the do_search result for the broadcast vmovups probe built with register index 0. */ 75 | int main() 76 | { 77 | auto fn = get_size2(true, 0); 78 | 79 | std::cout << "line "; 80 | std::cout << fn(0) << ' '; 81 | std::cout << fn(10) << ' '; 82 | 
std::cout << do_search(fn) << ' '; 83 | std::cout << std::endl; 84 | } 85 | -------------------------------------------------------------------------------- /apps/gflops.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #include "dabun/numeric.hpp" 7 | #include "dabun/peak_gflops.hpp" 8 | 9 | #include 10 | 11 | using namespace dabun; 12 | /* Fallback element type and ISA used when the build does not define them on the command line. */ 13 | #ifndef DABUN_ARITHMETIC 14 | # define DABUN_ARITHMETIC float 15 | #endif 16 | 17 | #ifndef DABUN_ISA 18 | # define DABUN_ISA avx2 19 | #endif 20 | /* Prints the value returned by measure_peak_gflops(1), followed by a newline. */ 21 | int main() 22 | { 23 | std::cout << measure_peak_gflops(1) << "\n"; 24 | } 25 | -------------------------------------------------------------------------------- /apps/loop_nest_bench.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #include "loop_nest_bench.hpp" 7 | #include "loop_nest_baseline.hpp" 8 | /* Runs loop_nest_bench on two hard-coded 2D-convolution loop nests: a plain NCHW layout and a channel-blocked layout with groups of 16 channels. */ 9 | int main() 10 | { 11 | 12 | using namespace dabun; 13 | 14 | // 2D convolution NCHW example: 15 | // O(c_out, o_h, o_w) = I(c_in, o_h + k_h, o_w + k_w) * K(c_out, c_in, 16 | // k_h, k_w) 17 | { 18 | int CIN = 64; 19 | int COUT = 64; 20 | int OS = 112; 21 | int KS = 3; 22 | int IS = OS + KS - 1; 23 | 24 | loop_nest_bench( 25 | {{"c_out", 16}, // 26 | {"o_h", 1}, 27 | {"o_w", 28}, 28 | {"c_in", 16}, 29 | {"c_in", 1}, 30 | {"c_out", 1}, // 31 | {"o_w", 1}, // 32 | {"k_h", 1}, // 33 | {"k_w", 1}}, // 34 | // The second argument is a map of the dimension sizes 35 | {{"c_out", COUT}, 36 | {"o_w", OS}, 37 | {"k_w", KS}, 38 | {"c_in", CIN}, 39 | {"o_h", OS}, 40 | {"k_h", KS}, 41 | {"i_w", IS}, 42 | {"i_h", IS}}, 43 | // Vars of C (other variables are reduction variables) 44 | {"c_out", "o_w", "o_h"}, 45 | // Variables of A, note that i_w and i_h are not used 46 | {"c_in", "i_w", "i_h"}, 47 | // Variables of B 48 | {"c_in", "c_out", "k_w", "k_h"}, 49 | // C's strides for each variable 50 | {{"o_w", 1}, {"c_out", OS * OS}, {"o_h", OS}}, 51 | // A's strides for each variable. Note how we 52 | // provide strides for o_h/k_h and o_w/k_w rather than 53 | // i_h/i_w: the access to A is based on output 54 | // and reduction variables 55 | {{"o_w", 1}, 56 | {"k_w", 1}, 57 | {"c_in", IS * IS}, 58 | {"o_h", IS}, 59 | {"k_h", IS}}, 60 | // B's strides for each variable 61 | {{"c_out", KS * KS * CIN}, 62 | {"c_in", KS * KS}, 63 | {"k_w", 1}, 64 | {"k_h", KS}}, 65 | /* NOTE(review): per the loop_nest_bench signature in loop_nest_bench.hpp this ninth argument is alpha, not max_unrolled_fmas -- confirm that is intended. */ 64); 66 | } 67 | 68 | // 2D convolution on NCHW16c layout example: 69 | // O(g_out, c_out, o_h, o_w) = I(g_in, c_in, o_h + k_h, o_w + k_w) * 70 | // K(g_in, g_out, c_in, c_out, k_h, k_w) 71 | // if (0) 72 | { 73 | int GIN = 128 / 16; 74 | int CIN = 16; 75 | int GOUT = 128 / 16; 76 | int COUT = 16; 77 | int OS = 56; 78 | int KS = 3; 79 | int IS = OS + KS - 1; 80 | 81 | loop_nest_bench( 82 | {{"g_out", 1}, // 83 | {"o_w", 28}, 84 | {"o_h", 1}, 85 | {"g_in", 
1}, 86 | {"c_in", 1}, 87 | {"o_w", 1}, // 88 | //{"o_w", 1}, // 89 | {"k_h", 1}, // 90 | {"k_w", 1}, // 91 | {"c_out", 1}}, // 92 | // The second argument is a map of the dimension sizes 93 | {{"g_out", GOUT}, 94 | {"c_out", COUT}, 95 | {"o_w", OS}, 96 | {"k_w", KS}, 97 | {"g_in", GIN}, 98 | {"c_in", CIN}, 99 | {"o_h", OS}, 100 | {"k_h", KS}, 101 | {"i_h", IS}, 102 | {"i_w", IS}}, 103 | // Vars of C (other variables are reduction variables) 104 | {"g_out", "c_out", "o_w", "o_h"}, 105 | // Variables of A, note that i_w and i_h are not used 106 | {"g_in", "c_in", "i_w", "i_h"}, 107 | // Variables of B 108 | {"g_out", "g_in", "c_in", "c_out", "k_w", "k_h"}, 109 | // C's strides for each variable 110 | {{"g_out", OS * OS * COUT}, 111 | {"o_h", OS * COUT}, 112 | {"o_w", COUT}, 113 | {"c_out", 1}}, 114 | // A's strides for each variable. Note how we 115 | // provide strides for o_h/k_h and o_w/k_w rather than 116 | // i_h/i_w: the access to A is based on output 117 | // and reduction variables 118 | {{"g_in", IS * IS * CIN}, 119 | {"o_h", IS * CIN}, 120 | {"k_h", IS * CIN}, 121 | {"o_w", CIN}, 122 | {"k_w", CIN}, 123 | {"c_in", 1}}, 124 | // B's strides for each variable 125 | {{"g_in", COUT * KS * KS * CIN * GOUT}, 126 | {"g_out", COUT * KS * KS * CIN}, 127 | {"c_in", COUT * KS * KS}, 128 | {"k_h", COUT * KS}, 129 | {"k_w", COUT}, 130 | {"c_out", 1}}); 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /apps/loop_nest_bench.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #pragma once 7 | 8 | #include "dabun/arithmetic_operation.hpp" 9 | #include "dabun/loop_nest.hpp" 10 | #include "dabun/random_vector.hpp" 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | namespace dabun 22 | { 23 | /* Benchmarks one JIT-compiled loop nest. order, sizes, the C/A/B formulas and strides, dabun::fma and max_unrolled_fmas are forwarded unchanged to loop_nest_code_generator. The generated kernel is saved to "zi.asm" and then timed with sysml::measure_fastest over total_iterations, each call being jit_fn(C, A, B, alpha) on buffers obtained from get_random_vector. Prints the operation count ("gflops: ") and the achieved rate ("GFLOPS: ") to std::cout; returns nothing. */ 24 | template 25 | void loop_nest_bench(std::vector> const& order, 26 | std::map const& sizes, 27 | std::set const& C_formula, 28 | std::set const& A_formula, 29 | std::set const& B_formula, 30 | std::map const& C_strides, 31 | std::map const& A_strides, 32 | std::map const& B_strides, int alpha = 0, 33 | int max_unrolled_fmas = 320, int total_iterations = 100) 34 | { 35 | std::int64_t C_size = 1; 36 | std::int64_t A_size = 1; 37 | std::int64_t B_size = 1; 38 | /* Starts at 2.0 (presumably one multiply plus one add per FMA -- confirm) and is scaled by dimension sizes in the loop below. */ 39 | double flops = 2.0; 40 | /* Each buffer gets 1 + sum((size - 1) * stride) elements over the dimensions it has a stride for. A dimension multiplies flops when it has a stride in any of C, A or B, so size entries without any stride do not count. */ 41 | for (auto const& s : sizes) 42 | { 43 | if (C_strides.count(s.first)) 44 | C_size += (s.second - 1) * C_strides.at(s.first); 45 | if (A_strides.count(s.first)) 46 | A_size += (s.second - 1) * A_strides.at(s.first); 47 | if (B_strides.count(s.first)) 48 | B_size += (s.second - 1) * B_strides.at(s.first); 49 | if (C_strides.count(s.first) || B_strides.count(s.first) || 50 | A_strides.count(s.first)) 51 | flops *= s.second; 52 | } 53 | 54 | auto A = get_random_vector(A_size); 55 | auto B = get_random_vector(B_size); 56 | auto CN = get_random_vector(C_size); 57 | /* Generate the kernel; alpha and total_iterations are not passed to the generator, only to the timed call below. */ 58 | auto jit_fn = loop_nest_code_generator( 59 | order, sizes, C_formula, A_formula, B_formula, C_strides, 60 | A_strides, B_strides, dabun::fma, max_unrolled_fmas) 61 | .get_shared(); 62 | /* NOTE(review): fixed file name relative to the working directory; presumably overwritten by every call -- confirm. */ 63 | jit_fn.save_to_file("zi.asm"); 64 | /* secs is whatever sysml::measure_fastest returns; presumably the fastest run time in seconds -- confirm. */ 65 | auto secs = sysml::measure_fastest( 66 | [&]() { jit_fn(CN.data(), A.data(), B.data(), alpha); }, 67 | total_iterations); 68 | /* Despite its name, gflops is the operation count in units of 1e9; the rate is printed on the second line as gflops / secs. */ 69 | double gflops = flops / 1000000000; 70 | 71 | std::cout << "gflops: " << gflops << "\n"; 72 | 73 | std::cout << "GFLOPS: " << (gflops / secs) << "\n"; 74 | } 75 | 76 | } // namespace dabun 77 | 
-------------------------------------------------------------------------------- /apps/loop_nest_stress_test.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #include "baselines.hpp" 7 | #include "loop_nest_baseline.hpp" 8 | 9 | #include "dabun/arithmetic_operation.hpp" 10 | #include "dabun/check.hpp" 11 | #include "dabun/isa.hpp" 12 | #include "dabun/loop_nest.hpp" 13 | #include "dabun/random_vector.hpp" 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | /* Fallbacks used when the build does not define the ISA, element type or vector extension on the command line. */ 27 | #ifndef DABUN_ISA 28 | #define DABUN_ISA avx2 29 | #endif 30 | 31 | #ifndef DABUN_ARITHMETIC 32 | #define DABUN_ARITHMETIC float 33 | #endif 34 | 35 | #ifndef DABUN_VEX 36 | #if defined(DABUN_ARCH_AARCH64) 37 | #define DABUN_VEX ::dabun::extension::neon 38 | #else 39 | #define DABUN_VEX ::dabun::extension::avx2 40 | #endif 41 | #endif 42 | /* Randomized matrix-multiply stress test. Each of the 1000000 rounds draws random extents ArCr, AcBr, BcCc and an unroll limit; then, for every permutation of the outer (hyper_order) and inner (order) loop lists, it compiles a loop nest, runs it and baseline_MM on the same inputs, and prints the maximum absolute difference. rand() is not seeded in this function, so presumably every run replays the same sequence -- confirm. */ 43 | int main() 44 | { 45 | using namespace dabun; 46 | 47 | using float_t = DABUN_ARITHMETIC; 48 | 49 | for (int rounds = 0; rounds < 1000000; ++rounds) 50 | { 51 | int ArCr = (1 << rand() % 10) + rand() % 16; 52 | int AcBr = (1 << rand() % 10) + rand() % 16; 53 | int BcCc = (1 << rand() % 10) + rand() % 16; 54 | int max_fmas_unrolled = 1 << (rand() % 10); 55 | /* Innermost loops: one step-1 entry per dimension. */ 56 | std::vector> order = { 57 | {"AcBr", 1}, {"BcCc", 1}, {"ArCr", 1}}; 58 | /* Outer loops: one entry per dimension with a random step in [2, extent + 1]. */ 59 | std::vector> hyper_order = { 60 | {"AcBr", (rand() % AcBr) + 2}, // It's OK to go oversize 61 | // (tests whether it's 62 | // handled appropriately) 63 | {"BcCc", (rand() % BcCc) + 2}, // - || - 64 | {"ArCr", (rand() % ArCr) + 2}}; // - || - 65 | /* Sorting first lets the do/while loops with std::next_permutation visit every ordering. */ 66 | std::sort(hyper_order.begin(), hyper_order.end()); 67 | 68 | do 69 | { 70 | std::sort(order.begin(), 
order.end()); 71 | do 72 | { 73 | auto full_order = hyper_order; 74 | full_order.insert(full_order.end(), order.begin(), order.end()); 75 | /* Print the combined order. Entries for the same dimension as the last (innermost) loop whose step is not 1 are first rounded up with round_up to a multiple derived from the ISA vector size and sizeof(float_t). */ 76 | std::cout << "DIF: ORDER: "; 77 | for (auto& o : full_order) 78 | { 79 | if (o.first == full_order.back().first) 80 | { 81 | if (o.second != 1) 82 | { 83 | o.second = round_up( 84 | o.second, 85 | isa_traits>::vector_size * 87 | 4 / sizeof(float_t)); 88 | } 89 | } 90 | std::cout << o.first << "=" << o.second << " "; 91 | } 92 | 93 | std::cout << "ArCr=" << ArCr << " "; 94 | std::cout << "AcBr=" << AcBr << " "; 95 | std::cout << "BcCc=" << BcCc << " "; 96 | 97 | std::cout << "MU=" << max_fmas_unrolled << std::endl; 98 | 99 | auto fn = 100 | loop_nest_compiler( 101 | full_order, // The second argument is a map of the 102 | // dimension sizes 103 | {{"AcBr", AcBr}, {"ArCr", ArCr}, {"BcCc", BcCc}}, 104 | // Vars of C (other variables are reduction variables) 105 | {"ArCr", "BcCc"}, 106 | // Variables of A 107 | {"ArCr", "AcBr"}, 108 | // Variables of B 109 | {"AcBr", "BcCc"}, 110 | // C's strides for each variable. 
111 | {{"ArCr", BcCc}, {"BcCc", 1}}, 112 | // A's strides for each variable 113 | {{"ArCr", AcBr}, {"AcBr", 1}}, 114 | // B's strides for each variable 115 | {{"AcBr", BcCc}, {"BcCc", 1}}, dabun::fma, 116 | max_fmas_unrolled, nullptr) 117 | .get_shared(); 118 | 119 | auto A = get_random_vector(AcBr * ArCr); 120 | auto B = get_random_vector(AcBr * BcCc); 121 | 122 | auto CN = get_random_vector(ArCr * BcCc); 123 | auto CJ = CN; 124 | /* CN receives the baseline_MM result; CJ starts as a copy of CN and receives the JIT result, so both start from the same contents. */ 125 | baseline_MM(ArCr, AcBr, BcCc, AcBr, 1, BcCc, 1, BcCc, 1, 126 | A.data(), B.data(), CN.data(), 1); 127 | 128 | // apply_relu(CN.data(), CN.data() + CN.size()); 129 | 130 | fn(CJ.data(), A.data(), B.data(), 1); 131 | 132 | auto madiff = max_abs_difference( 133 | CJ.data(), CJ.data() + ArCr * BcCc, CN.data()); 134 | 135 | std::cout << "MAXABSDIFF: " << madiff << std::endl; 136 | /* NOTE(review): the tolerance check below is commented out, so a mismatch is only visible in the printed MAXABSDIFF value. */ 137 | // assert(madiff < 0.001); 138 | } while (std::next_permutation(order.begin(), order.end())); 139 | } while (std::next_permutation(hyper_order.begin(), hyper_order.end())); 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /apps/loop_nest_tensillica.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #include 7 | 8 | #include 9 | 10 | #include "dabun/check.hpp" 11 | #include "dabun/random_vector.hpp" 12 | #include "dabun/tensillica/loop_nest.hpp" 13 | 14 | #include "baselines.hpp" 15 | 16 | int main() 17 | { 18 | using namespace dabun; 19 | 20 | // Playing with weird schedules 21 | // Matrix-Matrix product 22 | // C(r, c) = A(r, k) * B(k, c) 23 | // if (0) 24 | { 25 | int ArCr = 324; 26 | int AcBr = 124; 27 | int BcCc = 54; 28 | 29 | auto gen_loop_nest = [&]() 30 | { 31 | return dabun::tensillica::loop_nest_code_generator( 32 | // The first argument is the loop order in the form of 33 | // {dimension, stride}. For now the outer dimension 34 | // has to divide the stride. This is effectively the 35 | // same as Halide's split into outer and inner 36 | // variable, but can have arbitray number of splits. 37 | { {"ArCr", 6}, 38 | {"BcCc", 16}, 39 | {"AcBr", 4}, 40 | {"AcBr", 1}, 41 | {"ArCr", 1}, 42 | {"BcCc", 1}}, 43 | // The second argument is a map of the dimension sizes 44 | {{"AcBr", AcBr}, {"ArCr", ArCr}, {"BcCc", BcCc}}, 45 | // Vars of C (other variables are reduction variables) 46 | {"ArCr", "BcCc"}, 47 | // Variables of A 48 | {"ArCr", "AcBr"}, 49 | // Variables of B 50 | {"AcBr", "BcCc"}, 51 | // C's strides for each variable. Note that the 52 | // strides data is a superset of the previous argument 53 | // (variables of C). I'm still deciding on the final 54 | // design, possibly allowing for null strides that 55 | // will just deduce them from the sizes, or some 56 | // special structs indicating the layout (ie 57 | // row-major, col-major). In this case the vars have 58 | // to be ordered though... Many decisions to make... 
59 | {{"ArCr", BcCc}, {"BcCc", 1}}, 60 | // A's strides for each variable 61 | {{"ArCr", AcBr}, {"AcBr", 1}}, 62 | // B's strides for each variable 63 | {{"AcBr", BcCc}, {"BcCc", 1}}, nullptr) 64 | .get_shared(); 65 | }; 66 | 67 | auto compile_secs = sysml::measure_fastest(gen_loop_nest, 1); 68 | std::cout << "Compile: " << compile_secs << std::endl; 69 | 70 | auto fn = gen_loop_nest(); 71 | // fn.save_to_file("zi.asm"); 72 | // fn.register_perf("fn1"); 73 | 74 | auto A = get_random_vector(AcBr * ArCr + 1024); 75 | auto B = get_random_vector(AcBr * BcCc + 1024); 76 | 77 | auto CN = get_random_vector(ArCr * BcCc + 1024); 78 | auto CJ = CN; 79 | 80 | baseline_MM(ArCr, AcBr, BcCc, AcBr, BcCc, BcCc, A.data(), B.data(), 81 | CN.data(), 1); 82 | 83 | fn(CJ.data(), A.data(), B.data(), 1); 84 | // apply_relu(CN.data(), CN.data() + CN.size()); 85 | 86 | std::cout << "MAXABSDIFF: " 87 | << max_abs_difference(CJ.data(), CJ.data() + ArCr * BcCc, 88 | CN.data()) 89 | << "\n"; 90 | 91 | auto secs = sysml::measure_fastest( 92 | [&]() { fn(CJ.data(), A.data(), B.data(), 0); }, 10); 93 | 94 | double gflops = 1.0 * AcBr * ArCr * BcCc * 2 / 1000000000; 95 | 96 | std::cout << "GFLOPS: " << (gflops / secs) << "\n"; 97 | 98 | // bench_implementation_fmas_per_cycle( 99 | // fn, AcBr * ArCr, AcBr * BcCc, ArCr * BcCc, 100 | // 1.0 * AcBr * ArCr * BcCc * 2, 10, 10); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /apps/loop_nest_test.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #pragma once 7 | 8 | #include "dabun/arithmetic_operation.hpp" 9 | #include "dabun/check.hpp" 10 | #include "dabun/loop_nest.hpp" 11 | #include "dabun/random_vector.hpp" 12 | #include "loop_nest_baseline.hpp" 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | namespace dabun 22 | { 23 | 24 | template 25 | void test_loop_nest_against_slow_baseline( 26 | std::vector> const& order, 27 | std::map const& sizes, 28 | std::set const& C_formula, 29 | std::set const& A_formula, 30 | std::set const& B_formula, 31 | std::map const& C_strides, 32 | std::map const& A_strides, 33 | std::map const& B_strides, int max_unrolled_fmas = 512, 34 | int alpha = 1) 35 | { 36 | std::int64_t C_size = 1; 37 | std::int64_t A_size = 1; 38 | std::int64_t B_size = 1; 39 | 40 | alpha = alpha ? 1 : 0; 41 | 42 | for (auto const& s : sizes) 43 | { 44 | if (C_strides.count(s.first)) 45 | C_size += (s.second - 1) * C_strides.at(s.first); 46 | if (A_strides.count(s.first)) 47 | A_size += (s.second - 1) * A_strides.at(s.first); 48 | if (B_strides.count(s.first)) 49 | B_size += (s.second - 1) * B_strides.at(s.first); 50 | } 51 | 52 | auto A = get_random_vector(A_size); 53 | auto B = get_random_vector(B_size); 54 | auto CN = get_random_vector(C_size); 55 | auto CJ = CN; 56 | 57 | auto jit_fn = loop_nest_code_generator( 58 | order, sizes, C_formula, A_formula, B_formula, C_strides, 59 | A_strides, B_strides, fma, max_unrolled_fmas) 60 | .get_shared(); 61 | 62 | jit_fn.save_to_file("zi.asm"); 63 | 64 | auto baseline_fn = 65 | loop_nest_baseline(order, sizes, C_formula, A_formula, B_formula, 66 | C_strides, A_strides, B_strides, false); 67 | 68 | jit_fn(CJ.data(), A.data(), B.data(), alpha); 69 | baseline_fn(CN.data(), A.data(), B.data(), alpha); 70 | 71 | std::cout << "MAXABSDIFF: ( " << C_size << " ) " 72 | << max_abs_difference(CJ.data(), CJ.data() + C_size, CN.data()) 73 | << "\n"; 74 | } 75 | 76 | } // namespace dabun 77 | 
-------------------------------------------------------------------------------- /apps/serialization.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "dabun/serialization.hpp" 12 | 13 | int main() 14 | { 15 | int ArCr = 256; 16 | int AcBr = 256; 17 | int BcCc = 256; 18 | auto s = dabun::serialized_loop_nest_inputs( 19 | // The first argument is the loop order in the form of 20 | // {dimension, stride}. For now the outer dimension 21 | // has to divide the stride. This is effectively the 22 | // same as Halide's split into outer and inner 23 | // variable, but can have arbitray number of splits. 24 | {{"AcBr", 256}, 25 | {"ArCr", 3}, 26 | {"BcCc", 16}, 27 | {"AcBr", 1}, 28 | {"AcBr", 1}, 29 | {"ArCr", 1}, 30 | {"BcCc", 1}}, 31 | // The second argument is a map of the dimension sizes 32 | {{"AcBr", AcBr}, {"ArCr", ArCr}, {"BcCc", BcCc}}, 33 | // Vars of C (other variables are reduction variables) 34 | {"ArCr", "BcCc"}, 35 | // Variables of A 36 | {"ArCr", "AcBr"}, 37 | // Variables of B 38 | {"AcBr", "BcCc"}, 39 | // C's strides for each variable. Note that the 40 | // strides data is a superset of the previous argument 41 | // (variables of C). I'm still deciding on the final 42 | // design, possibly allowing for null strides that 43 | // will just deduce them from the sizes, or some 44 | // special structs indicating the layout (ie 45 | // row-major, col-major). In this case the vars have 46 | // to be ordered though... Many decisions to make... 
47 | {{"ArCr", BcCc}, {"BcCc", 1}}, 48 | // A's strides for each variable 49 | {{"ArCr", AcBr}, {"AcBr", 1}}, 50 | // B's strides for each variable 51 | {{"AcBr", BcCc}, {"BcCc", 1}}, 1024); 52 | auto str_rep = s.str(); 53 | std::cout << str_rep << std::endl; 54 | 55 | auto s2 = 56 | dabun::serialized_loop_nest_inputs::from_str(str_rep); 57 | auto str_rep2 = s2.str(); 58 | std::cout << str_rep2 << std::endl; 59 | 60 | std::ofstream out("jose_test.txt"); 61 | out << str_rep2; 62 | out.close(); 63 | 64 | auto s3 = dabun::serialized_loop_nest_inputs::from_file( 65 | "jose_test.txt"); 66 | auto str_rep3 = s3.str(); 67 | std::cout << str_rep3 << std::endl; 68 | } 69 | -------------------------------------------------------------------------------- /apps/tensillica_play.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #include "dabun/tensillica/peak_gflops.hpp" 7 | #include "sysml/code_generator/code_generated_fn.hpp" 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | 17 | int main() 18 | { 19 | std::cout << "zi"; 20 | std::cout << std::endl; 21 | 22 | auto fn = dabun::tensillica::peak_gflops().get_shared(); 23 | 24 | if (fn) 25 | { 26 | 27 | float in[20] = {0.5f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 28 | 0.5f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f}; 29 | float out[10] = {0.f}; 30 | 31 | std::cout << fn(in, out, dabun::tensillica::dl_func_arg_cast(3.14f)) << "\n"; 32 | 33 | for (int i = 0; i < 10; ++i) 34 | { 35 | std::cout << out[i] << " --------------------\n"; 36 | } 37 | } 38 | else 39 | { 40 | std::cout << "Can't get fn_ptr\n"; 41 | } 42 | 43 | // dlclose(dlh); 44 | } 45 | -------------------------------------------------------------------------------- /apps/transposer.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #include "dabun/transposer.hpp" 7 | #include "dabun/check.hpp" 8 | #include "dabun/random_vector.hpp" 9 | #include "transposer_baseline.hpp" 10 | #include "transposer_bench.hpp" 11 | 12 | #ifndef DABUN_ARITHMETIC 13 | #define DABUN_ARITHMETIC float 14 | #endif 15 | 16 | int main() 17 | { 18 | using float_t = DABUN_ARITHMETIC; 19 | 20 | using namespace dabun; 21 | 22 | // for (int i = 0; i < 10; ++i) 23 | { 24 | 25 | int R = 11; 26 | int C = 13; 27 | 28 | auto A = get_random_vector(R * C); 29 | auto B = get_random_vector(R * C); 30 | auto BJ = get_random_vector(R * C); 31 | 32 | // ArCr=12 AcBr=6 ORDER: ArCr,12 :: AcBr,5 :: AcBr,4 :: ArCr,4 :: AcBr,1 33 | // :: ArCr,1 :: MU=32 34 | 35 | auto transpose = transposer_baseline( 36 | // Order 37 | {{"C", 13}, {"R", 16}, {"C", 9}, {"R", 16}, {"C", 1}, {"R", 1}}, 38 | // Sizes 39 | {{"R", R}, {"C", C}}, 40 | // Out Strides 41 | {{"R", C}, {"C", 1}}, 42 | // In Strides 43 | {{"R", 1}, {"C", R}}); 44 | 45 | auto transpose_jit = 46 | transposer_compiler( 47 | {{"C", 13}, {"R", 16}, {"C", 9}, {"R", 16}, {"C", 1}, {"R", 1}}, 48 | // Sizes 49 | {{"R", R}, {"C", C}}, 50 | // Out Strides 51 | {{"R", C}, {"C", 1}}, 52 | // In Strides 53 | {{"R", 1}, {"C", R}}, 32) 54 | .get_shared(); 55 | 56 | transpose(B.data(), A.data()); 57 | 58 | transpose_jit.save_to_file("zi.asm"); 59 | 60 | transpose_jit(BJ.data(), A.data()); 61 | 62 | std::cout << "MAXABSDIFF: " 63 | << max_abs_difference(BJ.data(), BJ.data() + R * C, B.data()) 64 | << "\n"; 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /apps/transposer_baseline.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 

#pragma once

// NOTE(review): this file was recovered from a flattened dump that lost
// everything written between angle brackets. The header names below and the
// container element types in the signature are reconstructed from how the
// code uses them -- confirm against the upstream file.
#include <cstddef>
#include <functional>
#include <map>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

namespace dabun
{

// Builds a slow reference implementation of a strided copy / transpose.
//
//   order        loop schedule as {dimension, step} pairs, outermost first.
//                A dimension may appear several times (tiling); at the
//                innermost level of the schedule a single element is copied.
//   sizes        extent of every dimension
//   out_strides  element stride of each dimension in the output buffer
//   in_strides   element stride of each dimension in the input buffer
//                (a dimension missing from a stride map has stride 0)
//
// Returns a callable `void(Arithmetic* out, Arithmetic const* in)` that walks
// the schedule and copies `in` to `out`. A step that does not divide the
// remaining extent gets one extra, shorter iteration for the remainder.
//
// Throws std::invalid_argument when `order` names a dimension that has no
// entry in `sizes`, or contains a step smaller than 1. Previously an unknown
// dimension was silently treated as the first dimension of `sizes`, and a
// zero step divided by zero.
template <class Arithmetic>
auto transposer_baseline(std::vector<std::pair<std::string, int>> const& order,
                         std::map<std::string, int> const&               sizes,
                         std::map<std::string, int> const& out_strides,
                         std::map<std::string, int> const& in_strides)
{
    // Replace the names by dense ids once, so the copy loop needs no map
    // lookups. Ids follow the iteration order of `sizes`.
    std::map<std::string, int> var_to_id;
    std::vector<int>           initial_limits;
    initial_limits.reserve(sizes.size());

    for (auto const& s : sizes)
    {
        var_to_id[s.first] = static_cast<int>(initial_limits.size());
        initial_limits.push_back(s.second);
    }

    auto const stride_of =
        [](std::map<std::string, int> const& strides, std::string const& name)
    {
        auto const it = strides.find(name);
        return it == strides.end() ? 0 : it->second;
    };

    std::vector<int> order_ids;
    std::vector<int> order_delta;
    std::vector<int> order_in_strides;
    std::vector<int> order_out_strides;

    order_ids.reserve(order.size());
    order_delta.reserve(order.size());
    order_in_strides.reserve(order.size());
    order_out_strides.reserve(order.size());

    for (auto const& step : order)
    {
        auto const id = var_to_id.find(step.first);
        if (id == var_to_id.end())
        {
            throw std::invalid_argument(
                "transposer_baseline: dimension '" + step.first +
                "' appears in the order but has no size");
        }
        if (step.second < 1)
        {
            throw std::invalid_argument(
                "transposer_baseline: step of dimension '" + step.first +
                "' must be at least 1");
        }

        order_ids.push_back(id->second);
        order_delta.push_back(step.second);

        // Pointer advance per iteration of this loop level.
        order_in_strides.push_back(stride_of(in_strides, step.first) *
                                   step.second);
        order_out_strides.push_back(stride_of(out_strides, step.first) *
                                    step.second);
    }

    return [initial_limits    = std::move(initial_limits),
            order_ids         = std::move(order_ids),
            order_delta       = std::move(order_delta),
            order_in_strides  = std::move(order_in_strides),
            order_out_strides = std::move(order_out_strides)](
               Arithmetic* out_ptr, Arithmetic const* in_ptr)
    {
        // Remaining extent of every dimension at the current nesting level.
        auto limits = initial_limits;

        // The lambda receives itself as `self` so it can recurse without the
        // indirection of std::function.
        auto const copy_level = [&](auto const& self, Arithmetic* out,
                                    Arithmetic const* in,
                                    std::size_t       depth) -> void
        {
            if (depth == order_ids.size())
            {
                out[0] = in[0];
                return;
            }

            auto const var   = order_ids[depth];
            auto const delta = order_delta[depth];
            auto const limit = limits[var];
            auto const full  = limit / delta;
            auto const rest  = limit % delta;

            // Full tiles: inner levels see an extent of exactly `delta`.
            limits[var] = delta;
            for (int i = 0; i < full; ++i)
            {
                self(self, out, in, depth + 1);
                in += order_in_strides[depth];
                out += order_out_strides[depth];
            }

            // Remainder tile: inner levels see the leftover extent.
            if (rest)
            {
                limits[var] = rest;
                self(self, out, in, depth + 1);
            }

            limits[var] = limit;
        };

        copy_level(copy_level, out_ptr, in_ptr, 0);
    };
}

} // namespace dabun
// --------------------------------------------------------------------------------
// /apps/transposer_bench.hpp:
// --------------------------------------------------------------------------------
// Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
5 | 6 | #include "dabun/transposer.hpp" 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | namespace dabun 23 | { 24 | 25 | template 26 | void transposer_bench(std::vector> const& order, 27 | std::map const& sizes, 28 | std::map const& out_strides, 29 | std::map const& in_strides, 30 | int max_unrolled_fmas = 320, int total_iterations = 100) 31 | 32 | { 33 | auto total_moved_bytes = 34 | std::accumulate(sizes.begin(), sizes.end(), 1, 35 | [](auto a, auto b) { return a * b.second; }) * 36 | 4; 37 | 38 | std::int64_t in_size = 1; 39 | std::int64_t out_size = 1; 40 | 41 | for (auto const& s : sizes) 42 | { 43 | in_size += (s.second - 1) * in_strides.at(s.first); 44 | out_size += (s.second - 1) * out_strides.at(s.first); 45 | } 46 | 47 | auto A = get_random_vector(in_size); 48 | auto B = get_random_vector(out_size); 49 | 50 | auto jit_fn = transposer_code_generator(order, sizes, out_strides, 51 | in_strides, max_unrolled_fmas) 52 | .get_unique(); 53 | 54 | jit_fn.save_to_file("zi.asm"); 55 | 56 | auto secs = sysml::measure_fastest([&]() { jit_fn(B.data(), A.data()); }, 57 | total_iterations); 58 | 59 | double moved_gbytes = 1.0 * total_moved_bytes / 1000000000; 60 | 61 | std::cout << "GBPS: " << (moved_gbytes / secs) << "\n"; 62 | std::cout << "MSEC: " << (secs / 1000) << "\n"; 63 | } 64 | 65 | } // namespace dabun 66 | -------------------------------------------------------------------------------- /apps/transposer_stress_test.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #include "dabun/check.hpp" 7 | #include "dabun/random_vector.hpp" 8 | #include "dabun/transposer.hpp" 9 | #include "transposer_baseline.hpp" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #ifndef DABUN_ARITHMETIC 24 | #define DABUN_ARITHMETIC float 25 | #endif 26 | 27 | int main() 28 | { 29 | using namespace dabun; 30 | 31 | using float_t = DABUN_ARITHMETIC; 32 | 33 | srand(0); 34 | 35 | for (int rounds = 0; rounds < 1000000; ++rounds) 36 | { 37 | int ArCr = (1 << rand() % 2) + rand() % 16; 38 | int AcBr = (1 << rand() % 2) + rand() % 16; 39 | int max_unrolled = 1 << (rand() % 8); 40 | 41 | std::vector> order = {{"AcBr", 1}, 42 | {"ArCr", 1}}; 43 | 44 | std::vector> hyper_order = { 45 | {"AcBr", (rand() % AcBr) + 2}, // It's OK to go oversize 46 | // (tests whether it's 47 | // handled appropriately) 48 | {"ArCr", (rand() % ArCr) + 2}, // - || - 49 | {"AcBr", (rand() % AcBr) + 2}, // - || - 50 | {"ArCr", (rand() % ArCr) + 2}}; // - || - 51 | 52 | std::sort(hyper_order.begin(), hyper_order.end(), 53 | [](auto a, auto b) { return a.second > b.second; }); 54 | 55 | { 56 | auto full_order = hyper_order; 57 | full_order.insert(full_order.end(), order.begin(), order.end()); 58 | 59 | std::cout << "DIF: ORDER: "; 60 | for (auto& o : full_order) 61 | { 62 | if (o.first == full_order.back().first) 63 | { 64 | if (o.second != 1) 65 | { 66 | o.second = round_up(o.second, 67 | isa_traits::vector_size * 68 | 4 / sizeof(float_t)); 69 | } 70 | } 71 | std::cout << o.first << "=" << o.second << " "; 72 | } 73 | 74 | std::cout << "ArCr=" << ArCr << " "; 75 | std::cout << "AcBr=" << AcBr << " "; 76 | 77 | std::cout << "MU=" << max_unrolled << std::endl; 78 | 79 | auto fn_baselome = transposer_baseline( 80 | full_order, // The second argument is a 81 | // map of the dimension sizes 82 | {{"AcBr", AcBr}, {"ArCr", ArCr}}, 83 | // out's strides for each 
variable. 84 | {{"ArCr", AcBr}, {"AcBr", 1}}, 85 | // in's strides for each variable 86 | {{"ArCr", 1}, {"AcBr", ArCr}}); 87 | 88 | auto fn = transposer_compiler( 89 | full_order, // The second argument is a map of the 90 | // dimension sizes 91 | {{"AcBr", AcBr}, {"ArCr", ArCr}}, 92 | // out's strides for each variable. 93 | {{"ArCr", AcBr}, {"AcBr", 1}}, 94 | // in's strides for each variable 95 | {{"ArCr", 1}, {"AcBr", ArCr}}, max_unrolled) 96 | .get_shared(); 97 | 98 | fn.save_to_file("zi.asm"); 99 | 100 | auto A = get_random_vector(AcBr * ArCr); 101 | auto CN = get_random_vector(ArCr * AcBr); 102 | auto CJ = CN; 103 | 104 | fn_baselome(CN.data(), A.data()); 105 | fn(CJ.data(), A.data()); 106 | 107 | auto madiff = max_abs_difference(CJ.data(), CJ.data() + ArCr * AcBr, 108 | CN.data()); 109 | 110 | std::cout << "ArCr=" << ArCr << " "; 111 | std::cout << "AcBr=" << AcBr << " "; 112 | 113 | std::cout << "ORDER: "; 114 | 115 | for (auto const& o : full_order) 116 | { 117 | std::cout << o.first << ',' << o.second << " :: "; 118 | } 119 | 120 | std::cout << "\n"; 121 | 122 | std::cout << "MU=" << max_unrolled << std::endl; 123 | std::cout << "MAXABSDIFF: " << madiff << std::endl; 124 | 125 | assert(madiff < 0.000001); 126 | } 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /assets/logo/fulllogo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/fulllogo.jpg -------------------------------------------------------------------------------- /assets/logo/fulllogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/fulllogo.png -------------------------------------------------------------------------------- 
/assets/logo/fulllogo_nobuffer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/fulllogo_nobuffer.jpg -------------------------------------------------------------------------------- /assets/logo/fulllogo_nobuffer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/fulllogo_nobuffer.png -------------------------------------------------------------------------------- /assets/logo/fulllogo_transparent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/fulllogo_transparent.png -------------------------------------------------------------------------------- /assets/logo/fulllogo_transparent_nobuffer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/fulllogo_transparent_nobuffer.png -------------------------------------------------------------------------------- /assets/logo/grayscale.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/grayscale.png -------------------------------------------------------------------------------- /assets/logo/grayscale_nobuffer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/grayscale_nobuffer.png 
-------------------------------------------------------------------------------- /assets/logo/grayscale_transparent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/grayscale_transparent.png -------------------------------------------------------------------------------- /assets/logo/grayscale_transparent_nobuffer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/grayscale_transparent_nobuffer.png -------------------------------------------------------------------------------- /assets/logo/icononly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/icononly.png -------------------------------------------------------------------------------- /assets/logo/icononly_nobuffer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/icononly_nobuffer.png -------------------------------------------------------------------------------- /assets/logo/icononly_transparent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/icononly_transparent.png -------------------------------------------------------------------------------- /assets/logo/icononly_transparent_nobuffer.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/icononly_transparent_nobuffer.png -------------------------------------------------------------------------------- /assets/logo/icononly_transparent_nobuffer_padded.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/icononly_transparent_nobuffer_padded.png -------------------------------------------------------------------------------- /assets/logo/print.eps: -------------------------------------------------------------------------------- 1 | %!PS-Adobe-3.0 EPSF-3.0 2 | %%Creator: cairo 1.16.0 (https://cairographics.org) 3 | %%CreationDate: Fri Sep 24 21:07:31 2021 4 | %%Pages: 1 5 | %%DocumentData: Clean7Bit 6 | %%LanguageLevel: 2 7 | %%BoundingBox: 0 0 960 768 8 | %%EndComments 9 | %%BeginProlog 10 | 50 dict begin 11 | /q { gsave } bind def 12 | /Q { grestore } bind def 13 | /cm { 6 array astore concat } bind def 14 | /w { setlinewidth } bind def 15 | /J { setlinecap } bind def 16 | /j { setlinejoin } bind def 17 | /M { setmiterlimit } bind def 18 | /d { setdash } bind def 19 | /m { moveto } bind def 20 | /l { lineto } bind def 21 | /c { curveto } bind def 22 | /h { closepath } bind def 23 | /re { exch dup neg 3 1 roll 5 3 roll moveto 0 rlineto 24 | 0 exch rlineto 0 rlineto closepath } bind def 25 | /S { stroke } bind def 26 | /f { fill } bind def 27 | /f* { eofill } bind def 28 | /n { newpath } bind def 29 | /W { clip } bind def 30 | /W* { eoclip } bind def 31 | /BT { } bind def 32 | /ET { } bind def 33 | /BDC { mark 3 1 roll /BDC pdfmark } bind def 34 | /EMC { mark /EMC pdfmark } bind def 35 | /cairo_store_point { /cairo_point_y exch def /cairo_point_x exch def } def 36 | /Tj { show currentpoint cairo_store_point } bind def 37 | /TJ { 38 | { 39 | dup 40 | type /stringtype eq 41 | { show } { -0.001 mul 0 
cairo_font_matrix dtransform rmoveto } ifelse 42 | } forall 43 | currentpoint cairo_store_point 44 | } bind def 45 | /cairo_selectfont { cairo_font_matrix aload pop pop pop 0 0 6 array astore 46 | cairo_font exch selectfont cairo_point_x cairo_point_y moveto } bind def 47 | /Tf { pop /cairo_font exch def /cairo_font_matrix where 48 | { pop cairo_selectfont } if } bind def 49 | /Td { matrix translate cairo_font_matrix matrix concatmatrix dup 50 | /cairo_font_matrix exch def dup 4 get exch 5 get cairo_store_point 51 | /cairo_font where { pop cairo_selectfont } if } bind def 52 | /Tm { 2 copy 8 2 roll 6 array astore /cairo_font_matrix exch def 53 | cairo_store_point /cairo_font where { pop cairo_selectfont } if } bind def 54 | /g { setgray } bind def 55 | /rg { setrgbcolor } bind def 56 | /d1 { setcachedevice } bind def 57 | /cairo_data_source { 58 | CairoDataIndex CairoData length lt 59 | { CairoData CairoDataIndex get /CairoDataIndex CairoDataIndex 1 add def } 60 | { () } ifelse 61 | } def 62 | /cairo_flush_ascii85_file { cairo_ascii85_file status { cairo_ascii85_file flushfile } if } def 63 | /cairo_image { image cairo_flush_ascii85_file } def 64 | /cairo_imagemask { imagemask cairo_flush_ascii85_file } def 65 | %%EndProlog 66 | %%BeginSetup 67 | %%EndSetup 68 | %%Page: 1 1 69 | %%BeginPageSetup 70 | %%PageBoundingBox: 0 0 960 768 71 | %%EndPageSetup 72 | q 0 0 960 768 rectclip 73 | 1 0 0 -1 0 768 cm q 74 | 0 g 75 | 0 0 960 768 rectfill 76 | 0.6 0.831373 0.12549 rg 77 | 291.887 406.699 m 433.641 347.023 l 433.641 207.727 l 251.422 168.02 l 78 | 291.887 406.699 l f* 79 | 0.176471 0.478431 0.784314 rg 80 | 433.641 126.5 m 280.879 157.719 l 433.641 191.004 l h 81 | 433.641 126.5 m f* 82 | 0 0.678431 0.937255 rg 83 | 526.543 384.84 m 526.543 490.191 l 653.219 420.633 l h 84 | 526.543 384.84 m f* 85 | 1 0.94902 0 rg 86 | 442.328 361.039 m 307.781 417.68 l 510.195 492.281 l 510.195 380.215 l 87 | 442.328 361.039 l f* 88 | 0.662745 0.231373 0.639216 rg 89 | 526.539 
367.879 m 668.094 407.879 l 708.578 169.082 l 526.539 223.898 l 90 | 526.539 367.879 l f* 91 | 0.956863 0.447059 0.0862745 rg 92 | 449.996 211.293 m 449.996 346.25 l 510.188 363.258 l 510.188 224.41 l 449.996 93 | 211.293 l f* 94 | 0.929412 0.109804 0.141176 rg 95 | 681.43 160.199 m 449.988 125.977 l 449.988 194.57 l 518.039 209.402 l 681.43 96 | 160.199 l f* 97 | 0.968627 0.933333 0.0156863 rg 98 | 357.324 624.074 m 330.984 624.074 l 330.984 551.641 l 357.324 551.641 l 99 | 381.297 551.641 393.285 563.051 393.285 585.875 c 393.285 611.34 381.297 100 | 624.074 357.324 624.074 c h 101 | 340.348 559.234 m 340.348 616.48 l 357.324 616.48 l 375.047 616.48 383.906 102 | 606.277 383.906 585.875 c 383.906 568.113 375.047 559.234 357.324 559.234 103 | c h 104 | 403.414 608.023 m 403.414 597.109 410.609 591.652 425 591.652 c 429.621 105 | 591.652 434.246 591.988 438.875 592.66 c 438.875 587.648 l 438.875 581.539 106 | 434.098 578.484 424.539 578.484 c 419.062 578.484 413.457 579.328 407.723 107 | 581.02 c 407.723 573.41 l 413.457 571.73 419.062 570.891 424.539 570.891 108 | c 440.34 570.891 448.242 576.395 448.242 587.402 c 448.242 624.074 l 442.98 109 | 624.074 l 439.738 618.801 l 434.438 622.316 428.66 624.074 422.406 624.074 110 | c 409.746 624.074 403.414 618.723 403.414 608.023 c h 111 | 412.781 607.863 m 412.781 613.605 415.988 616.48 422.406 616.48 c 428.891 112 | 616.48 434.379 614.754 438.875 611.309 c 438.875 600.27 l 434.246 599.586 113 | 429.621 599.246 425 599.246 c 416.855 599.246 412.781 602.117 412.781 607.863 114 | c h 115 | 463.441 621.898 m 463.441 551.641 l 472.809 551.641 l 472.809 573.41 l 116 | 476.832 571.73 481.141 570.891 485.734 570.891 c 502.551 570.891 510.961 117 | 579.395 510.961 596.406 c 510.961 614.852 502.129 624.074 484.465 624.074 118 | c 476.828 624.074 469.82 623.348 463.441 621.898 c h 119 | 472.809 581.727 m 472.809 614.809 l 475.883 615.789 479.613 616.277 484.004 120 | 616.277 c 495.828 616.277 501.742 609.539 501.742 596.062 c 
501.742 584.68 121 | 496.387 578.988 485.676 578.988 c 480.812 578.988 476.527 579.902 472.809 122 | 581.727 c h 123 | 523.629 604.262 m 523.629 570.891 l 532.992 570.891 l 532.992 604.418 l 124 | 532.992 612.324 536.91 616.277 544.75 616.277 c 550.121 616.277 555.234 125 | 614.031 560.094 609.535 c 560.094 570.891 l 569.461 570.891 l 569.461 624.074 126 | l 563.035 624.074 l 561.406 617.285 l 554.961 621.812 548.477 624.074 541.957 127 | 624.074 c 529.738 624.074 523.629 617.469 523.629 604.262 c h 128 | 594.027 624.074 m 584.664 624.074 l 584.664 570.891 l 590.988 570.891 l 129 | 592.66 577.676 l 598.098 573.152 604.074 570.891 610.598 570.891 c 623.875 130 | 570.891 630.512 577.488 630.512 590.688 c 630.512 624.074 l 621.129 624.074 131 | l 621.129 590.543 l 621.129 582.637 617.195 578.688 609.328 578.688 c 603.961 132 | 578.688 598.859 580.934 594.027 585.43 c h 133 | 594.027 624.074 m f 134 | Q Q 135 | showpage 136 | %%Trailer 137 | end 138 | %%EOF 139 | -------------------------------------------------------------------------------- /assets/logo/print.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/print.pdf -------------------------------------------------------------------------------- /assets/logo/print.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Created with Fabric.js 4.4.0 5 | 6 | 7 | 10 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /assets/logo/print_transparent.eps: -------------------------------------------------------------------------------- 1 | %!PS-Adobe-3.0 EPSF-3.0 2 | %%Creator: cairo 1.16.0 (https://cairographics.org) 3 | %%CreationDate: Fri Sep 24 21:07:36 2021 4 | %%Pages: 1 5 | %%DocumentData: Clean7Bit 6 | %%LanguageLevel: 2 7 | %%BoundingBox: 251 143 709 643 8 | %%EndComments 
9 | %%BeginProlog 10 | 50 dict begin 11 | /q { gsave } bind def 12 | /Q { grestore } bind def 13 | /cm { 6 array astore concat } bind def 14 | /w { setlinewidth } bind def 15 | /J { setlinecap } bind def 16 | /j { setlinejoin } bind def 17 | /M { setmiterlimit } bind def 18 | /d { setdash } bind def 19 | /m { moveto } bind def 20 | /l { lineto } bind def 21 | /c { curveto } bind def 22 | /h { closepath } bind def 23 | /re { exch dup neg 3 1 roll 5 3 roll moveto 0 rlineto 24 | 0 exch rlineto 0 rlineto closepath } bind def 25 | /S { stroke } bind def 26 | /f { fill } bind def 27 | /f* { eofill } bind def 28 | /n { newpath } bind def 29 | /W { clip } bind def 30 | /W* { eoclip } bind def 31 | /BT { } bind def 32 | /ET { } bind def 33 | /BDC { mark 3 1 roll /BDC pdfmark } bind def 34 | /EMC { mark /EMC pdfmark } bind def 35 | /cairo_store_point { /cairo_point_y exch def /cairo_point_x exch def } def 36 | /Tj { show currentpoint cairo_store_point } bind def 37 | /TJ { 38 | { 39 | dup 40 | type /stringtype eq 41 | { show } { -0.001 mul 0 cairo_font_matrix dtransform rmoveto } ifelse 42 | } forall 43 | currentpoint cairo_store_point 44 | } bind def 45 | /cairo_selectfont { cairo_font_matrix aload pop pop pop 0 0 6 array astore 46 | cairo_font exch selectfont cairo_point_x cairo_point_y moveto } bind def 47 | /Tf { pop /cairo_font exch def /cairo_font_matrix where 48 | { pop cairo_selectfont } if } bind def 49 | /Td { matrix translate cairo_font_matrix matrix concatmatrix dup 50 | /cairo_font_matrix exch def dup 4 get exch 5 get cairo_store_point 51 | /cairo_font where { pop cairo_selectfont } if } bind def 52 | /Tm { 2 copy 8 2 roll 6 array astore /cairo_font_matrix exch def 53 | cairo_store_point /cairo_font where { pop cairo_selectfont } if } bind def 54 | /g { setgray } bind def 55 | /rg { setrgbcolor } bind def 56 | /d1 { setcachedevice } bind def 57 | /cairo_data_source { 58 | CairoDataIndex CairoData length lt 59 | { CairoData CairoDataIndex get /CairoDataIndex 
CairoDataIndex 1 add def } 60 | { () } ifelse 61 | } def 62 | /cairo_flush_ascii85_file { cairo_ascii85_file status { cairo_ascii85_file flushfile } if } def 63 | /cairo_image { image cairo_flush_ascii85_file } def 64 | /cairo_imagemask { imagemask cairo_flush_ascii85_file } def 65 | %%EndProlog 66 | %%BeginSetup 67 | %%EndSetup 68 | %%Page: 1 1 69 | %%BeginPageSetup 70 | %%PageBoundingBox: 251 143 709 643 71 | %%EndPageSetup 72 | q 251 143 458 500 rectclip 73 | 1 0 0 -1 0 768 cm q 74 | 0.6 0.831373 0.12549 rg 75 | 291.887 406.699 m 433.641 347.023 l 433.641 207.727 l 251.422 168.02 l 76 | 291.887 406.699 l f* 77 | 0.176471 0.478431 0.784314 rg 78 | 433.641 126.5 m 280.879 157.719 l 433.641 191.004 l h 79 | 433.641 126.5 m f* 80 | 0 0.678431 0.937255 rg 81 | 526.543 384.84 m 526.543 490.191 l 653.219 420.633 l h 82 | 526.543 384.84 m f* 83 | 1 0.94902 0 rg 84 | 442.328 361.039 m 307.781 417.68 l 510.195 492.281 l 510.195 380.215 l 85 | 442.328 361.039 l f* 86 | 0.662745 0.231373 0.639216 rg 87 | 526.539 367.879 m 668.094 407.879 l 708.578 169.082 l 526.539 223.898 l 88 | 526.539 367.879 l f* 89 | 0.956863 0.447059 0.0862745 rg 90 | 449.996 211.293 m 449.996 346.25 l 510.188 363.258 l 510.188 224.41 l 449.996 91 | 211.293 l f* 92 | 0.929412 0.109804 0.141176 rg 93 | 681.43 160.199 m 449.988 125.977 l 449.988 194.57 l 518.039 209.402 l 681.43 94 | 160.199 l f* 95 | 0.968627 0.933333 0.0156863 rg 96 | 357.324 624.074 m 330.984 624.074 l 330.984 551.641 l 357.324 551.641 l 97 | 381.297 551.641 393.285 563.051 393.285 585.875 c 393.285 611.34 381.297 98 | 624.074 357.324 624.074 c h 99 | 340.348 559.234 m 340.348 616.48 l 357.324 616.48 l 375.047 616.48 383.906 100 | 606.277 383.906 585.875 c 383.906 568.113 375.047 559.234 357.324 559.234 101 | c h 102 | 403.414 608.023 m 403.414 597.109 410.609 591.652 425 591.652 c 429.621 103 | 591.652 434.246 591.988 438.875 592.66 c 438.875 587.648 l 438.875 581.539 104 | 434.098 578.484 424.539 578.484 c 419.062 578.484 413.457 
579.328 407.723 105 | 581.02 c 407.723 573.41 l 413.457 571.73 419.062 570.891 424.539 570.891 106 | c 440.34 570.891 448.242 576.395 448.242 587.402 c 448.242 624.074 l 442.98 107 | 624.074 l 439.738 618.801 l 434.438 622.316 428.66 624.074 422.406 624.074 108 | c 409.746 624.074 403.414 618.723 403.414 608.023 c h 109 | 412.781 607.863 m 412.781 613.605 415.988 616.48 422.406 616.48 c 428.891 110 | 616.48 434.379 614.754 438.875 611.309 c 438.875 600.27 l 434.246 599.586 111 | 429.621 599.246 425 599.246 c 416.855 599.246 412.781 602.117 412.781 607.863 112 | c h 113 | 463.441 621.898 m 463.441 551.641 l 472.809 551.641 l 472.809 573.41 l 114 | 476.832 571.73 481.141 570.891 485.734 570.891 c 502.551 570.891 510.961 115 | 579.395 510.961 596.406 c 510.961 614.852 502.129 624.074 484.465 624.074 116 | c 476.828 624.074 469.82 623.348 463.441 621.898 c h 117 | 472.809 581.727 m 472.809 614.809 l 475.883 615.789 479.613 616.277 484.004 118 | 616.277 c 495.828 616.277 501.742 609.539 501.742 596.062 c 501.742 584.68 119 | 496.387 578.988 485.676 578.988 c 480.812 578.988 476.527 579.902 472.809 120 | 581.727 c h 121 | 523.629 604.262 m 523.629 570.891 l 532.992 570.891 l 532.992 604.418 l 122 | 532.992 612.324 536.91 616.277 544.75 616.277 c 550.121 616.277 555.234 123 | 614.031 560.094 609.535 c 560.094 570.891 l 569.461 570.891 l 569.461 624.074 124 | l 563.035 624.074 l 561.406 617.285 l 554.961 621.812 548.477 624.074 541.957 125 | 624.074 c 529.738 624.074 523.629 617.469 523.629 604.262 c h 126 | 594.027 624.074 m 584.664 624.074 l 584.664 570.891 l 590.988 570.891 l 127 | 592.66 577.676 l 598.098 573.152 604.074 570.891 610.598 570.891 c 623.875 128 | 570.891 630.512 577.488 630.512 590.688 c 630.512 624.074 l 621.129 624.074 129 | l 621.129 590.543 l 621.129 582.637 617.195 578.688 609.328 578.688 c 603.961 130 | 578.688 598.859 580.934 594.027 585.43 c h 131 | 594.027 624.074 m f 132 | Q Q 133 | showpage 134 | %%Trailer 135 | end 136 | %%EOF 137 | 
-------------------------------------------------------------------------------- /assets/logo/print_transparent.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/print_transparent.pdf -------------------------------------------------------------------------------- /assets/logo/print_transparent.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Created with Fabric.js 4.4.0 5 | 6 | 7 | 10 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /assets/logo/textonly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/textonly.png -------------------------------------------------------------------------------- /assets/logo/textonly_nobuffer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/textonly_nobuffer.png -------------------------------------------------------------------------------- /cmake/aarch64/dabun.cmake: -------------------------------------------------------------------------------- 1 | option(DABUN_BUILD_APPS_FOR_NEON "Set to ON to build apps for NEON extension" OFF) 2 | option(DABUN_BUILD_APPS_FOR_NEON_FP16 "Set to ON to build apps for NEON FP16 extension" OFF) 3 | 4 | add_library(dabun 5 | ${DABUN_COMMON_SRC_CPP_FILES}) 6 | 7 | target_include_directories(${PROJECT_NAME} 8 | PUBLIC ${PROJECT_BINARY_DIR}) 9 | 10 | target_include_directories(${PROJECT_NAME} 11 | PUBLIC include) 12 | 13 | target_include_directories(${PROJECT_NAME} 14 | PUBLIC extern/xbyak_aarch64) 15 | 16 | target_include_directories(${PROJECT_NAME} 17 
| PUBLIC ${Boost_INCLUDE_DIRS}) 18 | 19 | target_compile_options(dabun 20 | PRIVATE "-DDABUN_COMPILING_LIBDABUN") 21 | -------------------------------------------------------------------------------- /cmake/x86_64/dabun.cmake: -------------------------------------------------------------------------------- 1 | option(DABUN_BUILD_APPS_FOR_AVX2 "Set to ON to build apps for AVX2 extension" OFF) 2 | option(DABUN_BUILD_APPS_FOR_AVX2_PLUS "Set to ON to build apps for AVX512 extension using AVX512 instructions but only AVX2 (YMM) registers" OFF) 3 | option(DABUN_BUILD_APPS_FOR_AVX512 "Set to ON to build apps for AVX512 extension" OFF) 4 | option(DABUN_BUILD_APPS_FOR_AMX "Set to ON to build apps for AMX extension" OFF) 5 | 6 | add_library(dabun 7 | ${DABUN_COMMON_SRC_CPP_FILES}) 8 | 9 | target_include_directories(${PROJECT_NAME} 10 | PUBLIC ${PROJECT_BINARY_DIR}) 11 | 12 | target_include_directories(${PROJECT_NAME} 13 | PUBLIC include) 14 | 15 | # target_include_directories(${PROJECT_NAME} 16 | # PUBLIC extern/xbyak) 17 | 18 | target_compile_options(dabun 19 | PRIVATE "-DDABUN_COMPILING_LIBDABUN") 20 | -------------------------------------------------------------------------------- /dabun_config.hpp.in: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #define DABUN_VERSION_MAJOR @dabun_VERSION_MAJOR@ 9 | #define DABUN_VERSION_MINOR @dabun_VERSION_MINOR@ 10 | #define DABUN_VERSION_PATCH @dabun_VERSION_PATCH@ 11 | -------------------------------------------------------------------------------- /include/dabun/aligned_vector.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 
2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include 9 | 10 | #include 11 | 12 | namespace dabun 13 | { 14 | 15 | template 16 | using aligned_vector = std::vector>; 17 | 18 | } // namespace dabun 19 | -------------------------------------------------------------------------------- /include/dabun/arithmetic_operation.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/isa.hpp" 9 | 10 | #if defined(DABUN_ARCH_AARCH64) 11 | # include "dabun/arm/arithmetic_operation.hpp" 12 | #else 13 | # include "dabun/x86/arithmetic_operation.hpp" 14 | #endif 15 | 16 | namespace dabun 17 | { 18 | 19 | using DABUN_ISA_NAMESPACE ::fma; 20 | using DABUN_ISA_NAMESPACE ::multiply_max; 21 | using DABUN_ISA_NAMESPACE ::multiply_min; 22 | using DABUN_ISA_NAMESPACE ::non_fused_ma; 23 | using DABUN_ISA_NAMESPACE ::plus_max; 24 | 25 | using DABUN_ISA_NAMESPACE ::operation_pair; 26 | using DABUN_ISA_NAMESPACE ::operation_pair_base; 27 | 28 | namespace op 29 | { 30 | using DABUN_ISA_NAMESPACE ::basic_multiplies; 31 | using DABUN_ISA_NAMESPACE ::basic_plus; 32 | using DABUN_ISA_NAMESPACE ::duplicate_base_plus; 33 | using DABUN_ISA_NAMESPACE ::max; 34 | using DABUN_ISA_NAMESPACE ::min; 35 | } // namespace op 36 | 37 | } // namespace dabun 38 | -------------------------------------------------------------------------------- /include/dabun/arm/arithmetic_operation.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 
2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include 9 | 10 | namespace dabun 11 | { 12 | namespace arm 13 | { 14 | 15 | class operation_pair_base 16 | { 17 | }; 18 | 19 | template 20 | class operation_pair : public operation_pair_base 21 | { 22 | }; 23 | 24 | class basic_plus 25 | { 26 | }; 27 | 28 | class duplicate_base_plus 29 | { 30 | }; 31 | 32 | class max 33 | { 34 | }; 35 | 36 | class min 37 | { 38 | }; 39 | 40 | class basic_multiplies 41 | { 42 | }; 43 | 44 | inline std::shared_ptr const fma = 45 | std::make_shared(); 46 | 47 | // exclusively here to test non-fused operations as base case 48 | inline std::shared_ptr const non_fused_ma = 49 | std::make_shared(); 50 | 51 | inline std::shared_ptr const multiply_max = 52 | std::make_shared(); 53 | 54 | inline std::shared_ptr const multiply_min = 55 | std::make_shared(); 56 | 57 | inline std::shared_ptr const plus_max = 58 | std::make_shared(); 59 | 60 | } // namespace arm 61 | } // namespace dabun 62 | -------------------------------------------------------------------------------- /include/dabun/arm/configuration.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #pragma once 7 | 8 | namespace dabun 9 | { 10 | namespace arm 11 | { 12 | 13 | class OptimizationConfiguration 14 | { 15 | }; 16 | 17 | inline OptimizationConfiguration all_optims; 18 | 19 | inline OptimizationConfiguration no_optims; 20 | 21 | } // namespace arm 22 | } // namespace dabun 23 | -------------------------------------------------------------------------------- /include/dabun/arm/elementwise_operation.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include 9 | 10 | namespace dabun 11 | { 12 | namespace arm 13 | { 14 | 15 | template 16 | class elementwise_operation 17 | { 18 | private: 19 | bool is_relu_ = false; 20 | 21 | public: 22 | bool is_relu() const { return is_relu_; } 23 | 24 | explicit elementwise_operation(bool b) 25 | : is_relu_(b) 26 | { 27 | } 28 | }; 29 | 30 | template 31 | class relu_elementwise_operation 32 | { 33 | }; 34 | 35 | template 36 | class single_tensor_elementwise_operation 37 | { 38 | }; 39 | 40 | template 41 | inline auto elementwise_relu = std::make_shared>(true); 42 | 43 | template 44 | inline auto 45 | elementwise_bias = std::make_shared>(false); 46 | 47 | template 48 | inline auto 49 | elementwise_multiply = std::make_shared>(false); 50 | 51 | } // namespace arm 52 | } // namespace dabun 53 | -------------------------------------------------------------------------------- /include/dabun/arm/multi_vreg.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #pragma once 7 | 8 | #include "dabun/numeric.hpp" 9 | 10 | #include 11 | #include 12 | 13 | namespace dabun 14 | { 15 | namespace arm 16 | { 17 | 18 | // The main usage of the multi_vreg class is to increase the amount of 19 | // independent operations when accumulating to a single vector 20 | // register. This is accomplished by using multiple vector registers 21 | // which are reduced to a single one at the end. Each of the size_ 22 | // registers is independent of all the other ones. 23 | 24 | template 25 | class multi_vreg 26 | { 27 | private: 28 | int size_ = 0; 29 | int first_ = 0; 30 | int current_ = 0; 31 | int vlen_ = 4; 32 | int original_size_ = 0; 33 | 34 | public: 35 | multi_vreg() {} 36 | 37 | multi_vreg(int s, int f) 38 | : size_(s) 39 | , first_(f) 40 | , current_(0) 41 | , original_size_(s) 42 | { 43 | assert(s > 0); 44 | } 45 | 46 | void reset() 47 | { 48 | size_ = original_size_; 49 | current_ = 0; 50 | } 51 | 52 | multi_vreg(multi_vreg const&) = delete; 53 | multi_vreg& operator=(multi_vreg const&) = delete; 54 | 55 | multi_vreg(multi_vreg&& o) { *this = std::move(o); } 56 | 57 | multi_vreg& operator=(multi_vreg&& o) 58 | { 59 | assert(o.size_ > 0); 60 | size_ = o.size_; 61 | first_ = o.first_; 62 | current_ = o.current_; 63 | original_size_ = o.original_size_; 64 | return *this; 65 | } 66 | 67 | int size() const { return size_; } 68 | 69 | VReg operator++(int) 70 | { 71 | int c = current_; 72 | current_ = (current_ + 1) % size_; 73 | return VReg(first_ + c); 74 | } 75 | 76 | VReg operator[](int s) const 77 | { 78 | assert(s < size_); 79 | return VReg(first_ + s); 80 | } 81 | 82 | VReg operator++() 83 | { 84 | current_ = (current_ + 1) % size_; 85 | return VReg(first_ + current_); 86 | } 87 | 88 | VReg current() const { return VReg(first_ + current_); } 89 | 90 | VReg first() const { return VReg(first_); } 91 | 92 | template 93 | void half(Code_Generator& code_generator) 94 | { 95 | static_assert(std::is_same_v || 96 | std::is_same_v); 97 
| 98 | int h = (size_ + 1) / 2; 99 | for (int i = 0; i + h < size_; ++i) 100 | { 101 | if constexpr (std::is_same_v) 102 | { 103 | code_generator.fadd(VReg(first_ + i).s4, VReg(first_ + i).s4, 104 | VReg(first_ + i + h).s4); 105 | } 106 | else if constexpr (std::is_same_v) 107 | { 108 | code_generator.fadd(VReg(first_ + i).h8, VReg(first_ + i).h8, 109 | VReg(first_ + i + h).h8); 110 | } 111 | } 112 | size_ = h; 113 | current_ = 0; 114 | } 115 | 116 | template 117 | void reduce(Code_Generator& code_generator) 118 | { 119 | static_assert(std::is_same_v || 120 | std::is_same_v); 121 | 122 | while (size_ > 1) 123 | { 124 | half(code_generator); 125 | } 126 | } 127 | 128 | template 129 | void full_reduce(Code_Generator& code_generator, int mask = 4, 130 | int zero_vector = 0) 131 | { 132 | static_assert(std::is_same_v || 133 | std::is_same_v); 134 | 135 | reduce(code_generator); 136 | assert(size_ == 1); 137 | 138 | if constexpr (std::is_same_v) 139 | { 140 | if (mask == 3) 141 | { 142 | // x4/w4 is zero reg by convention in the loop_nest.hpp 143 | code_generator.ins(VReg(first_).s4[3], code_generator.w4); 144 | } 145 | if (mask > 2) 146 | { 147 | code_generator.faddp(VReg(first_).s4, VReg(first_).s4, 148 | VReg(first_).s4); 149 | } 150 | if (mask > 1) 151 | { 152 | code_generator.faddp(SReg(first_), VReg(first_).s2); 153 | } 154 | } 155 | else if constexpr (std::is_same_v) 156 | { 157 | switch (mask) 158 | { 159 | case 3: 160 | code_generator.ins(VReg(first_).h8[3], VReg(zero_vector).h8[3]); 161 | break; 162 | case 5: 163 | code_generator.ins(VReg(first_).h8[5], VReg(zero_vector).h8[5]); 164 | // fallthrough 165 | case 6: 166 | code_generator.ins(VReg(first_).s4[3], code_generator.w4); 167 | break; 168 | case 7: 169 | code_generator.ins(VReg(first_).h8[7], VReg(zero_vector).h8[7]); 170 | break; 171 | default: 172 | break; 173 | } 174 | 175 | if (mask > 4) 176 | { 177 | code_generator.faddp(VReg(first_).h8, VReg(first_).h8, 178 | VReg(first_).h8); 179 | } 180 | if (mask 
> 2) 181 | { 182 | code_generator.faddp(VReg(first_).h4, VReg(first_).h4, 183 | VReg(first_).h4); 184 | } 185 | if (mask > 1) 186 | { 187 | // TO DO HERE. 188 | code_generator.faddp(HReg(first_), VReg(first_).h2); 189 | } 190 | } 191 | } 192 | }; 193 | 194 | } // namespace arm 195 | } // namespace dabun 196 | -------------------------------------------------------------------------------- /include/dabun/arm/peak_gflops.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/isa.hpp" 9 | #ifdef DABUN_ARCH_AARCH64 10 | 11 | # include "dabun/code_generator/code_generator.hpp" 12 | # include "dabun/isa.hpp" 13 | # include "dabun/math.hpp" 14 | # include "dabun/numeric.hpp" 15 | 16 | # include 17 | 18 | # include 19 | 20 | namespace dabun 21 | { 22 | namespace arm 23 | { 24 | 25 | template 26 | struct bench_gflops; 27 | 28 | template 29 | struct bench_gflops 30 | { 31 | private: 32 | static constexpr int vector_size = isa_traits::vector_size; 33 | 34 | class test : public code_generator 35 | { 36 | private: 37 | Reg64 ZeroReg_ = x4; 38 | 39 | public: 40 | test() 41 | { 42 | eor(ZeroReg_, ZeroReg_, ZeroReg_); 43 | ins(v0.d[0], ZeroReg_); 44 | ins(v0.d[1], ZeroReg_); 45 | ins(v1.d[0], ZeroReg_); 46 | ins(v1.d[1], ZeroReg_); 47 | 48 | auto loopLabel = make_label(); 49 | L_aarch64(*loopLabel); 50 | 51 | for (int i = 0; i < 10; ++i) 52 | { 53 | if constexpr (std::is_same_v) 54 | { 55 | for (int r = 2; r < 8; ++r) 56 | { 57 | fmla(VReg(r).s4, v0.s4, v1.s4); 58 | } 59 | for (int r = 16; r < 32; ++r) 60 | { 61 | fmla(VReg(r).s4, v0.s4, v1.s4); 62 | } 63 | } 64 | else 65 | { 66 | for (int r = 2; r < 8; ++r) 67 | { 68 | fmla(VReg(r).h8, v0.h8, v1.h8); 69 | } 70 | for (int r = 16; r < 32; 
++r) 71 | { 72 | fmla(VReg(r).h8, v0.h8, v1.h8); 73 | } 74 | } 75 | } 76 | 77 | sub(x0, x0, 1); 78 | cbnz(x0, *loopLabel); 79 | 80 | ret(); 81 | } 82 | }; 83 | 84 | public: 85 | static std::pair do_bench(int iterations = 10000000) 86 | { 87 | auto fn = test().get_shared(); 88 | 89 | double secs = sysml::measure_fastest([&]() { fn(iterations); }, 100); 90 | 91 | double gflops = 2.0 * iterations * 10 * (16 + 6) * 92 | (vector_size * 4 / sizeof(Arithmetic)) / 1000000000; 93 | 94 | return {gflops, secs}; 95 | } 96 | }; 97 | 98 | # ifndef DABUN_HEADER_ONLY 99 | 100 | extern template struct bench_gflops; 101 | extern template struct bench_gflops; 102 | 103 | # endif 104 | 105 | } // namespace arm 106 | } // namespace dabun 107 | 108 | #endif 109 | -------------------------------------------------------------------------------- /include/dabun/arm/xbyak.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #pragma once 7 | 8 | #include "dabun/isa.hpp" 9 | #ifdef DABUN_ARCH_AARCH64 10 | 11 | # include "xbyak_aarch64/xbyak_aarch64.h" 12 | 13 | # include "dabun/core.hpp" 14 | 15 | using xbyak_buffer_type = std::uint32_t; 16 | 17 | namespace Xbyak 18 | { 19 | using namespace Xbyak_aarch64; 20 | using CodeArray = CodeArrayAArch64; 21 | using Allocator = AllocatorAArch64; 22 | using CodeGenerator = CodeGeneratorAArch64; 23 | using Reg64 = XReg; 24 | using Label = LabelAArch64; 25 | } // namespace Xbyak 26 | 27 | namespace dabun 28 | { 29 | 30 | template 31 | struct vreg_view 32 | { 33 | private: 34 | static_assert(ElementSize == 1 || ElementSize == 2 || ElementSize == 4 || 35 | ElementSize == 8); 36 | 37 | public: 38 | decltype(auto) operator()(Xbyak::VReg const& r) 39 | { 40 | if constexpr (ElementSize == 1) 41 | { 42 | if constexpr (NumElements == 4) 43 | { 44 | return r.b4; 45 | } 46 | else if constexpr (NumElements == 8) 47 | { 48 | return r.b8; 49 | } 50 | else if constexpr (NumElements == 16) 51 | { 52 | return r.b16; 53 | } 54 | else 55 | { 56 | strong_assert(false); 57 | return nullptr; 58 | } 59 | } 60 | else if constexpr (ElementSize == 2) 61 | { 62 | if constexpr (NumElements == 2) 63 | { 64 | return r.h2; 65 | } 66 | else if constexpr (NumElements == 4) 67 | { 68 | return r.h4; 69 | } 70 | else if constexpr (NumElements == 8) 71 | { 72 | return r.h8; 73 | } 74 | else 75 | { 76 | strong_assert(false); 77 | return nullptr; 78 | } 79 | } 80 | else if constexpr (ElementSize == 4) 81 | { 82 | // if constexpr (NumElements == 1) 83 | // { 84 | // return r.s1; 85 | // } 86 | //else 87 | if constexpr (NumElements == 2) 88 | { 89 | return r.s2; 90 | } 91 | else if constexpr (NumElements == 4) 92 | { 93 | return r.s4; 94 | } 95 | else 96 | { 97 | strong_assert(false); 98 | return nullptr; 99 | } 100 | } 101 | else if constexpr (ElementSize == 8) 102 | { 103 | if constexpr (NumElements == 1) 104 | { 105 | return r.d1; 106 | } 107 | else if constexpr (NumElements == 
2) 108 | { 109 | return r.d2; 110 | } 111 | else 112 | { 113 | strong_assert(false); 114 | return nullptr; 115 | } 116 | } 117 | else 118 | { 119 | strong_assert(false); 120 | return nullptr; 121 | } 122 | } 123 | }; 124 | 125 | } // namespace dabun 126 | 127 | #endif 128 | -------------------------------------------------------------------------------- /include/dabun/check.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/numeric.hpp" 9 | #include "sysml/math.hpp" 10 | 11 | #include 12 | #include 13 | 14 | namespace dabun 15 | { 16 | 17 | template 18 | void apply_relu(Float* Begin, Float* End) 19 | { 20 | for (; Begin != End; ++Begin) 21 | { 22 | if constexpr (std::is_same_v) 23 | { 24 | *Begin = static_cast( 25 | std::max(static_cast(0), static_cast(*Begin))); 26 | } 27 | else 28 | { 29 | *Begin = std::max(static_cast(0), *Begin); 30 | } 31 | } 32 | } 33 | 34 | template 35 | auto max_abs_difference(Float const* LBegin, Float const* LEnd, 36 | Float const* RBegin) 37 | { 38 | decltype(sysml::absolute_difference(*LBegin, *RBegin)) res = 0; 39 | 40 | for (; LBegin != LEnd; ++LBegin, ++RBegin) 41 | { 42 | res = std::max(res, sysml::absolute_difference(*LBegin, *RBegin)); 43 | } 44 | return res; 45 | } 46 | 47 | template 48 | Float max_abs_difference_verbose(Float const* LBegin, Float const* LEnd, 49 | Float const* RBegin) 50 | { 51 | int off = 0; 52 | Float res = 0; 53 | for (; LBegin != LEnd; ++LBegin, ++RBegin) 54 | { 55 | if constexpr (std::is_same_v) 56 | { 57 | std::cout << off++ << " : " << *LBegin << " " 58 | << static_cast(*RBegin) << " " 59 | << std::abs(static_cast(*LBegin - *RBegin)) 60 | << "\n"; 61 | res = static_cast( 62 | std::max(static_cast(res), 63 | 
std::abs(static_cast(*LBegin) - 64 | static_cast(*RBegin)))); 65 | } 66 | else 67 | { 68 | std::cout << off++ << " : " << *LBegin << " " << *RBegin << " " 69 | << std::abs(*LBegin - *RBegin) << "\n"; 70 | res = std::max(res, std::abs(*LBegin - *RBegin)); 71 | } 72 | } 73 | return res; 74 | } 75 | 76 | template 77 | Float max_abs_difference_verbose(Float const* LBegin, Float const* LEnd, 78 | Float const* RBegin, float delta) 79 | { 80 | int off = 0; 81 | Float res = 0; 82 | for (; LBegin != LEnd; ++LBegin, ++RBegin) 83 | { 84 | if (std::abs(*LBegin - *RBegin) > delta) 85 | { 86 | std::cout << off << " : " << *LBegin << " " << *RBegin << " " 87 | << std::abs(*LBegin - *RBegin) << "\n"; 88 | } 89 | res = std::max(res, std::abs(*LBegin - *RBegin)); 90 | off++; 91 | } 92 | return res; 93 | } 94 | 95 | } // namespace dabun 96 | -------------------------------------------------------------------------------- /include/dabun/code_generator.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/code_generator/code_generator.hpp" 9 | -------------------------------------------------------------------------------- /include/dabun/code_generator/aot_fn.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | 11 | namespace dabun 12 | { 13 | 14 | // using ::sysml::code_generator::code_generated_fn_ref; 15 | using ::sysml::code_generator::observed_dynamic_fn; 16 | using ::sysml::code_generator::shared_dynamic_fn; 17 | using ::sysml::code_generator::unique_dynamic_fn; 18 | using ::sysml::code_generator::weak_dynamic_fn; 19 | 20 | using ::sysml::code_generator::dynamic_fn_cast; 21 | 22 | } // namespace dabun 23 | -------------------------------------------------------------------------------- /include/dabun/code_generator/code_generator.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #include "dabun/code_generator/xbyak.hpp" 13 | 14 | namespace dabun 15 | { 16 | using ::sysml::code_generator::allocator_adapter_base; 17 | using ::sysml::code_generator::basic_code_generator; 18 | using ::sysml::code_generator::code_generator; 19 | using ::sysml::code_generator::with_signature; 20 | } // namespace dabun 21 | -------------------------------------------------------------------------------- /include/dabun/code_generator/memory_resource.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #pragma once 7 | 8 | #include 9 | 10 | namespace dabun 11 | { 12 | 13 | using ::sysml::code_generator::inplace_memory_resource; 14 | using ::sysml::code_generator::malloc_memory_resource; 15 | using ::sysml::code_generator::memory_resource; 16 | using ::sysml::code_generator::mmap_memory_resource; 17 | 18 | } // namespace dabun 19 | -------------------------------------------------------------------------------- /include/dabun/code_generator/xbyak.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/isa.hpp" 9 | 10 | #if defined(DABUN_ARCH_AARCH64) 11 | 12 | # include "dabun/arm/xbyak.hpp" 13 | 14 | #else 15 | 16 | # include "dabun/x86/xbyak.hpp" 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /include/dabun/common.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #pragma once 7 | 8 | #include "dabun/code_generator/xbyak.hpp" 9 | #include "dabun/core.hpp" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | namespace dabun 20 | { 21 | 22 | static inline constexpr int skip_postop = 0b10; 23 | static inline constexpr int alpha_1 = 0b01; 24 | static inline constexpr int alpha_0 = 0b00; 25 | 26 | enum access_kind 27 | { 28 | SCALAR, 29 | VECTOR_PACKED, 30 | VECTOR_STRIDED 31 | }; 32 | 33 | inline std::string to_string(access_kind akind) 34 | { 35 | switch (akind) 36 | { 37 | case SCALAR: 38 | return "scalar"; 39 | case VECTOR_PACKED: 40 | return "vector_packed"; 41 | case VECTOR_STRIDED: 42 | return "vector_strided"; 43 | } 44 | return "unknown"; 45 | } 46 | 47 | struct loop_descriptor 48 | { 49 | std::string var; 50 | int end; 51 | int delta; 52 | }; 53 | 54 | struct tensor_traits 55 | { 56 | std::string name; 57 | access_kind access; 58 | Xbyak::Reg64 reg = Xbyak::Reg64(0); 59 | Xbyak::Label* stridesLabel; 60 | int innermost_stride; 61 | int access_len; 62 | }; 63 | 64 | template 65 | struct memory_argument_type 66 | { 67 | int offset; 68 | tensor_traits const* traits; 69 | int mask; 70 | std::map coordinates; 71 | 72 | memory_argument_type(int offset, tensor_traits const* traits, int mask, 73 | std::map coordinates = {}) 74 | : offset(offset) 75 | , traits(traits) 76 | , mask(mask) 77 | , coordinates(coordinates){}; 78 | // We are not comparing the mask 79 | 80 | bool operator<(memory_argument_type const& o) const 81 | { 82 | return std::tie(offset, mask, traits->name) < 83 | std::tie(o.offset, mask, o.traits->name); 84 | } 85 | 86 | bool operator==(memory_argument_type const& o) const 87 | { 88 | return std::tie(offset, mask, traits->name) == 89 | std::tie(o.offset, mask, o.traits->name); 90 | } 91 | 92 | std::string readable() const 93 | { 94 | assert(traits); 95 | return traits->name + "[" + std::to_string(offset) + ":" + 96 | std::to_string(traits->access 
== SCALAR ? 1 : vector_size) + 97 | "]{" + std::to_string(traits->innermost_stride) + "}{" + 98 | std::to_string(mask) + "}"; 99 | } 100 | }; 101 | 102 | struct in_register_tensor_pointer_type 103 | { 104 | std::string name; 105 | Xbyak::Reg64 reg; 106 | std::map strides; 107 | }; 108 | 109 | inline int get_cursor_offset(std::map coordinates, 110 | std::map strides) 111 | { 112 | int off = 0; 113 | for (auto const& s : strides) 114 | { 115 | off += coordinates[s.first] * s.second; 116 | } 117 | return off; 118 | } 119 | 120 | } // namespace dabun 121 | -------------------------------------------------------------------------------- /include/dabun/configuration.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/isa.hpp" 9 | 10 | #if defined(DABUN_ARCH_AARCH64) 11 | # include "dabun/arm/configuration.hpp" 12 | #else 13 | # include "dabun/x86/configuration.hpp" 14 | #endif 15 | 16 | namespace dabun 17 | { 18 | 19 | using DABUN_ISA_NAMESPACE ::all_optims; 20 | using DABUN_ISA_NAMESPACE ::no_optims; 21 | using DABUN_ISA_NAMESPACE ::OptimizationConfiguration; 22 | 23 | } // namespace dabun 24 | -------------------------------------------------------------------------------- /include/dabun/core.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
#pragma once

#include <cstring>
#include <stdexcept>
#include <type_traits>
#include <utility>

// Two-level stringification: the outer macro lets its argument be
// macro-expanded before the inner one turns it into a string literal.
#define DABUN_STRINGIFY_0(s) #s
#define DABUN_STRINGIFY(s) DABUN_STRINGIFY_0(s)

// The two macros checked here are derived below and must not be pre-defined
// by the including translation unit.
#if defined(DABUN_REQUIES_TEMPLATE_DEFINITION) ||                              \
    defined(DABUN_MAYBE_EXTN_TPL_INSTNTON)
#    error The macros above cannot be defined at this stage!
#endif

// DABUN_HEADER_ONLY            -> DABUN_REQUIES_TEMPLATE_DEFINITION only.
// DABUN_COMPILING_LIBDABUN     -> DABUN_REQUIES_TEMPLATE_DEFINITION, and
//                                 DABUN_MAYBE_EXTN_TPL_INSTNTON = `template`.
// neither                      -> DABUN_MAYBE_EXTN_TPL_INSTNTON =
//                                 `extern template`.
// both                         -> hard error.
#if defined(DABUN_HEADER_ONLY)
#    if defined(DABUN_COMPILING_LIBDABUN)
#        error Unsupported combination of defines
#    else
#        define DABUN_REQUIES_TEMPLATE_DEFINITION
#    endif
#else
#    if defined(DABUN_COMPILING_LIBDABUN)
#        define DABUN_REQUIES_TEMPLATE_DEFINITION
#        define DABUN_MAYBE_EXTN_TPL_INSTNTON template
#    else
#        define DABUN_MAYBE_EXTN_TPL_INSTNTON extern template
#    endif
#endif

// Throws std::runtime_error when `condition` is false; unlike assert() it is
// not affected by NDEBUG. Use it as a statement: `strong_assert(x);`.
//
// The body is wrapped in do { ... } while (false) so the expansion is a
// single statement. The previous form, `if (...) {...} static_cast<void>(0)`,
// expanded to two statements and therefore did not compile as the un-braced
// body of an if/else. The message text is unchanged.
#define strong_assert(condition)                                               \
    do                                                                         \
    {                                                                          \
        if (!(condition))                                                      \
        {                                                                      \
            throw std::runtime_error(                                          \
                DABUN_STRINGIFY(condition) " failed file: " __FILE__           \
                                           " line: " DABUN_STRINGIFY(          \
                                               (__LINE__)));                   \
        }                                                                      \
    } while (false)

namespace dabun
{

// True when this header is compiled without NDEBUG.
#ifndef NDEBUG
inline constexpr bool compiled_in_debug_mode = true;
#else
inline constexpr bool compiled_in_debug_mode = false;
#endif

// FROM: https://en.cppreference.com/w/cpp/utility/variant/visit

// Aggregates several callables into one object exposing all of their
// operator() overloads.
template <class... Ts>
struct overloaded : Ts...
{
    using Ts::operator()...;
};

// explicit deduction guide (not needed as of C++20)
template <class... Ts>
overloaded(Ts...) -> overloaded<Ts...>;

// Wraps T in a member typedef `type`.
template <class T>
struct identity_type
{
    using type = T;
};

template <class T>
using identity_type_t = typename identity_type<T>::type;

// Sourced from https://en.cppreference.com/w/cpp/numeric/bit_cast
// to enable bit_cast from C++20
//
// Copies sizeof(To) bytes of `src` into a default-initialised To and returns
// it.
//
// NOTE(review): the template parameter list and the start of the enable_if
// condition were lost in extraction; they are reconstructed from the cited
// cppreference implementation (equal sizes, both types trivially copyable)
// -- confirm.
template <class To, class From>
std::enable_if_t<sizeof(To) == sizeof(From) &&
                     std::is_trivially_copyable_v<From> &&
                     std::is_trivially_copyable_v<To>,
                 To>
// constexpr support needs compiler magic
bit_cast(const From& src) noexcept
{
    static_assert(std::is_trivially_constructible_v<To>,
                  "This implementation additionally requires destination type "
                  "to be trivially constructible");

    To dst;
    std::memcpy(&dst, &src, sizeof(To));
    return dst;
}

// Type of the expression `T1 OP T2`, evaluated on std::declval operands.
//
// NOTE(review): the wrapper applied to T1/T2 inside std::declval<> was lost
// in extraction; std::decay_t is a reconstruction -- confirm.
#define DABUN_OP_RESULT_TYPE(OP, T1, T2)                                       \
    decltype(std::declval<std::decay_t<T1>>()                                  \
                 OP std::declval<std::decay_t<T2>>())

#define DABUN_ALWAYS_INLINE __attribute__((always_inline)) inline

} // namespace dabun

// -----------------------------------------------------------------------------
// /include/dabun/elementwise_operation.hpp:
// -----------------------------------------------------------------------------
// Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
5 | 6 | #pragma once 7 | 8 | #include "dabun/isa.hpp" 9 | 10 | #if defined(DABUN_ARCH_AARCH64) 11 | # include "dabun/arm/elementwise_operation.hpp" 12 | #else 13 | # include "dabun/x86/elementwise_operation.hpp" 14 | #endif 15 | 16 | namespace dabun 17 | { 18 | 19 | using DABUN_ISA_NAMESPACE ::elementwise_bias; 20 | using DABUN_ISA_NAMESPACE ::elementwise_multiply; 21 | using DABUN_ISA_NAMESPACE ::elementwise_relu; 22 | 23 | using DABUN_ISA_NAMESPACE ::elementwise_operation; 24 | using DABUN_ISA_NAMESPACE ::relu_elementwise_operation; 25 | using DABUN_ISA_NAMESPACE ::single_tensor_elementwise_operation; 26 | 27 | } // namespace dabun 28 | -------------------------------------------------------------------------------- /include/dabun/hask/apple.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
#pragma once

#if defined(__APPLE__)

// NOTE(review): the three angle-bracket include targets that stood here were
// lost when this file was extracted. The list below covers what the code
// uses (std::size_t, std::strtol, INT_MAX, sysctlbyname, MAP_JIT) --
// confirm against the repository.
#    include <climits>
#    include <cstddef>
#    include <cstdlib>
#    include <sys/mman.h>
#    include <sys/types.h>
#    include <sys/sysctl.h>

// Fallback definition for SDK headers that do not provide MAP_JIT.
#    ifndef MAP_JIT
#        define MAP_JIT 0x800
#    endif

namespace dabun::hask
{

// Value that get_macOS_version() is compared against for macOS Mojave.
inline constexpr int mojave_version = 18;

// Returns the leading integer of the "kern.osrelease" sysctl string, or 0
// when the sysctl call fails, the number is not followed by '.', or it does
// not fit in an int. The value is computed once and cached in a
// function-local static.
inline int get_macOS_version()
{
    static const int version = []() -> int
    {
        // Zero-filled, and one byte is withheld from sysctl, so the buffer
        // is NUL-terminated regardless of what the call writes.
        char        buffer[64] = {};
        std::size_t size       = sizeof(buffer) - 1;

        if (sysctlbyname("kern.osrelease", buffer, &size, nullptr, 0) != 0)
        {
            return 0;
        }

        char*      endp      = nullptr;
        long const ver_major = std::strtol(buffer, &endp, 10);

        // The major number must be followed by '.'; the range check makes
        // the long -> int conversion below explicit and safe.
        if (*endp != '.' || ver_major < 0 || ver_major > INT_MAX)
        {
            return 0;
        }
        return static_cast<int>(ver_major);
    }();

    return version;
}

} // namespace dabun::hask

#endif

// -----------------------------------------------------------------------------
// /include/dabun/isa.hpp:
// -----------------------------------------------------------------------------
// Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
5 | 6 | #pragma once 7 | 8 | #include 9 | 10 | #ifndef DABUN_ISA 11 | 12 | # if defined(__AVX512F__) 13 | # define DABUN_ISA avx512 14 | # elif defined(__aarch64__) 15 | # define DABUN_ISA aarch64 16 | # else // default to avx2 17 | // #elif defined(__AVX2__) 18 | # define DABUN_ISA avx2 19 | // #error "ISA not supported" 20 | # endif 21 | 22 | #endif 23 | 24 | #ifndef DABUN_VEX 25 | # ifdef DABUN_ARITHMETIC 26 | # undef DABUN_ARITHMETIC 27 | # endif 28 | # 29 | # if defined(SYSML_ARCH_AMD64) 30 | # define DABUN_VEX extension::avx2 31 | # define DABUN_ARITHMETIC dabun::float 32 | # elif defined(SYSML_ARCH_ARM64) 33 | # define DABUN_VEX extension::aarch64 34 | # define DABUN_ARITHMETIC dabun::float 35 | # else 36 | # error "Not supported" 37 | # endif 38 | #endif 39 | 40 | namespace dabun 41 | { 42 | 43 | struct avx2 44 | { 45 | }; 46 | struct avx512 47 | { 48 | }; 49 | struct avx2_plus 50 | { 51 | }; 52 | struct aarch64 53 | { 54 | }; 55 | 56 | template 57 | struct isa_traits; 58 | 59 | template <> 60 | struct isa_traits 61 | { 62 | static constexpr int total_vector_registers = 16; 63 | static constexpr int vector_register_mask = 1; 64 | static constexpr int vector_size = 8; 65 | }; 66 | 67 | template <> 68 | struct isa_traits 69 | { 70 | static constexpr int total_vector_registers = 32; 71 | static constexpr int vector_register_mask = 0; 72 | static constexpr int vector_size = 16; 73 | }; 74 | 75 | template <> 76 | struct isa_traits 77 | { 78 | static constexpr int total_vector_registers = 16; 79 | static constexpr int vector_register_mask = 0; 80 | static constexpr int vector_size = 8; 81 | }; 82 | 83 | template <> 84 | struct isa_traits 85 | { 86 | static constexpr int total_vector_registers = 32; 87 | static constexpr int vector_register_mask = 0; 88 | static constexpr int vector_size = 4; 89 | static constexpr int fp16_vector_size = 2; 90 | }; 91 | 92 | } // namespace dabun 93 | 94 | // Copyright 2004-present Facebook. All Rights Reserved. 
95 | 96 | // #pragma once 97 | 98 | // For deprecated APIs 99 | // #include "dabun/isa.hpp" 100 | 101 | #include 102 | #include 103 | 104 | #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || \ 105 | defined(__x86_64) 106 | # define DABUN_ARCH_X86_64 107 | #elif defined(__aarch64__) 108 | # define DABUN_ARCH_AARCH64 109 | #else 110 | # error "Unknown target architecture" 111 | #endif 112 | 113 | namespace dabun 114 | { 115 | 116 | enum class architecture_kind : int 117 | { 118 | unknown = 0, 119 | x86_64 = 1, 120 | aarch64 = 2 121 | }; 122 | 123 | enum class extension : int 124 | { 125 | unknown = 0, 126 | 127 | // #if defined(DABUN_ARCH_X86_64) 128 | avx = 1001, 129 | avx2 = 1002, 130 | avx512_ymm = 1003, 131 | avx512 = 1004, 132 | // #elif defined(DABUN_ARCH_AARCH64) 133 | neon = 2001, 134 | neon_fp16 = 2002 135 | // #endif 136 | }; 137 | 138 | template 139 | struct extension_traits; 140 | 141 | // TODO(zi) deprecate the following two 142 | template 143 | struct extension_to_deprecated_ISA; 144 | 145 | template 146 | using extension_to_deprecated_ISA_t = 147 | typename extension_to_deprecated_ISA::type; 148 | 149 | #if defined(DABUN_ARCH_X86_64) 150 | 151 | template <> 152 | struct extension_traits 153 | { 154 | static constexpr architecture_kind architecture = architecture_kind::x86_64; 155 | static constexpr int vector_register_bits = 256; 156 | static constexpr int vector_size = 32; 157 | static constexpr bool has_mask_register = false; 158 | static constexpr int num_vector_registers = 32; 159 | }; 160 | 161 | template <> 162 | struct extension_traits 163 | { 164 | static constexpr architecture_kind architecture = architecture_kind::x86_64; 165 | static constexpr int vector_register_bits = 256; 166 | static constexpr int vector_size = 32; 167 | static constexpr bool has_mask_register = true; 168 | static constexpr int num_vector_registers = 32; 169 | }; 170 | 171 | template <> 172 | struct extension_traits 173 | { 174 | static constexpr 
architecture_kind architecture = architecture_kind::x86_64; 175 | static constexpr int vector_register_bits = 512; 176 | static constexpr int vector_size = 64; 177 | static constexpr bool has_mask_register = false; 178 | static constexpr int num_vector_registers = 32; 179 | }; 180 | 181 | // TODO(zi) deprecate 182 | template <> 183 | struct extension_to_deprecated_ISA 184 | { 185 | using type = avx2; 186 | }; 187 | template <> 188 | struct extension_to_deprecated_ISA 189 | { 190 | using type = avx2_plus; 191 | }; 192 | template <> 193 | struct extension_to_deprecated_ISA 194 | { 195 | using type = avx512; 196 | }; 197 | 198 | #elif defined(DABUN_ARCH_AARCH64) 199 | 200 | template <> 201 | struct extension_traits 202 | { 203 | static constexpr architecture_kind architecture = 204 | architecture_kind::aarch64; 205 | static constexpr int vector_register_bits = 128; 206 | static constexpr int vector_size = 16; 207 | static constexpr bool has_mask_register = false; 208 | static constexpr int num_vector_registers = 32; 209 | }; 210 | 211 | template <> 212 | struct extension_traits 213 | { 214 | static constexpr architecture_kind architecture = 215 | architecture_kind::aarch64; 216 | static constexpr int vector_register_bits = 128; 217 | static constexpr int vector_size = 16; 218 | static constexpr bool has_mask_register = false; 219 | static constexpr int num_vector_registers = 32; 220 | }; 221 | 222 | // TODO(zi) deprecate 223 | template <> 224 | struct extension_to_deprecated_ISA 225 | { 226 | using type = aarch64; 227 | }; 228 | template <> 229 | struct extension_to_deprecated_ISA 230 | { 231 | using type = aarch64; 232 | }; 233 | 234 | #endif 235 | 236 | } // namespace dabun 237 | 238 | #if defined(DABUN_ARCH_AARCH64) 239 | # define DABUN_ISA_NAMESPACE arm 240 | #else 241 | # define DABUN_ISA_NAMESPACE x86 242 | #endif 243 | -------------------------------------------------------------------------------- /include/dabun/loop_nest.hpp: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/isa.hpp" 9 | #include "dabun/numeric.hpp" 10 | 11 | #if defined(DABUN_ARCH_AARCH64) 12 | # include "dabun/arm/loop_nest.hpp" 13 | #else 14 | # include "dabun/x86/loop_nest.hpp" 15 | #endif 16 | 17 | namespace dabun 18 | { 19 | 20 | using DABUN_ISA_NAMESPACE ::loop_nest_code_generator; 21 | 22 | #if defined(DABUN_ARCH_AARCH64) 23 | 24 | using DABUN_ISA_NAMESPACE ::loop_nest_fp16_code_generator; 25 | 26 | template 27 | using loop_nest_compiler = std::conditional_t< 28 | std::is_same_v, 29 | loop_nest_code_generator, false>, 30 | loop_nest_code_generator, true>>; 31 | 32 | #else 33 | 34 | template 35 | using loop_nest_compiler = std::conditional_t< 36 | std::is_same_v, 37 | loop_nest_code_generator>, void>; 38 | 39 | #endif 40 | 41 | } // namespace dabun 42 | -------------------------------------------------------------------------------- /include/dabun/loop_tree/all_nodes.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #pragma once 7 | 8 | #include "dabun/loop_tree/compiled_loop_nest_node.hpp" 9 | #include "dabun/loop_tree/compiled_transpose_node.hpp" 10 | #include "dabun/loop_tree/compute_node.hpp" 11 | #include "dabun/loop_tree/for_loop_node.hpp" 12 | #include "dabun/loop_tree/transpose_node.hpp" 13 | -------------------------------------------------------------------------------- /include/dabun/loop_tree/compiled_transpose_node.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/loop_tree/node.hpp" 9 | #include "dabun/utility/tmp_file_name.hpp" 10 | 11 | namespace dabun 12 | { 13 | namespace loop_tree 14 | { 15 | 16 | template 17 | class compiled_transpose_node : public node 18 | { 19 | private: 20 | using ISA = typename extension_to_deprecated_ISA::type; 21 | 22 | using super_type = node; 23 | 24 | std::string input; 25 | std::string output; 26 | std::vector> order; 27 | strides_map_type strides; 28 | std::optional unroll_limit; 29 | 30 | public: 31 | std::string dump(formulas_map_type const& /* formulas */, 32 | std::map const& sizes, 33 | std::string const& indent) const override 34 | { 35 | std::ostringstream ss; 36 | ss << indent << "AOT_tranpose" << std::endl; 37 | ss << utility::dump_order(order, indent); 38 | ss << utility::dump_sizes(sizes, indent); 39 | ss << indent << "Input: " << input << std::endl; 40 | ss << indent << "Output: " << output << std::endl; 41 | ss << utility::dump_strides(strides, indent); 42 | return ss.str(); 43 | } 44 | 45 | public: 46 | compiled_transpose_node( 47 | std::string const& input, std::string const& output, 48 | std::vector> const& order, 49 | strides_map_type const& strides, 50 | std::optional unroll_limit = std::nullopt) 51 | : 
super_type(node_kind::compiled_transpose) 52 | , input(input) 53 | , output(output) 54 | , order(order) 55 | , strides(strides) 56 | , unroll_limit(unroll_limit) 57 | { 58 | } 59 | 60 | compiled_transpose_node( 61 | const compiled_transpose_node& other) = default; 62 | 63 | // creates initial transpose nest 64 | compiled_transpose_node( 65 | std::shared_ptr> const& for_node, 66 | std::shared_ptr> const& transpose_node) 67 | : compiled_transpose_node( 68 | transpose_node->get_input(), transpose_node->get_output(), 69 | {{for_node->get_var(), for_node->get_delta()}}, 70 | transpose_node->get_tensor_strides(), 71 | transpose_node->get_unroll_limit()) 72 | { 73 | } 74 | 75 | // extends the tranpose nest 76 | compiled_transpose_node( 77 | std::shared_ptr> const& for_node, 78 | std::shared_ptr> const& 79 | transpose_compiler) 80 | : compiled_transpose_node(*transpose_compiler) 81 | { 82 | order.insert(order.begin(), 83 | {for_node->get_var(), for_node->get_delta()}); 84 | } 85 | 86 | std::pair, report_vector> 87 | get_fn(std::map const& tensors_idx, 88 | std::map const& sizes, 89 | std::map const&, 90 | formulas_map_type const& /* formulas */, 91 | bool spit_asm) const override 92 | { 93 | auto aot_fn = 94 | transposer_compiler< 95 | (VEX == extension::avx512 ? 
extension::avx512_ymm : VEX), 96 | Arithmetic>(order, sizes, strides.at(output), strides.at(input), 97 | 64 /* unroll_limit */) 98 | .get_shared(); 99 | 100 | // aot_fn.save_to_file("transpose.asm"); 101 | 102 | std::string asm_dump = "n/a"; 103 | 104 | if (spit_asm) 105 | { 106 | asm_dump = ::dabun::utility::get_temporary_file_name(".asm"); 107 | aot_fn.save_to_file(asm_dump); 108 | } 109 | 110 | std::string extra_string = 111 | std::string("output_idx: ") + 112 | std::to_string(tensors_idx.at(output)) + 113 | ", input_idx: " + std::to_string(tensors_idx.at(input)); 114 | 115 | compiled_transpose_node_info info{0, 0, extra_string}; 116 | 117 | return {[aot_fn, output_idx = tensors_idx.at(output), 118 | input_idx = tensors_idx.at(input)]( 119 | std::vector& tensors, std::vector&) 120 | { aot_fn(tensors[output_idx], tensors[input_idx]); }, 121 | {std::make_shared(info)}}; 122 | } 123 | 124 | std::set get_tensors_used() const override 125 | { 126 | return {input, output}; 127 | } 128 | 129 | std::set get_output_tensors() const override 130 | { 131 | return {output}; 132 | } 133 | 134 | strides_map_type const& get_tensor_strides() const override 135 | { 136 | return strides; 137 | } 138 | }; 139 | 140 | } // namespace loop_tree 141 | } // namespace dabun 142 | -------------------------------------------------------------------------------- /include/dabun/loop_tree/nested_for_loops_node.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #pragma once 7 | 8 | #include "dabun/loop_tree/node.hpp" 9 | 10 | namespace dabun 11 | { 12 | namespace loop_tree 13 | { 14 | 15 | template 16 | class for_loop_node : public node 17 | { 18 | private: 19 | using super_type = node; 20 | 21 | std::string var; 22 | int delta; 23 | 24 | std::set in_scope_tensor_names; 25 | std::set in_scope_output_tensor_names; 26 | strides_map_type in_scope_tensor_strides; 27 | 28 | public: 29 | std::string dump(formulas_map_type const& /* formulas */, 30 | std::map const& /* sizes */, 31 | std::string const& indent) const override 32 | { 33 | std::ostringstream ss; 34 | ss << indent << "Interpreted For Node" << std::endl; 35 | ss << indent << "Var=" << var << ", delta=" << delta << std::endl; 36 | return ss.str(); 37 | } 38 | 39 | private: 40 | void set_in_scope_tensor_info() 41 | { 42 | for (auto c : this->get_children()) 43 | { 44 | auto node_tensor_names = c->get_tensors_used(); 45 | in_scope_tensor_names.insert(node_tensor_names.begin(), 46 | node_tensor_names.end()); 47 | 48 | auto node_output_tensor_names = c->get_output_tensors(); 49 | in_scope_output_tensor_names.insert( 50 | node_output_tensor_names.begin(), 51 | node_output_tensor_names.end()); 52 | 53 | auto node_tensor_strides = c->get_tensor_strides(); 54 | 55 | in_scope_tensor_strides.insert(node_tensor_strides.begin(), 56 | node_tensor_strides.end()); 57 | } 58 | } 59 | 60 | std::function&, int)> 61 | get_tensor_advancer(std::map const& tensors_idx, 62 | std::set const& tensor_names) const 63 | { 64 | std::vector> to_advance; 65 | 66 | for (auto const& name : tensor_names) 67 | { 68 | if (in_scope_tensor_strides.at(name).count(var)) 69 | { 70 | std::int64_t offset = 71 | in_scope_tensor_strides.at(name).at(var) * delta; 72 | if (offset != 0) 73 | { 74 | int idx = tensors_idx.at(name); 75 | to_advance.push_back({idx, offset}); 76 | } 77 | } 78 | } 79 | 80 | return [to_advance](std::vector& tensors, int delta = 1) 81 | { 82 | for (auto const& p : to_advance) 83 | 
{ 84 | tensors[p.first] += p.second * delta; 85 | } 86 | }; 87 | } 88 | 89 | std::function&, int)> 90 | get_alpha_offsets_adjuster(std::map const& tensors_idx, 91 | std::set const& output_tensor_names, 92 | formulas_map_type const& formulas) const 93 | { 94 | 95 | std::vector to_adjust; 96 | for (auto const& name : output_tensor_names) 97 | { 98 | if (formulas.count(name) && formulas.at(name).count(var) == 0) 99 | { 100 | // reduction variable, so adjust the tensor's alpha 101 | to_adjust.push_back(tensors_idx.at(name)); 102 | } 103 | } 104 | 105 | return [to_adjust](std::vector& alpha_offsets, int adjustment) 106 | { 107 | for (auto const& idx : to_adjust) 108 | { 109 | alpha_offsets[idx] += adjustment; 110 | } 111 | }; 112 | } 113 | 114 | public: 115 | std::string const& get_var() const { return var; } 116 | int get_delta() const { return delta; } 117 | 118 | for_loop_node(std::string var, int delta, 119 | std::vector> const& children) 120 | : super_type(node_kind::for_loop) 121 | , var(var) 122 | , delta(delta) 123 | { 124 | this->set_children(children); 125 | set_in_scope_tensor_info(); 126 | } 127 | 128 | std::set get_tensors_used() const override 129 | { 130 | return in_scope_tensor_names; 131 | } 132 | 133 | std::set get_output_tensors() const override 134 | { 135 | return in_scope_output_tensor_names; 136 | } 137 | 138 | strides_map_type const& get_tensor_strides() const override 139 | { 140 | return in_scope_tensor_strides; 141 | } 142 | 143 | std::pair, report_vector> 144 | get_fn(std::map const& tensors_idx, 145 | std::map const& sizes, 146 | std::map const& outer_iteration_depths, 147 | formulas_map_type const& formulas, bool debug_mode) const override 148 | { 149 | auto var = this->var; 150 | auto delta = this->delta; 151 | auto children = this->get_children(); 152 | auto limit = sizes.at(var); 153 | 154 | auto const [full, rest] = full_rest(limit, delta); 155 | 156 | report_vector report = { 157 | std::make_shared(for_loop_node_info{ 158 | 1, 1, var, 
full + (rest ? 1 : 0), delta, limit})}; 159 | 160 | std::vector> full_fns, tail_fns; 161 | 162 | auto iteration_depths = outer_iteration_depths; 163 | 164 | int last_iteration = full + (rest ? 1 : 0) - 1; 165 | strong_assert(last_iteration >= 0); 166 | 167 | iteration_depths[var] += last_iteration; 168 | 169 | for (auto c : children) 170 | { 171 | auto inner_sizes = sizes; 172 | 173 | if (full) 174 | { 175 | inner_sizes[var] = delta; 176 | auto [fn, rep] = 177 | c->get_fn(tensors_idx, inner_sizes, iteration_depths, 178 | formulas, debug_mode); 179 | full_fns.push_back(fn); 180 | report[0]->children.insert(report[0]->children.end(), 181 | rep.begin(), rep.end()); 182 | } 183 | if (rest) 184 | { 185 | inner_sizes[var] = rest; 186 | auto [fn, rep] = 187 | c->get_fn(tensors_idx, inner_sizes, iteration_depths, 188 | formulas, debug_mode); 189 | tail_fns.push_back(fn); 190 | report.insert(report.end(), rep.begin(), rep.end()); 191 | } 192 | } 193 | 194 | auto tensor_advancer = 195 | get_tensor_advancer(tensors_idx, get_tensors_used()); 196 | auto alpha_offsets_adjuster = get_alpha_offsets_adjuster( 197 | tensors_idx, get_output_tensors(), formulas); 198 | 199 | LN_LOG(DEBUG) << "loop_tree: Executing interpreted for(" << var << "," 200 | << delta << ")\n"; 201 | 202 | return {[full, full_fns, tensor_advancer, alpha_offsets_adjuster, 203 | tail_fns](std::vector& tensors, 204 | std::vector& alpha_offsets) 205 | { 206 | for (int i = 0; i < full; ++i) 207 | { 208 | for (auto const& fn : full_fns) 209 | { 210 | fn(tensors, alpha_offsets); 211 | } 212 | tensor_advancer(tensors, 1); 213 | alpha_offsets_adjuster(alpha_offsets, 1); 214 | } 215 | 216 | for (auto const& fn : tail_fns) 217 | { 218 | fn(tensors, alpha_offsets); 219 | } 220 | 221 | tensor_advancer(tensors, -full); 222 | alpha_offsets_adjuster(alpha_offsets, -full); 223 | }, 224 | report}; 225 | } 226 | }; 227 | 228 | template 229 | node_ptr 230 | make_for_loop_node(std::string var, int delta, 231 | std::vector> const& 
children) 232 | { 233 | return node_ptr( 234 | new for_loop_node(var, delta, children)); 235 | } 236 | 237 | } // namespace loop_tree 238 | } // namespace dabun 239 | -------------------------------------------------------------------------------- /include/dabun/loop_tree/node.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/code_generator/aot_fn.hpp" 9 | #include "dabun/configuration.hpp" 10 | #include "dabun/isa.hpp" 11 | #include "dabun/loop_nest.hpp" 12 | #include "dabun/loop_tree/report.hpp" 13 | #include "dabun/loop_tree/types.hpp" 14 | #include "dabun/loop_tree/utility.hpp" 15 | #include "dabun/utility/log.hpp" 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | namespace dabun 22 | { 23 | namespace loop_tree 24 | { 25 | 26 | enum class node_kind 27 | { 28 | for_loop, 29 | compute, 30 | transpose, 31 | compiled_loop_nest, 32 | compiled_transpose 33 | }; 34 | 35 | inline std::map const node_kind_to_str_map = { 36 | {node_kind::for_loop, "for_loop_node"}, 37 | {node_kind::compute, "compute_node"}, 38 | {node_kind::transpose, "transpose_node"}, 39 | {node_kind::compiled_loop_nest, "compiled_loop_nest_node"}, 40 | {node_kind::compiled_transpose, "compiled_transpose_node"}}; 41 | 42 | inline std::string const& node_kind_to_str(node_kind kind) 43 | { 44 | return node_kind_to_str_map.at(kind); 45 | } 46 | 47 | template 48 | class node 49 | { 50 | 51 | private: 52 | node_kind kind_; 53 | std::vector> children_; 54 | 55 | public: 56 | virtual ~node(){}; 57 | 58 | explicit node(node_kind kind) 59 | : kind_(kind) 60 | { 61 | } 62 | 63 | std::vector> const& get_children() const 64 | { 65 | return children_; 66 | } 67 | 68 | void set_children(std::vector> const& 
children) 69 | { 70 | children_ = children; 71 | } 72 | 73 | void set_children(std::vector>&& children) 74 | { 75 | children_ = std::move(children); 76 | } 77 | 78 | node_kind kind() const { return kind_; } 79 | 80 | // tensor positions, dimension sizes, and tensor formulas 81 | virtual std::pair, report_vector> 82 | get_fn(std::map const&, std::map const&, 83 | std::map const&, formulas_map_type const&, 84 | bool) const = 0; 85 | 86 | virtual std::set get_tensors_used() const = 0; 87 | 88 | virtual std::set get_output_tensors() const = 0; 89 | 90 | virtual strides_map_type const& get_tensor_strides() const = 0; 91 | 92 | virtual std::string dump(formulas_map_type const& formulas, 93 | std::map const& sizes, 94 | std::string const& indent) const = 0; 95 | }; 96 | 97 | template 98 | class compute_node; 99 | 100 | template 101 | class compiled_loop_nest_node; 102 | 103 | template 104 | class transpose_node; 105 | 106 | template 107 | class compiled_transpose_node; 108 | 109 | template 110 | class for_loop_node; 111 | 112 | template 113 | class nested_for_loops_node; 114 | 115 | } // namespace loop_tree 116 | } // namespace dabun 117 | -------------------------------------------------------------------------------- /include/dabun/loop_tree/report.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "dabun/common.hpp" 15 | 16 | namespace dabun 17 | { 18 | namespace loop_tree 19 | { 20 | 21 | struct program_node_info 22 | { 23 | std::int64_t const flops = 0; 24 | std::int64_t const effective_flops = 0; 25 | std::string const extra = ""; 26 | 27 | std::string to_string() const 28 | { 29 | return std::string("program - FLOPs: ") + std::to_string(flops) + 30 | ", effective FLOPs: " + std::to_string(effective_flops) + 31 | ", extra\"" + extra + "\""; 32 | } 33 | }; 34 | 35 | struct compute_node_info 36 | { 37 | std::int64_t const flops = 0; 38 | std::int64_t const effective_flops = 0; 39 | std::string const extra = ""; 40 | 41 | std::string to_string() const 42 | { 43 | return std::string("compute node: 2 FLOPs") + ", extra\"" + extra + 44 | "\""; 45 | } 46 | }; 47 | 48 | struct compiled_loop_nest_node_info 49 | { 50 | std::int64_t const flops = 0; 51 | std::int64_t const effective_flops = 0; 52 | std::string const asm_dump = ""; 53 | 54 | access_kind A_access_kind; 55 | access_kind B_access_kind; 56 | access_kind C_access_kind; 57 | 58 | std::pair register_blocking_info; 59 | 60 | std::string const extra = ""; 61 | 62 | std::string to_string() const 63 | { 64 | return std::string("compiled loop_nest - FLOPs: ") + 65 | std::to_string(flops) + 66 | ", effective FLOPs: " + std::to_string(effective_flops) + 67 | ", A access: " + dabun::to_string(A_access_kind) + 68 | ", B access: " + dabun::to_string(B_access_kind) + 69 | ", C access: " + dabun::to_string(C_access_kind) + 70 | ", register blocking: " + 71 | std::to_string(register_blocking_info.first) + ":" + 72 | std::to_string(register_blocking_info.second) + ", extra\"" + 73 | extra + "\""; 74 | } 75 | }; 76 | 77 | struct transpose_node_info 78 | { 79 | std::int64_t const flops = 0; 80 | std::int64_t const effective_flops = 0; 81 | std::string const extra = ""; 82 | 83 | std::string to_string() const 84 | 
{ 85 | return std::string("transpose_node") + ", extra\"" + extra + "\""; 86 | } 87 | }; 88 | 89 | struct compiled_transpose_node_info 90 | { 91 | std::int64_t const flops = 0; 92 | std::int64_t const effective_flops = 0; 93 | std::string const asm_dump = ""; 94 | std::string const extra = ""; 95 | 96 | std::string to_string() const 97 | { 98 | return std::string("compiled_transpose_node") + ", extra\"" + extra + 99 | "\""; 100 | } 101 | }; 102 | 103 | struct for_loop_node_info 104 | { 105 | std::int64_t const flops = 0; 106 | std::int64_t const effective_flops = 0; 107 | 108 | std::string const var_name = ""; 109 | std::int64_t const steps = 0; 110 | std::int64_t const delta = 0; 111 | std::int64_t const size = 0; 112 | 113 | std::string const extra = ""; 114 | 115 | std::string to_string() const 116 | { 117 | return std::string("for_loop - FLOPs: ") + std::to_string(flops) + 118 | ", effective FLOPs: " + std::to_string(effective_flops) + 119 | ", var: " + var_name + ", steps: " + std::to_string(steps) + 120 | ", delta: " + std::to_string(delta) + 121 | ", size: " + std::to_string(size) + ", extra\"" + extra + "\""; 122 | } 123 | }; 124 | 125 | using node_info = 126 | std::variant; 129 | 130 | struct node_report; 131 | 132 | using report_vector = std::vector>; 133 | 134 | struct node_report 135 | { 136 | node_info info; 137 | report_vector children; 138 | 139 | node_report(node_info i) 140 | : info(i) 141 | { 142 | } 143 | 144 | node_report(node_info i, report_vector&& c) 145 | : info(i) 146 | , children(std::move(c)) 147 | { 148 | } 149 | }; 150 | 151 | inline void print_report_helper(std::ostringstream& oss, 152 | report_vector const& report, int indent = 0) 153 | { 154 | for (auto const& r : report) 155 | { 156 | std::visit( 157 | [&](auto const& i) 158 | { oss << std::string(indent, '|') << i.to_string() << '\n'; }, 159 | r->info); 160 | print_report_helper(oss, r->children, indent + 2); 161 | } 162 | } 163 | 164 | inline std::string print_report(report_vector 
const& report, int indent) 165 | { 166 | std::ostringstream oss; 167 | print_report_helper(oss, report, indent); 168 | return oss.str(); 169 | } 170 | 171 | inline std::string print_report(std::shared_ptr const& node, 172 | int indent = 0) 173 | { 174 | std::ostringstream oss; 175 | std::visit([&](auto const& i) 176 | { oss << std::string(indent, '|') << i.to_string() << '\n'; }, 177 | node->info); 178 | print_report_helper(oss, node->children, indent + 2); 179 | return oss.str(); 180 | } 181 | 182 | } // namespace loop_tree 183 | } // namespace dabun 184 | -------------------------------------------------------------------------------- /include/dabun/loop_tree/transpose_node.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/loop_tree/node.hpp" 9 | 10 | namespace dabun 11 | { 12 | namespace loop_tree 13 | { 14 | 15 | template 16 | class transpose_node : public node 17 | { 18 | 19 | private: 20 | using super_type = node; 21 | 22 | std::string input; 23 | std::string output; 24 | strides_map_type strides; 25 | std::optional unroll_limit; 26 | 27 | public: 28 | std::string dump(formulas_map_type const& /* formulas */, 29 | std::map const& /* sizes */, 30 | std::string const& indent) const override 31 | { 32 | std::ostringstream ss; 33 | ss << indent << "Interpreted transpose" << std::endl; 34 | ss << indent << "Input: " << input << std::endl; 35 | ss << indent << "Output: " << output << std::endl; 36 | ss << utility::dump_strides(strides, indent); 37 | return ss.str(); 38 | } 39 | 40 | public: 41 | transpose_node(std::string const& input, std::string const& output, 42 | strides_map_type const& strides, 43 | std::optional unroll_limit = std::nullopt) 44 | : 
super_type(node_kind::transpose) 45 | , input(input) 46 | , output(output) 47 | , strides(strides) 48 | , unroll_limit(unroll_limit) 49 | { 50 | } 51 | 52 | std::string const& get_input() const { return input; } 53 | 54 | std::string const& get_output() const { return output; } 55 | 56 | std::optional get_unroll_limit() const { return unroll_limit; } 57 | 58 | std::set get_tensors_used() const override 59 | { 60 | return {input, output}; 61 | } 62 | 63 | std::set get_output_tensors() const override 64 | { 65 | return {output}; 66 | } 67 | 68 | strides_map_type const& get_tensor_strides() const override 69 | { 70 | return strides; 71 | } 72 | 73 | std::pair, report_vector> 74 | get_fn(std::map const& tensors_idx, 75 | std::map const&, std::map const&, 76 | formulas_map_type const&, bool) const override 77 | { 78 | report_vector report = { 79 | std::make_shared(transpose_node_info{})}; 80 | 81 | return {[input = this->input, output = this->output, 82 | input_idx = tensors_idx.at(input), 83 | output_idx = tensors_idx.at(output)]( 84 | std::vector& tensors, std::vector&) 85 | { 86 | strong_assert(tensors[input_idx]); 87 | strong_assert(tensors[output_idx]); 88 | 89 | Arithmetic* A = tensors[input_idx]; 90 | Arithmetic* C = tensors[output_idx]; 91 | C[0] = A[0]; 92 | }, 93 | report}; 94 | } 95 | }; 96 | 97 | template 98 | node_ptr 99 | make_transpose_node(std::string const& input, std::string const& output, 100 | strides_map_type const& strides, 101 | std::optional unroll_limit = std::nullopt) 102 | { 103 | return node_ptr(new transpose_node( 104 | input, output, strides, unroll_limit)); 105 | } 106 | 107 | } // namespace loop_tree 108 | } // namespace dabun 109 | -------------------------------------------------------------------------------- /include/dabun/loop_tree/types.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 
2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/elementwise_operation.hpp" 9 | #include "dabun/isa.hpp" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace dabun 18 | { 19 | namespace loop_tree 20 | { 21 | 22 | // forward declaration 23 | template 24 | class node; 25 | 26 | // Type aliases for readability 27 | // void (vector of tensors, vector of alpha offsets) 28 | template 29 | using loop_tree_fn_type = 30 | std::function&, std::vector&)>; 31 | 32 | // map from name to map of strides 33 | using strides_map_type = std::map>; 34 | 35 | // map from name to set of dimensions 36 | using formulas_map_type = std::map>; 37 | 38 | template 39 | using elementwise_op_ptr = std::shared_ptr>; 40 | 41 | template 42 | using node_ptr = std::shared_ptr>; 43 | 44 | // Note: add classes from dabun/arithmetic_operations.hpp 45 | // as needed 46 | enum class arithmetic_op_kind 47 | { 48 | plus, 49 | multiplies, 50 | max, 51 | min 52 | }; 53 | 54 | } // namespace loop_tree 55 | } // namespace dabun 56 | -------------------------------------------------------------------------------- /include/dabun/loop_tree/utility.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "dabun/arithmetic_operation.hpp" 15 | #include "dabun/isa.hpp" 16 | #include "dabun/loop_tree/types.hpp" 17 | 18 | namespace dabun 19 | { 20 | namespace loop_tree 21 | { 22 | namespace utility 23 | { 24 | 25 | inline std::shared_ptr 26 | get_operation_pair(arithmetic_op_kind plus_op, arithmetic_op_kind multiplies_op) 27 | { 28 | 29 | std::map, 30 | std::shared_ptr> 31 | #ifndef DABUN_ARCH_AARCH64 32 | op_map = { 33 | {{arithmetic_op_kind::plus, arithmetic_op_kind::multiplies}, 34 | std::make_shared< 35 | operation_pair>()}, 36 | {{arithmetic_op_kind::max, arithmetic_op_kind::multiplies}, 37 | std::make_shared>()}, 38 | {{arithmetic_op_kind::min, arithmetic_op_kind::multiplies}, 39 | std::make_shared>()}, 40 | {{arithmetic_op_kind::max, arithmetic_op_kind::plus}, 41 | std::make_shared>()}}; 42 | #else 43 | op_map = {{{arithmetic_op_kind::plus, arithmetic_op_kind::multiplies}, 44 | std::make_shared()}}; 45 | #endif 46 | 47 | return op_map.at({plus_op, multiplies_op}); 48 | } 49 | 50 | inline std::string dump_strides(strides_map_type const& strides, 51 | std::string const& indent) 52 | { 53 | std::ostringstream ss; 54 | ss << indent << "Strides: " << std::endl; 55 | for (auto const& tensor_strides : strides) 56 | { 57 | // tensor 58 | ss << indent << " " << tensor_strides.first << ": "; 59 | // strides 60 | for (auto const& entry : tensor_strides.second) 61 | { 62 | ss << entry.first << ":" << entry.second << " "; 63 | } 64 | ss << std::endl; 65 | } 66 | return ss.str(); 67 | } 68 | 69 | inline std::string dump_formula(formulas_map_type const& formulas, 70 | std::string const& indent) 71 | { 72 | std::ostringstream ss; 73 | ss << indent << "Formulas: " << std::endl; 74 | for (auto const& tensor_formula : formulas) 75 | { 76 | // tensor 77 | ss << indent << " " << tensor_formula.first << ": "; 78 | // formula 79 | for (auto const& entry : 
tensor_formula.second) 80 | { 81 | ss << entry << " "; 82 | } 83 | ss << std::endl; 84 | } 85 | return ss.str(); 86 | } 87 | 88 | inline std::string dump_tensors(std::vector const& tensors, 89 | std::string const& indent) 90 | { 91 | std::ostringstream ss; 92 | for (auto const& i : tensors) 93 | { 94 | ss << indent << i << " "; 95 | } 96 | ss << std::endl; 97 | return ss.str(); 98 | } 99 | 100 | inline std::string 101 | dump_order(std::vector> const& order, 102 | std::string const& indent) 103 | { 104 | std::ostringstream ss; 105 | ss << indent << "Order: "; 106 | for (auto const& o : order) 107 | { 108 | ss << o.first << ":" << o.second << " "; 109 | } 110 | ss << std::endl; 111 | return ss.str(); 112 | } 113 | 114 | inline std::string dump_sizes(std::map const& sizes, 115 | std::string const& indent) 116 | { 117 | std::ostringstream ss; 118 | ss << indent << "Sizes: "; 119 | for (auto const& s : sizes) 120 | { 121 | ss << s.first << ":" << s.second << " "; 122 | } 123 | ss << std::endl; 124 | return ss.str(); 125 | } 126 | 127 | } // namespace utility 128 | } // namespace loop_tree 129 | } // namespace dabun 130 | -------------------------------------------------------------------------------- /include/dabun/math.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #pragma once 7 | 8 | #include "dabun/core.hpp" 9 | 10 | #include 11 | #include 12 | 13 | namespace dabun 14 | { 15 | 16 | template 17 | constexpr inline T ceil_div(T a, identity_type_t b) noexcept 18 | { 19 | return (a + b - 1) / b; 20 | } 21 | 22 | template 23 | constexpr inline T round_up(T a, identity_type_t b) noexcept 24 | { 25 | return ceil_div(a, b) * b; 26 | } 27 | 28 | template 29 | constexpr inline std::tuple full_rest(T total, 30 | identity_type_t delta) noexcept 31 | { 32 | return {total / delta, total % delta}; 33 | } 34 | 35 | // Equals to the number of iterations of the loop 36 | // for (T i = from; i < to; i += stride) 37 | // Assumes from <= to 38 | template 39 | constexpr inline auto num_iterations(T from, identity_type_t to, 40 | identity_type_t stride) noexcept 41 | -> std::enable_if_t, T> 42 | { 43 | return ceil_div(to - from, stride); 44 | } 45 | 46 | } // namespace dabun 47 | -------------------------------------------------------------------------------- /include/dabun/numeric.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include 9 | 10 | namespace dabun 11 | { 12 | 13 | using sysml::fp16_t; 14 | using sysml::fp32_t; 15 | using sysml::fp64_t; 16 | 17 | using sysml::ivec; 18 | 19 | using namespace sysml::ivec_specializations; 20 | 21 | } // namespace dabun 22 | -------------------------------------------------------------------------------- /include/dabun/one_constant.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 
2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | namespace dabun 9 | { 10 | 11 | template 12 | Float const one_actual_constant = static_cast(1); 13 | 14 | template 15 | Float const* const one_constant = &one_actual_constant; 16 | 17 | } // namespace dabun 18 | -------------------------------------------------------------------------------- /include/dabun/peak_gflops.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/isa.hpp" 9 | 10 | #if defined(DABUN_ARCH_AARCH64) 11 | # include "dabun/arm/peak_gflops.hpp" 12 | #else 13 | # include "dabun/x86/peak_gflops.hpp" 14 | #endif 15 | 16 | namespace dabun::impl 17 | { 18 | 19 | } // namespace dabun::impl 20 | 21 | namespace dabun 22 | { 23 | 24 | namespace detail 25 | { 26 | 27 | template 28 | struct peak_gflops_impl 29 | { 30 | static double peak_gflops(int iterations = 1000000); 31 | static double measure_peak_gflops(double secs, 32 | int max_iterations = 1000000); 33 | }; 34 | 35 | #if defined(DABUN_REQUIES_TEMPLATE_DEFINITION) 36 | 37 | template 38 | double peak_gflops_impl::peak_gflops(int iterations) 39 | { 40 | auto measurement = 41 | DABUN_ISA_NAMESPACE ::bench_gflops::do_bench(iterations); 42 | return measurement.first / measurement.second; 43 | } 44 | 45 | template 46 | double peak_gflops_impl::measure_peak_gflops(double secs, 47 | int max_iterations) 48 | { 49 | int cur_it = 1; 50 | auto measurement = 51 | DABUN_ISA_NAMESPACE ::bench_gflops::do_bench(cur_it); 52 | 53 | while (measurement.first < secs && cur_it <= max_iterations) 54 | { 55 | cur_it *= 2; 56 | measurement = 57 | DABUN_ISA_NAMESPACE 
::bench_gflops::do_bench(cur_it); 58 | } 59 | 60 | return measurement.first / measurement.second; 61 | } 62 | 63 | #endif 64 | 65 | #if defined(DABUN_MAYBE_EXTN_TPL_INSTNTON) 66 | 67 | # if defined(DABUN_ARCH_AARCH64) 68 | 69 | DABUN_MAYBE_EXTN_TPL_INSTNTON struct peak_gflops_impl; 70 | DABUN_MAYBE_EXTN_TPL_INSTNTON struct peak_gflops_impl; 71 | 72 | # else 73 | 74 | DABUN_MAYBE_EXTN_TPL_INSTNTON struct peak_gflops_impl; 75 | DABUN_MAYBE_EXTN_TPL_INSTNTON struct peak_gflops_impl; 76 | DABUN_MAYBE_EXTN_TPL_INSTNTON struct peak_gflops_impl; 77 | 78 | # endif 79 | 80 | #endif 81 | 82 | } // namespace detail 83 | 84 | template 85 | double peak_gflops(int iterations = 1000000) 86 | { 87 | return detail::peak_gflops_impl::peak_gflops(iterations); 88 | } 89 | 90 | template 91 | double measure_peak_gflops(double secs, int max_iterations = 1000000) 92 | { 93 | return detail::peak_gflops_impl::measure_peak_gflops(secs, 94 | max_iterations); 95 | } 96 | 97 | } // namespace dabun 98 | -------------------------------------------------------------------------------- /include/dabun/predef.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #pragma once 7 | 8 | #include 9 | 10 | // Inspired by boost predef 11 | 12 | #define DABUN_VERSION_NUMBER(major, minor, patch) \ 13 | SYSML_VERSION_NUMBER(major, minor, patch) 14 | 15 | #if defined(__clang__) 16 | 17 | # define DABUN_COMP_CLANG \ 18 | DABUN_VERSION_NUMBER(__clang_major__, __clang_minor__, \ 19 | __clang_patchlevel__) 20 | 21 | #elif defined(__GNUC__) 22 | 23 | # define DABUN_COMP_GNUC \ 24 | DABUN_VERSION_NUMBER(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) 25 | 26 | #else 27 | 28 | # error "Compiler not supported" 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /include/dabun/random_vector.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/aligned_vector.hpp" 9 | #include "dabun/numeric.hpp" 10 | #include "sysml/random.hpp" 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | namespace dabun 17 | { 18 | 19 | namespace detail 20 | { 21 | 22 | template 23 | struct random_initalizer_helper 24 | { 25 | 26 | template 27 | static auto get_random_vector(unsigned size, unsigned extra_elements) 28 | -> std::enable_if_t || 29 | std::is_same_v, 30 | aligned_vector> 31 | { 32 | aligned_vector res(size + extra_elements); 33 | 34 | std::random_device rd; 35 | std::mt19937 gen(0); // rd()); 36 | 37 | sysml::uniform_distribution dis(-1.0, 1.0); 38 | 39 | for (auto& f : res) 40 | { 41 | f = dis(gen); 42 | } 43 | 44 | return res; 45 | } 46 | 47 | template 48 | static auto get_random_vector(unsigned size, unsigned extra_elements) 49 | -> std::enable_if_t, 50 | aligned_vector> 51 | { 52 | aligned_vector res(size + extra_elements); 53 | 54 | std::random_device rd; 55 | std::mt19937 gen(0); // rd()); 56 | 57 
| sysml::uniform_distribution dis( 58 | std::numeric_limits::min(), 59 | std::numeric_limits::max()); 60 | 61 | for (auto& f : res) 62 | { 63 | f = dis(gen); 64 | } 65 | 66 | return res; 67 | } 68 | 69 | template 70 | static auto get_zero_vector(unsigned size, unsigned extra_elements) 71 | -> std::enable_if_t || 72 | std::is_same_v, 73 | aligned_vector> 74 | { 75 | aligned_vector res(size + extra_elements); 76 | return res; 77 | } 78 | 79 | template 80 | static auto get_zero_vector(unsigned size, unsigned extra_elements) 81 | -> std::enable_if_t, 82 | aligned_vector> 83 | { 84 | aligned_vector res(size + extra_elements); 85 | return res; 86 | } 87 | }; 88 | 89 | } // namespace detail 90 | 91 | template 92 | decltype(auto) get_random_vector(unsigned size, unsigned extra_elements = 16) 93 | { 94 | return detail::random_initalizer_helper::get_random_vector( 95 | size, extra_elements); 96 | } 97 | 98 | template 99 | decltype(auto) get_zero_vector(unsigned size, unsigned extra_elements = 16) 100 | { 101 | return detail::random_initalizer_helper::get_zero_vector(size, 102 | extra_elements); 103 | } 104 | 105 | template 106 | auto aligned_vector_cast(aligned_vector const& from) 107 | -> std::enable_if_t, aligned_vector> 108 | { 109 | aligned_vector ret(from.size()); 110 | 111 | for (std::size_t i = 0; i < from.size(); ++i) 112 | { 113 | ret[i] = static_cast(from[i]); 114 | } 115 | 116 | return ret; 117 | } 118 | 119 | } // namespace dabun 120 | -------------------------------------------------------------------------------- /include/dabun/tensillica/dl_compiled_fn.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | 6 | #include once 7 | 8 | namespace dabun 9 | { 10 | namespace tensillica 11 | { 12 | 13 | template 14 | class unique_dl_compiled_fn; 15 | 16 | template 17 | class shared_dl_compiled_fn; 18 | 19 | template 20 | class weak_dl_compiled_fn; 21 | 22 | template 23 | class unique_dl_compiled_fn 24 | { 25 | public: 26 | using function_pointer_type = ReturnType (*)(Args...); 27 | }; 28 | 29 | } // namespace tensillica 30 | } // namespace dabun 31 | -------------------------------------------------------------------------------- /include/dabun/tensillica/multi_vmm.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/numeric.hpp" 9 | 10 | #include 11 | #include 12 | 13 | namespace dabun 14 | { 15 | namespace tensillica 16 | { 17 | 18 | // The main usage of the multi_vmm class is to increase the amount of 19 | // independent operations when accumulating to a single vector 20 | // register. This is accomplished by using multiple vector registers 21 | // which are reduced to a single one at the end. Each of the size_ 22 | // registers is independent of all the other ones. 
23 | 24 | template 25 | class multi_vmm 26 | { 27 | private: 28 | int size_ = 0; 29 | int first_ = 0; 30 | int current_ = 0; 31 | int vlen_ = 4; 32 | int original_size_ = 0; 33 | 34 | public: 35 | multi_vmm() {} 36 | 37 | multi_vmm(int s, int f) 38 | : size_(s) 39 | , first_(f) 40 | , current_(0) 41 | , original_size_(s) 42 | { 43 | assert(s > 0); 44 | } 45 | 46 | void reset() 47 | { 48 | size_ = original_size_; 49 | current_ = 0; 50 | } 51 | 52 | multi_vmm(multi_vmm const&) = delete; 53 | multi_vmm& operator=(multi_vmm const&) = delete; 54 | 55 | multi_vmm(multi_vmm&& o) { *this = std::move(o); } 56 | 57 | multi_vmm& operator=(multi_vmm&& o) 58 | { 59 | assert(o.size_ > 0); 60 | size_ = o.size_; 61 | first_ = o.first_; 62 | current_ = o.current_; 63 | original_size_ = o.original_size_; 64 | return *this; 65 | } 66 | 67 | int size() const { return size_; } 68 | 69 | VReg operator++(int) 70 | { 71 | int c = current_; 72 | current_ = (current_ + 1) % size_; 73 | return VReg(first_ + c); 74 | } 75 | 76 | VReg operator[](int s) const 77 | { 78 | assert(s < size_); 79 | return VReg(first_ + s); 80 | } 81 | 82 | VReg operator++() 83 | { 84 | current_ = (current_ + 1) % size_; 85 | return VReg(first_ + current_); 86 | } 87 | 88 | VReg current() const { return VReg(first_ + current_); } 89 | 90 | VReg first() const { return VReg(first_); } 91 | 92 | template 93 | void half(Code_Generator& code_generator) 94 | { 95 | int h = (size_ + 1) / 2; 96 | for (int i = 0; i + h < size_; ++i) 97 | { 98 | code_generator.fadd(VReg(first_ + i).s4, VReg(first_ + i).s4, 99 | VReg(first_ + i + h).s4); 100 | } 101 | size_ = h; 102 | current_ = 0; 103 | } 104 | 105 | template 106 | void reduce(Code_Generator& code_generator) 107 | { 108 | while (size_ > 1) 109 | { 110 | half(code_generator); 111 | } 112 | } 113 | 114 | template 115 | void full_reduce(Code_Generator& code_generator, int mask = 4, 116 | int zero_vector = 0) 117 | { 118 | reduce(code_generator); 119 | assert(size_ == 1); 120 | 
121 | { 122 | if (mask == 3) 123 | { 124 | // x4/w4 is zero reg by convention in the loop_nest.hpp 125 | code_generator.ins(VReg(first_).s4[3], Reg32(zero_vector)); 126 | } 127 | if (mask > 2) 128 | { 129 | code_generator.faddp(VReg(first_).s4, VReg(first_).s4, 130 | VReg(first_).s4); 131 | } 132 | if (mask > 1) 133 | { 134 | code_generator.faddp(SReg(first_), VReg(first_).s2); 135 | } 136 | } 137 | } 138 | }; 139 | 140 | } // namespace tensillica 141 | } // namespace dabun 142 | -------------------------------------------------------------------------------- /include/dabun/tensillica/peak_gflops.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include 9 | 10 | #include "dabun/tensillica/cpp_intrinsics_code_generator.hpp" 11 | 12 | // #include "dabun/isa.hpp" 13 | // #ifdef DABUN_ARCH_AARCH64 14 | 15 | // # include "dabun/code_generator/code_generator.hpp" 16 | // # include "dabun/isa.hpp" 17 | // # include "dabun/math.hpp" 18 | // # include "dabun/numeric.hpp" 19 | 20 | // # include 21 | 22 | #include 23 | 24 | namespace dabun 25 | { 26 | namespace tensillica 27 | { 28 | 29 | struct peak_gflops : cpp_intrinsics_code_generator 30 | { 31 | peak_gflops() 32 | { 33 | multi_vmm mvmm(8, 0); 34 | mvmm.full_reduce(*this, 4, 0); 35 | 36 | ldp(vmm0.s4, vmm1.s4, pre_ptr(x0, 4)); 37 | 38 | mov(vmm1.s4, vmm0.s4); 39 | fmla(vmm1.s4, vmm1.s4, vmm0.s[1]); 40 | 41 | 42 | // ins(vmm0.s[1], w2); 43 | stp(vmm0.s4, vmm1.s4, ptr(x1)); 44 | 45 | custom_string("return x0 + x1 + x2;"); 46 | } 47 | }; 48 | 49 | struct peak_gflopsw : cpp_intrinsics_code_generator 50 | { 51 | peak_gflopsw() 52 | { 53 | multi_vmm mvmm(8, 0); 54 | mvmm.full_reduce(*this, 4, 0); 55 | custom_string("return x0 * x1 * x2;"); 56 | } 
57 | }; 58 | 59 | 60 | 61 | } // namespace tensillica 62 | } // namespace dabun 63 | -------------------------------------------------------------------------------- /include/dabun/transposer.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/isa.hpp" 9 | 10 | #if defined(DABUN_ARCH_AARCH64) 11 | #include "dabun/arm/transposer.hpp" 12 | #else 13 | #include "dabun/x86/transposer.hpp" 14 | #endif 15 | 16 | namespace dabun 17 | { 18 | 19 | using DABUN_ISA_NAMESPACE ::transposer_code_generator; 20 | 21 | #if defined(DABUN_ARCH_AARCH64) 22 | 23 | template 24 | using transposer_compiler = 25 | transposer_code_generator, Arithmetic>; 26 | 27 | #else 28 | 29 | template 30 | using transposer_compiler = 31 | transposer_code_generator>; 32 | 33 | #endif 34 | 35 | } // namespace dabun 36 | -------------------------------------------------------------------------------- /include/dabun/utility/log.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 

#pragma once

#include <fstream>
#include <iostream>

namespace dabun
{

// Compile-time logging switches: both are enabled unless NDEBUG is defined.
#ifndef NDEBUG
static constexpr bool DEBUG = true;
static constexpr bool INFO  = true;
#else
static constexpr bool DEBUG = false;
static constexpr bool INFO  = false;
#endif

#if defined(DABUN_LOG_TO_FILE)

// Stream-like logger that appends to the file "dabun_loop_nest.log".
//
// Usage: LN_LOG(DEBUG) << "message" << value;
// Nothing is written when the logger was constructed with `false`.
class LN_LOG
{
private:
    bool print_ = false;

    // The one log stream shared by every operator<< instantiation.
    // It lives in a non-template function on purpose: a function-local
    // static declared inside the operator<< template would exist once per
    // streamed type, and each copy would re-open (and truncate) the file.
    static std::ofstream& log_file()
    {
        static std::ofstream fout("dabun_loop_nest.log");
        return fout;
    }

public:
    // p - whether values streamed into this logger are actually written.
    explicit LN_LOG(bool p)
        : print_(p)
    {
    }

    // Writes v to the log file when printing is enabled.
    // Returns *this so that calls can be chained.
    template <class T>
    LN_LOG const& operator<<(T&& v) const
    {
        // The stream is obtained (and therefore the file opened on first
        // use) even when printing is disabled.
        std::ofstream& fout = log_file();
        if (print_)
        {
            fout << v;
        }
        return *this;
    }
};

#else

// Stream-like logger that writes to std::cout.
//
// Usage: LN_LOG(DEBUG) << "message" << value;
// Nothing is written when the logger was constructed with `false`.
class LN_LOG
{
private:
    bool print_ = false;

public:
    // p - whether values streamed into this logger are actually written.
    explicit LN_LOG(bool p)
        : print_(p)
    {
    }

    // Writes v to std::cout when printing is enabled.
    // Returns *this so that calls can be chained.
    template <class T>
    LN_LOG const& operator<<(T&& v) const
    {
        if (print_)
        {
            std::cout << v;
        }
        return *this;
    }
};

#endif

} // namespace dabun

// --------------------------------------------------------------------------------
// /include/dabun/utility/most_frequent_queue.hpp:
// --------------------------------------------------------------------------------
// Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <cassert>
#include <cstddef>
#include <map>
#include <set>
#include <utility>

namespace dabun::utility
{

// A queue that holds values of type T and a count for each value. We
// can query the most abundant value, remove it, as well as increase
// or decrease instances of each value by 1.
//
// Values live in one of two count-indexed maps:
//   - sorted_  : values that top()/pop()/skip() operate on;
//   - ignored_ : values that were passed over with skip(); they keep
//                their count (and can still be dec()-ed) but are never
//                returned by top() again.
//
// The asserts enforce a two-phase usage: inc() may only be called before
// the first pop()/skip()/get_top_then_pop(), dec() only after it.
template <class T>
struct most_frequent_queue
{
private:
    // count -> set of values that currently have exactly that count.
    // Buckets are erased as soon as they become empty.
    using map_t = std::map<std::size_t, std::set<T>>;

    map_t                  sorted_;
    map_t                  ignored_;
    std::map<T, std::size_t> counts_; // value -> its current count

    bool was_read_ = false;

    // Removes v from whichever map holds it and forgets its count.
    // Returns the map it was found in together with the count it had, so
    // that the caller can re-insert it with an adjusted count.
    // Precondition: v is present in the queue.
    std::pair<map_t*, std::size_t> remove_existing(T const& v)
    {
        auto const count_it = counts_.find(v);
        assert(count_it != counts_.end());

        std::size_t const s = count_it->second;

        // v is either in the active map or, if it was skipped, in ignored_.
        auto const sorted_it = sorted_.find(s);
        bool const in_sorted =
            sorted_it != sorted_.end() && sorted_it->second.count(v) > 0;
        map_t* const where = in_sorted ? &sorted_ : &ignored_;

        auto const bucket_it = where->find(s);
        assert(bucket_it != where->end());
        assert(bucket_it->second.count(v) > 0);

        bucket_it->second.erase(v);
        if (bucket_it->second.empty())
        {
            where->erase(bucket_it);
        }

        counts_.erase(count_it);
        return {where, s};
    }

    // Records that v has count s and files it under that count in *where.
    void add_with_count(map_t* where, T const& v, std::size_t s)
    {
        counts_[v] = s;
        assert(where->count(s) == 0 || (*where)[s].count(v) == 0);
        (*where)[s].insert(v);
    }

    void pop_or_skip(map_t* /* where */) {}

public:
    // Number of distinct counts among the active (non-skipped) values.
    // This is NOT the number of values; it is zero exactly when there is
    // no active value left.
    std::size_t size() const { return sorted_.size(); }

    // Returns (a copy of) an active value with the highest count; among
    // values sharing that count, the first one in the bucket's ordering.
    // Precondition: size() > 0.
    T top() const
    {
        assert(sorted_.size());
        return *(sorted_.crbegin()->second.cbegin());
    }

    // Returns top() and removes it from the queue.
    T get_top_then_pop()
    {
        was_read_ = true;
        T ret     = top();
        pop();
        return ret;
    }

    // Removes the value that top() would return, together with its count.
    // Precondition: size() > 0.
    void pop()
    {
        was_read_ = true;
        assert(sorted_.size() > 0);
        auto& slot = sorted_.rbegin()->second;

        assert(slot.size() > 0);
        auto const& v = *slot.begin();

        // Erase the count first: v refers to the set element, which is
        // only valid until that element is erased below.
        assert(counts_.count(v) > 0);
        counts_.erase(v);

        slot.erase(slot.begin());
        if (slot.size() == 0)
        {
            sorted_.erase(sorted_.rbegin()->first);
        }
    }

    // Moves the value that top() would return into the ignored set. The
    // value keeps its count but is no longer visible to top()/pop().
    // Precondition: size() > 0.
    void skip()
    {
        was_read_ = true;
        assert(sorted_.size() > 0);

        auto& slot = sorted_.rbegin()->second;

        assert(slot.size() > 0);

        // Take a copy: the element (and possibly the whole bucket) is
        // erased below, so a reference into the set would dangle by the
        // time the value is inserted into ignored_.
        T v = *slot.begin();

        auto const count_it = counts_.find(v);
        assert(count_it != counts_.end());
        std::size_t const count = count_it->second;

        slot.erase(slot.begin());
        if (slot.size() == 0)
        {
            sorted_.erase(sorted_.rbegin()->first);
        }

        ignored_[count].insert(std::move(v));
    }

    // Increases the count of v by one, adding v with count 1 if it is new.
    // May only be used before the queue has been read from.
    void inc(T const& v)
    {
        assert(!was_read_);

        if (counts_.count(v))
        {
            auto s = remove_existing(v);
            add_with_count(s.first, v, s.second + 1);
        }
        else
        {
            add_with_count(&sorted_, v, 1);
        }
    }

    // Decreases the count of v by one; v is dropped entirely once its
    // count reaches zero. A skipped value stays in the ignored set.
    // May only be used after the queue has been read from, and only for a
    // value that is still present.
    void dec(T const& v)
    {
        assert(was_read_);
        assert(counts_.count(v));

        auto s = remove_existing(v);
        if (--s.second > 0)
        {
            add_with_count(s.first, v, s.second);
        }
    }
};

} // namespace dabun::utility

// --------------------------------------------------------------------------------
// /include/dabun/utility/tmp_file_name.hpp:
// --------------------------------------------------------------------------------
// Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace dabun::utility 14 | { 15 | 16 | inline std::string get_temporary_file_name(std::string const& suffix, 17 | std::string const& dir = "/tmp") 18 | { 19 | static std::atomic counter(0); 20 | 21 | std::ostringstream oss; 22 | oss << dir << "/" << std::this_thread::get_id() << "_" << (counter++) 23 | << suffix; 24 | return oss.str(); 25 | } 26 | 27 | } // namespace dabun::utility 28 | -------------------------------------------------------------------------------- /include/dabun/x86/configuration.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/isa.hpp" 9 | #ifdef DABUN_ARCH_X86_64 10 | 11 | namespace dabun 12 | { 13 | namespace x86 14 | { 15 | 16 | class OptimizationConfiguration 17 | { 18 | private: 19 | bool delay_innermost_operations_; 20 | bool split_vector_registers_; 21 | bool use_address_packer_; 22 | 23 | public: 24 | OptimizationConfiguration(bool delay_innermost_operations, 25 | bool split_vector_registers, 26 | bool use_address_packer) 27 | : delay_innermost_operations_(delay_innermost_operations) 28 | , split_vector_registers_(split_vector_registers) 29 | , use_address_packer_(use_address_packer) 30 | 31 | { 32 | } 33 | 34 | OptimizationConfiguration() 35 | : delay_innermost_operations_(true) 36 | , split_vector_registers_(true) 37 | , use_address_packer_(true) 38 | { 39 | } 40 | 41 | bool delay_innermost_operations() const 42 | { 43 | return delay_innermost_operations_; 44 | } 45 | 46 | bool split_vector_registers() const { return split_vector_registers_; } 47 | 48 | bool use_address_packer() const { return use_address_packer_; } 49 | }; 50 | 51 | inline 
OptimizationConfiguration all_optims(true, true, true); 52 | 53 | // technically no optimizations beyond output tensor register blocking 54 | inline OptimizationConfiguration no_optims(false, false, false); 55 | 56 | } // namespace x86 57 | } // namespace dabun 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /include/dabun/x86/denormals.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/isa.hpp" 9 | #ifdef DABUN_ARCH_X86_64 10 | 11 | // Mosty from 12 | // https://en.wikipedia.org/wiki/Denormal_number#Disabling_denormal_floats_at_the_code_level 13 | 14 | #include 15 | 16 | #define LN_MM_DENORMALS_ZERO_MASK 0x0040 17 | #define LN_MM_DENORMALS_ZERO_ON 0x0040 18 | #define LN_MM_DENORMALS_ZERO_OFF 0x0000 19 | 20 | #define LN_MM_FLUSH_ZERO_MASK 0x8000 21 | #define LN_MM_FLUSH_ZERO_ON 0x8000 22 | #define LN_MM_FLUSH_ZERO_OFF 0x0000 23 | 24 | #define LN_MM_SET_DENORMALS_ZERO_MODE(mode) \ 25 | _mm_setcsr((_mm_getcsr() & ~LN_MM_DENORMALS_ZERO_MASK) | (mode)) 26 | 27 | #define LN_MM_GET_DENORMALS_ZERO_MODE() \ 28 | (_mm_getcsr() & LN_MM_DENORMALS_ZERO_MASK) 29 | 30 | #define LN_MM_SET_FLUSH_ZERO_MODE(mode) \ 31 | _mm_setcsr((_mm_getcsr() & ~LN_MM_FLUSH_ZERO_MASK) | (mode)) 32 | 33 | #define LN_MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & LN_MM_FLUSH_ZERO_MASK) 34 | 35 | namespace dabun::detail 36 | { 37 | class denormals_disabler 38 | { 39 | private: 40 | unsigned int previous_value; 41 | 42 | public: 43 | denormals_disabler() 44 | { 45 | previous_value = _mm_getcsr(); 46 | _mm_setcsr(previous_value | LN_MM_DENORMALS_ZERO_ON | 47 | LN_MM_FLUSH_ZERO_ON); 48 | } 49 | 50 | ~denormals_disabler() { _mm_setcsr(previous_value); } 51 | 52 
| denormals_disabler(denormals_disabler const&) = delete; 53 | denormals_disabler& operator=(denormals_disabler const&) = delete; 54 | }; 55 | 56 | inline denormals_disabler denormals_disabler_instance; 57 | 58 | } // namespace dabun::detail 59 | 60 | #undef LN_MM_DENORMALS_ZERO_MASK 61 | #undef LN_MM_DENORMALS_ZERO_ON 62 | #undef LN_MM_DENORMALS_ZERO_OFF 63 | 64 | #undef LN_MM_SET_DENORMALS_ZERO_MODE 65 | #undef LN_MM_GET_DENORMALS_ZERO_MODE 66 | 67 | #undef LN_MM_FLUSH_ZERO_MASK 68 | #undef LN_MM_FLUSH_ZERO_ON 69 | #undef LN_MM_FLUSH_ZERO_OFF 70 | 71 | #undef LN_MM_SET_FLUSH_ZERO_MODE 72 | #undef LN_MM_GET_FLUSH_ZERO_MODE 73 | 74 | #endif 75 | -------------------------------------------------------------------------------- /include/dabun/x86/multi_vmm.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/isa.hpp" 9 | #ifdef DABUN_ARCH_X86_64 10 | 11 | #include "dabun/x86/arithmetic_operation.hpp" 12 | 13 | #include 14 | 15 | namespace dabun 16 | { 17 | namespace x86 18 | { 19 | 20 | // The main usage of the multi_vmm class is to increase the amount of 21 | // independent operations when accumulating to a single vector 22 | // register. This is accomplished by using multiple vector registers 23 | // which are reduced to a single one at the end. Each of the size_ 24 | // registers is independent of all the other ones. 
25 | 26 | template 27 | class multi_vmm 28 | { 29 | private: 30 | int size_ = 0; 31 | int first_ = 0; 32 | int current_ = 0; 33 | int original_size_ = 0; 34 | int max_touched_ = 0; 35 | 36 | public: 37 | multi_vmm() {} 38 | 39 | multi_vmm(int s, int f) 40 | : size_(s) 41 | , first_(f) 42 | , current_(0) 43 | , original_size_(s) 44 | , max_touched_(0) 45 | { 46 | assert(s > 0); 47 | } 48 | 49 | void reset() 50 | { 51 | size_ = original_size_; 52 | current_ = 0; 53 | } 54 | 55 | multi_vmm(multi_vmm const&) = delete; 56 | multi_vmm& operator=(multi_vmm const&) = delete; 57 | 58 | multi_vmm(multi_vmm&& o) { *this = std::move(o); } 59 | 60 | multi_vmm& operator=(multi_vmm&& o) 61 | { 62 | assert(o.size_ > 0); 63 | size_ = o.size_; 64 | first_ = o.first_; 65 | current_ = o.current_; 66 | original_size_ = o.original_size_; 67 | max_touched_ = o.max_touched_; 68 | return *this; 69 | } 70 | 71 | int size() const { return size_; } 72 | 73 | Vmm operator++(int) 74 | { 75 | int c = current_; 76 | current_ = (current_ + 1) % size_; 77 | max_touched_ = std::max(max_touched_, current_); 78 | return Vmm(first_ + c); 79 | } 80 | 81 | Vmm operator[](int s) const 82 | { 83 | assert(s < size_); 84 | return Vmm(first_ + s); 85 | } 86 | 87 | Vmm operator++() 88 | { 89 | current_ = (current_ + 1) % size_; 90 | return Vmm(first_ + current_); 91 | } 92 | 93 | Vmm current() const { return Vmm(first_ + current_); } 94 | 95 | Vmm first() const { return Vmm(first_); } 96 | 97 | template 98 | void half(Code_Generator& code_generator, 99 | std::shared_ptr op_pair) 100 | { 101 | int h = (size_ + 1) / 2; 102 | for (int i = 0; i + h < size_; ++i) 103 | { 104 | op_pair->plus(code_generator, Vmm(first_ + i), Vmm(first_ + i), 105 | Vmm(first_ + i + h)); 106 | } 107 | size_ = h; 108 | current_ = 0; 109 | } 110 | 111 | template 112 | void reduce(Code_Generator& code_generator, 113 | std::shared_ptr op_pair) 114 | { 115 | // size_ = max_touched_ + 1; 116 | while (size_ > 1) 117 | { 118 | 
half(code_generator, op_pair); 119 | } 120 | } 121 | }; 122 | 123 | } // namespace x86 124 | } // namespace dabun 125 | 126 | #endif 127 | -------------------------------------------------------------------------------- /include/dabun/x86/peak_gflops.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/isa.hpp" 9 | 10 | #ifdef DABUN_ARCH_X86_64 11 | 12 | # include "dabun/code_generator/code_generator.hpp" 13 | # include "dabun/math.hpp" 14 | # include "dabun/x86/xbyak.hpp" 15 | 16 | # include 17 | 18 | # include 19 | 20 | namespace dabun 21 | { 22 | namespace x86 23 | { 24 | 25 | template 26 | struct bench_gflops 27 | { 28 | private: 29 | static_assert(std::is_same_v || std::is_same_v || 30 | std::is_same_v); 31 | static_assert(std::is_same_v); 32 | 33 | using Vmm = 34 | std::conditional_t, Xbyak::Zmm, Xbyak::Ymm>; 35 | static constexpr int vector_size = isa_traits::vector_size; 36 | static constexpr int num_vector_regs = 37 | isa_traits::total_vector_registers; 38 | 39 | class test : public code_generator 40 | { 41 | public: 42 | test(int iterations) 43 | { 44 | Label loopLabel; 45 | mov(rax, 0); 46 | L(loopLabel); 47 | 48 | vbroadcastss(Vmm(num_vector_regs - 1), ptr[rdi]); 49 | vbroadcastss(Vmm(num_vector_regs - 2), ptr[rdi]); 50 | 51 | for (int i = 0; i < 10; ++i) 52 | { 53 | for (int j = 0; j < num_vector_regs - 2; ++j) 54 | { 55 | vfmadd231ps(Vmm(j), Vmm(num_vector_regs - 1), 56 | Vmm(num_vector_regs - 2)); 57 | } 58 | } 59 | 60 | add(rax, 1); 61 | cmp(rax, iterations); 62 | jl(loopLabel); 63 | ret(); 64 | } 65 | }; 66 | 67 | public: 68 | static std::pair do_bench(int iterations = 10000000) 69 | { 70 | auto fn = test(iterations).get_shared(); 71 | float data[1] = {0}; 
72 | 73 | auto secs = sysml::measure_fastest([&]() { fn(data); }, 100); 74 | 75 | double gflops = 2.0 * iterations * 10 * (num_vector_regs - 2) * 76 | vector_size / 1000000000; 77 | 78 | return {gflops, secs}; 79 | } 80 | }; 81 | 82 | # ifndef DABUN_HEADER_ONLY 83 | 84 | extern template struct dabun::x86::bench_gflops; 85 | extern template struct dabun::x86::bench_gflops; 86 | extern template struct dabun::x86::bench_gflops; 87 | 88 | # endif 89 | 90 | } // namespace x86 91 | } // namespace dabun 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- /include/dabun/x86/xbyak.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include "dabun/isa.hpp" 9 | #include "dabun/predef.hpp" 10 | 11 | #ifdef DABUN_ARCH_X86_64 12 | 13 | # if DABUN_COMP_GNUC 14 | # if DABUN_COMP_GNUC >= DABUN_VERSION_NUMBER(11, 0, 0) 15 | # define DABUN_XBYAK_SUPRESS_WARRAY_BOUNDS 16 | # endif 17 | # endif 18 | 19 | # if !defined(XBYAK_NO_OP_NAMES) 20 | # define XBYAK_NO_OP_NAMES 21 | # endif 22 | 23 | # ifdef DABUN_XBYAK_SUPRESS_WARRAY_BOUNDS 24 | # pragma GCC diagnostic push 25 | # pragma GCC diagnostic ignored "-Warray-bounds" 26 | # endif 27 | 28 | # include "xbyak/xbyak.h" 29 | # include "xbyak/xbyak_util.h" 30 | 31 | # ifdef DABUN_XBYAK_SUPRESS_WARRAY_BOUNDS 32 | # pragma GCC diagnostic pop 33 | # undef DABUN_XBYAK_SUPRESS_WARRAY_BOUNDS 34 | # endif 35 | 36 | using xbyak_buffer_type = Xbyak::uint8; 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/loop_nest.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. 
All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #ifndef DABUN_HEADER_ONLY 7 | 8 | #include "dabun/loop_nest.hpp" 9 | 10 | namespace dabun 11 | { 12 | 13 | #if defined(DABUN_ARCH_AARCH64) 14 | 15 | namespace arm 16 | { 17 | 18 | template class loop_nest_code_generator; 19 | template class loop_nest_code_generator; 20 | 21 | } // namespace arm 22 | 23 | #else 24 | 25 | namespace x86 26 | { 27 | 28 | template class loop_nest_code_generator; 29 | template class loop_nest_code_generator; 30 | // template struct dabun::x86::loop_nest_code_generator; 31 | 32 | } // namespace x86 33 | 34 | #endif 35 | 36 | } // namespace dabun 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/peak_gflops.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #ifndef DABUN_HEADER_ONLY 7 | 8 | # include "dabun/peak_gflops.hpp" 9 | 10 | namespace dabun 11 | { 12 | 13 | # if defined(DABUN_ARCH_AARCH64) 14 | 15 | namespace arm 16 | { 17 | 18 | template struct bench_gflops; 19 | template struct bench_gflops; 20 | 21 | } // namespace arm 22 | 23 | # else 24 | 25 | namespace x86 26 | { 27 | 28 | template struct bench_gflops; 29 | template struct bench_gflops; 30 | template struct bench_gflops; 31 | 32 | } // namespace x86 33 | 34 | # endif 35 | 36 | } // namespace dabun 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/transposer.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 
2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #ifndef DABUN_HEADER_ONLY 7 | 8 | # include "dabun/transposer.hpp" 9 | 10 | namespace dabun 11 | { 12 | 13 | # if defined(DABUN_ARCH_AARCH64) 14 | 15 | namespace arm 16 | { 17 | 18 | template class transposer_code_generator; 19 | template class transposer_code_generator; 20 | 21 | } // namespace arm 22 | 23 | # else 24 | 25 | namespace x86 26 | { 27 | 28 | template class transposer_code_generator; 29 | template class transposer_code_generator; 30 | template class transposer_code_generator; 31 | 32 | } // namespace x86 33 | 34 | # endif 35 | 36 | } // namespace dabun 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/x86/multi_vmm.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 
5 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(DABUN_TESTS_DIR ${CMAKE_CURRENT_SOURCE_DIR}) 2 | 3 | add_library(dabun_tests_catch2_main 4 | catch2_main.cpp) 5 | 6 | target_link_libraries(dabun_tests_catch2_main 7 | PUBLIC Catch2::Catch2) 8 | 9 | function(dabun_blanket_test name) 10 | message(STATUS "dabun_blanket_test ${name}_blanket ${name}.cpp") 11 | add_executable(${name}_blanket ${name}.cpp) 12 | target_link_libraries(${name}_blanket 13 | PUBLIC dabun 14 | PUBLIC dabun_tests_catch2_main 15 | PUBLIC -lpthread) 16 | endfunction(dabun_blanket_test) 17 | 18 | dabun_blanket_test(sentinel) 19 | 20 | function(dabun_test name vex float isa) 21 | message(STATUS "dabun_test ${name}.${vex}.${float} ${name}.cpp") 22 | add_executable(${name}.${vex}.${float} ${name}.cpp) 23 | target_link_libraries(${name}.${vex}.${float} 24 | PUBLIC dabun 25 | PUBLIC dabun_tests_catch2_main) 26 | target_compile_options(${name}.${vex}.${float} 27 | PRIVATE "-DDABUN_ISA=${isa}" 28 | PRIVATE "-DDABUN_VEX=extension::${vex}" 29 | PRIVATE "-DDABUN_ARITHMETIC=dabun::${float}") 30 | endfunction(dabun_test) 31 | 32 | function(dabun_common_tests vex float isa) 33 | dabun_test(handpicked_loop_nest_test ${vex} ${float} ${isa}) 34 | endfunction(dabun_common_tests) 35 | 36 | function(dabun_x86_tests vex float isa) 37 | dabun_common_tests(${vex} ${float} ${isa}) 38 | endfunction(dabun_x86_tests) 39 | 40 | function(dabun_arm_tests vex float isa) 41 | dabun_common_tests(${vex} ${float} ${isa}) 42 | endfunction(dabun_arm_tests) 43 | 44 | if("${DABUN_HOST_ARCHITECTURE}" STREQUAL "x86_64") 45 | if (DABUN_BUILD_TESTS_FOR_ALL_ARCH_VEX OR DABUN_BUILD_TESTS_FOR_AVX2) 46 | dabun_x86_tests(avx2 fp32_t avx2) 47 | endif() 48 | if (DABUN_BUILD_TESTS_FOR_ALL_ARCH_VEX OR DABUN_BUILD_TESTS_FOR_AVX512) 49 | dabun_x86_tests(avx512 fp32_t avx512) 50 | endif() 51 | 
elseif("${DABUN_HOST_ARCHITECTURE}" STREQUAL "aarch64") 52 | if (DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX OR DABUN_BUILD_APPS_FOR_NEON) 53 | dabun_arm_tests(neon fp32_t aarch64) 54 | dabun_test(transpose_meta_mnemonics neon fp32_t aarch64) 55 | endif() 56 | if (DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX OR DABUN_BUILD_APPS_FOR_NEON_FP16) 57 | dabun_arm_tests(neon_fp16 fp16_t aarch64) 58 | endif() 59 | endif() 60 | -------------------------------------------------------------------------------- /tests/baseline/matrix_transpose_baseline.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #pragma once 7 | 8 | #include 9 | 10 | namespace dabun::tests::baseline 11 | { 12 | 13 | template 14 | void reorder_array2d(T* out, T const* in, int rows, int cols, 15 | int in_row_stride = cols, int in_col_stride = 1, 16 | int out_row_stride = 1, int out_col_stride = rows) noexcept 17 | { 18 | for (int r = 0; r < rows; ++r) 19 | { 20 | for (int c = 0; c < cols; ++c) 21 | { 22 | out[out_row_stride * row + out_col_stride * col] = 23 | in[in_row_stride * row + in_col_stride * col]; 24 | } 25 | } 26 | } 27 | 28 | template 29 | void for_all_elements_of_two_array2d(T const* a1, T const* a2, Fn&& fn int rows, 30 | int cols, int a1_row_stride, 31 | int a1_col_stride, int a2_row_stride, 32 | int a2_col_stride) 33 | { 34 | for (int r = 0; r < rows; ++r) 35 | { 36 | for (int c = 0; c < cols; ++r) 37 | { 38 | fn(a1[r * a1_row_stride + c * a1_col_stride], 39 | a2{r * a2_row_stride + c * a2_col_stride]); 40 | } 41 | } 42 | } 43 | } 44 | 45 | } // namespace dabun::tests::baseline 46 | -------------------------------------------------------------------------------- /tests/catch2_main.cpp: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #define CATCH_CONFIG_MAIN 7 | 8 | #include 9 | -------------------------------------------------------------------------------- /tests/sentinel.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. 2 | // 3 | // This source code is licensed under the MIT license found in the 4 | // LICENSE file in the root directory of this source tree. 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | TEST_CASE("ZZZZ", "[single-file]") 17 | { 18 | float a[] = {1.f, 2.f, 3.f, 4.f}; 19 | float b[] = {1.f, 2.f, 3.f, 4.1f}; 20 | 21 | std::cout << sysml::is_any_of_v << "\n"; 22 | 23 | std::cout << sysml::max_abs_difference(a, a + 4, b) << "\n"; 24 | std::cout << sysml::max_abs_difference_n(a, 4, b) << "\n"; 25 | } 26 | 27 | TEST_CASE("WTF", "[single-file]") 28 | { 29 | sysml::int16x4_t a(1, 2, 3, 4); 30 | sysml::uint16x4_t b(1, 2, 3, 4); 31 | 32 | std::cout << a << "\n"; 33 | std::cout << b << "\n"; 34 | 35 | a += b; 36 | 37 | std::cout << a << "\n"; 38 | 39 | a -= b; 40 | 41 | std::cout << a << "\n"; 42 | std::cout << a * b << "\n"; 43 | std::cout << -a << "\n"; 44 | 45 | // std::cout << sysml::int16x4_t::sign_bitmask << "\n"; 46 | 47 | sysml::fp16_t z = static_cast(1.3); 48 | z = 3.f; 49 | 50 | // z *= 4; 51 | 52 | std::cout << z << "\n"; 53 | std::cout << (z < 12.f) << "\n"; 54 | 55 | // unsign 56 | } 57 | 58 | int Factorial(int number) 59 | { 60 | // return number <= 1 ? number : Factorial(number - 1) * number; // fail 61 | return number <= 1 ? 
1 : Factorial(number - 1) * number; // pass 62 | } 63 | 64 | TEST_CASE("Factorial of 0 is 1 (fail)", "[single-file]") 65 | { 66 | REQUIRE(Factorial(0) == 1); 67 | } 68 | 69 | TEST_CASE("Factorials of 1 and higher are computed (pass)", "[single-file]") 70 | { 71 | REQUIRE(Factorial(1) == 1); 72 | REQUIRE(Factorial(2) == 2); 73 | REQUIRE(Factorial(3) == 6); 74 | REQUIRE(Factorial(10) == 3628800); 75 | } 76 | 77 | TEST_CASE("Random threaded test", "[single-file]") 78 | { 79 | return; 80 | // sysml::thread::cpu_pool oset({0, 1, 5, 12, 18}); 81 | sysml::thread::cpu_pool oset(10); 82 | // int i; 83 | // std::cin >> i; 84 | // std::cout << "Was sleeping? " 85 | // << (oset.set_sleeping_mode(true) ? " Yes" : "No") << 86 | // std::endl; 87 | 88 | // std::cin >> i; 89 | 90 | // for (int i = 0; i < 10000000; ++i) 91 | // { 92 | // int x = rand() % 2; 93 | // // std::cout << "Requesting: " 94 | // // << (oset.set_sleeping_mode(x) ? " Yes" : "No") << ' '; 95 | // // std::cout << "Was sleeping? " 96 | // // << (oset.set_sleeping_mode(x) ? 
" Yes" : "No") << 97 | // // std::endl; 98 | // oset.set_sleeping_mode(x); 99 | // } 100 | 101 | int const len = 10000; 102 | 103 | { 104 | std::vector all_zeros(len); 105 | 106 | sysml::thread::naive_parallel_for(oset, 0, len, 1, 107 | [&](int idx) { all_zeros[idx] = 1; }); 108 | 109 | REQUIRE(std::accumulate(all_zeros.begin(), all_zeros.end(), 0) == len); 110 | } 111 | 112 | { 113 | std::vector all_zeros(len); 114 | 115 | sysml::thread::single_queue_parallel_for(oset, 0, len, 1, 116 | [&](auto const& c, int idx) 117 | { 118 | std::cout << c.cpu_index 119 | << "\n"; 120 | all_zeros[idx] = 1; 121 | }); 122 | 123 | REQUIRE(std::accumulate(all_zeros.begin(), all_zeros.end(), 0) == len); 124 | } 125 | 126 | std::atomic zi{0}; 127 | 128 | oset.execute_on_all_cpus( 129 | [&](auto) 130 | { 131 | while (zi.fetch_add(1) < len) 132 | { 133 | } 134 | }); 135 | 136 | REQUIRE(zi.load() == len + oset.size()); 137 | 138 | // std::cout 139 | // << std::alignment_of_v< 140 | // sysml::detail::primitive_aligned_wrapper> << 141 | // "\n\n"; 142 | 143 | // std::cout << sizeof(dabun::detail::primitive_aligned_wrapper) 144 | // << "\n\n"; 145 | 146 | // int i; 147 | // std::cin >> i; 148 | { 149 | // using dabun::vek; 150 | 151 | // vek b1 = std::array{1}; 152 | // vek b2{{1, 1}}; 153 | 154 | // auto begi = concat(b1, b2); 155 | 156 | // auto begin = begi + 1; 157 | 158 | // // dabun::vek begin{1, 1, 1}; 159 | // dabun::vek end{{3, 4, 5}}; 160 | // end += 1; 161 | 162 | // std::cout << (b2 == b2) << " " << (b2 != b2) << "\n"; 163 | 164 | // dabun::coord_for_loop(begin, end, 165 | // [](auto const& v) 166 | // { 167 | // std::cout << v << "\n"; 168 | // }); 169 | 170 | // std::cout << "\n"; 171 | // std::cout << "\n"; 172 | // std::cout << "\n"; 173 | // std::cout << "\n"; 174 | 175 | // // dabun::coord_for_loop(end, 176 | // // [](auto const& v) 177 | // // { 178 | // // std::cout << v[0]; 179 | // // for (int i = 1; i < v.size(); ++i) 180 | // // { 181 | // // std::cout << ", " << 
v[i]; 182 | // // } 183 | // // std::cout << "\n"; 184 | // // }); 185 | 186 | // std::cout << -(end + 3) << "\n"; 187 | 188 | // std::cout << dabun::to_string(-end + 3, ',') << "\n"; 189 | 190 | // { 191 | // auto r = -end + 3; 192 | // // auto z = dabun::head<2>(r); 193 | // std::cout << dabun::head<2>(r) << "\n"; 194 | // // std::cout << z << "\n"; 195 | 196 | // } 197 | 198 | std::cout << "HWC: " << std::thread::hardware_concurrency() << "\n"; 199 | } 200 | } 201 | -------------------------------------------------------------------------------- /utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "AlignedVec.h" 10 | 11 | #define LOOP_NEST_STRINGIFY_0(s) #s 12 | #define LOOP_NEST_STRINGIFY(s) LOOP_NEST_STRINGIFY_0(s) 13 | 14 | #define strong_assert(condition) \ 15 | if (!(condition)) \ 16 | { \ 17 | throw std::runtime_error(LOOP_NEST_STRINGIFY( \ 18 | condition) " failed file: " __FILE__ \ 19 | " line: " LOOP_NEST_STRINGIFY((__LINE__))); \ 20 | } \ 21 | static_cast(0) 22 | 23 | // FROM: https://en.cppreference.com/w/cpp/utility/variant/visit 24 | 25 | // helper type for the visitor #4 26 | template 27 | struct overloaded : Ts... 28 | { 29 | using Ts::operator()...; 30 | }; 31 | // explicit deduction guide (not needed as of C++20) 32 | template 33 | overloaded(Ts...) 
-> overloaded; 34 | 35 | template 36 | void apply_relu(Float* Begin, Float* End) 37 | { 38 | for (; Begin != End; ++Begin) 39 | { 40 | if constexpr (std::is_same_v) 41 | { 42 | *Begin = static_cast( 43 | std::max(static_cast(0), static_cast(*Begin))); 44 | } 45 | else 46 | { 47 | *Begin = std::max(static_cast(0), *Begin); 48 | } 49 | } 50 | } 51 | 52 | template 53 | Float max_abs_difference(Float const* LBegin, Float const* LEnd, 54 | Float const* RBegin) 55 | { 56 | Float res = 0; 57 | for (; LBegin != LEnd; ++LBegin, ++RBegin) 58 | { 59 | res = std::max(res, std::abs(*LBegin - *RBegin)); 60 | } 61 | return res; 62 | } 63 | 64 | template 65 | Float max_abs_differenceVerbose(Float const* LBegin, Float const* LEnd, 66 | Float const* RBegin) 67 | { 68 | int off = 0; 69 | Float res = 0; 70 | for (; LBegin != LEnd; ++LBegin, ++RBegin) 71 | { 72 | std::cout << off++ << " : " << (*LBegin) << " " << (*RBegin) << " " 73 | << std::abs(*LBegin - *RBegin) << "\n"; 74 | res = std::max(res, std::abs(*LBegin - *RBegin)); 75 | } 76 | return res; 77 | } 78 | 79 | template 80 | Float max_abs_differenceVerbose(Float const* LBegin, Float const* LEnd, 81 | Float const* RBegin, float delta) 82 | { 83 | int off = 0; 84 | Float res = 0; 85 | for (; LBegin != LEnd; ++LBegin, ++RBegin) 86 | { 87 | if (std::abs(*LBegin - *RBegin) > delta) 88 | { 89 | std::cout << off << " : " << (*LBegin) << " " << (*RBegin) << " " 90 | << std::abs(*LBegin - *RBegin) << "\n"; 91 | } 92 | res = std::max(res, std::abs(*LBegin - *RBegin)); 93 | off++; 94 | } 95 | return res; 96 | } 97 | 98 | template 99 | aligned_vector get_random_vector(unsigned size, 100 | unsigned extra_elements = 16) 101 | { 102 | aligned_vector res(size + extra_elements); 103 | 104 | std::random_device rd; 105 | std::mt19937 gen(0); // rd()); 106 | 107 | std::uniform_real_distribution dis(-1.0, 1.0); 108 | 109 | for (auto& f : res) 110 | { 111 | f = dis(gen); 112 | } 113 | 114 | return res; 115 | } 116 | 117 | template 118 | 
aligned_vector getZeroVector(unsigned size, unsigned extra_elements = 16) 119 | { 120 | aligned_vector res(size + extra_elements); 121 | return res; 122 | } 123 | 124 | template 125 | double measureFastestWithWarmup(Fn&& fn, int warmupIterations, 126 | int measuredIterations = 1) 127 | { 128 | for (int i = 0; i < warmupIterations; ++i) 129 | { 130 | fn(); 131 | } 132 | 133 | auto start = std::chrono::high_resolution_clock::now(); 134 | fn(); 135 | auto end = std::chrono::high_resolution_clock::now(); 136 | auto nsecs = 137 | std::chrono::duration_cast(end - start) 138 | .count(); 139 | 140 | for (int i = 1; i < measuredIterations; ++i) 141 | { 142 | start = std::chrono::high_resolution_clock::now(); 143 | fn(); 144 | end = std::chrono::high_resolution_clock::now(); 145 | 146 | auto new_time = 147 | std::chrono::duration_cast(end - start) 148 | .count(); 149 | 150 | // LN_LOG(INFO) << "T: " << new_time << "\n"; 151 | nsecs = std::min(nsecs, new_time); 152 | } 153 | 154 | return static_cast(nsecs) / 1e9; 155 | } 156 | 157 | inline std::uint64_t rdtsc() 158 | { 159 | #if !defined(LOOP_NEST_ARM) 160 | unsigned hi, lo; 161 | __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); 162 | return ((std::uint64_t)lo) | (((std::uint64_t)hi) << 32); 163 | #else 164 | return 0; 165 | #endif 166 | } 167 | 168 | template 169 | double measureMinCyclesWithWarmup(Fn&& fn, int warmupIterations, 170 | int measuredIterations = 1) 171 | { 172 | for (int i = 0; i < warmupIterations; ++i) 173 | { 174 | fn(); 175 | } 176 | 177 | auto start = rdtsc(); 178 | fn(); 179 | auto end = rdtsc(); 180 | auto cyc = end - start; 181 | 182 | for (int i = 1; i < measuredIterations; ++i) 183 | { 184 | start = rdtsc(); 185 | fn(); 186 | end = rdtsc(); 187 | 188 | auto new_time = end - start; 189 | // LN_LOG(INFO) << "T: " << new_time << "\n"; 190 | cyc = std::min(cyc, new_time); 191 | } 192 | 193 | return static_cast(cyc); 194 | } 195 | 196 | template 197 | void check_correctness(BaseLineImpl&& baseline_fn, 
JITImpl&& jit_fn, int A_size, 198 | int B_size, int C_size, int alpha = 0) 199 | { 200 | auto A = get_random_vector(A_size); 201 | auto B = get_random_vector(B_size); 202 | 203 | auto CN = aligned_vector(C_size); 204 | auto CJ = std::vector(C_size); 205 | 206 | baseline_fn(CN.data(), A.data(), B.data()); 207 | jit_fn(CJ.data(), A.data(), B.data(), alpha); 208 | 209 | std::cout << "MAXABSDIFF: " 210 | << max_abs_difference(CJ.data(), CJ.data() + C_size, CN.data()) 211 | << "\n"; 212 | } 213 | 214 | template 215 | void bench_implementation(Fn&& fn, int A_size, int B_size, int C_size, 216 | double gflops, int warmup = 5, int iters = 10) 217 | { 218 | auto A = get_random_vector(A_size); 219 | auto B = get_random_vector(B_size); 220 | auto C = std::vector(C_size); 221 | 222 | auto secs = measureFastestWithWarmup( 223 | [&]() { fn(C.data(), A.data(), B.data(), 0); }, warmup, iters); 224 | 225 | std::cout << "GFLOPS: " << (gflops / secs) << "\n"; 226 | } 227 | 228 | template 229 | void bench_implementation_fmas_per_cycle(Fn&& fn, int A_size, int B_size, 230 | int C_size, double flops, 231 | int warmup = 5, int iters = 10) 232 | { 233 | auto A = get_random_vector(A_size); 234 | auto B = get_random_vector(B_size); 235 | auto C = std::vector(C_size); 236 | 237 | auto secs = measureMinCyclesWithWarmup( 238 | [&]() { fn(C.data(), A.data(), B.data(), 0); }, warmup, iters); 239 | 240 | std::cout << "FLOPS per CYCLE: " << (flops / secs) << "\n"; 241 | } 242 | --------------------------------------------------------------------------------