├── .clang-format
├── .clang_complete
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── apps
    ├── CMakeLists.txt
    ├── address_packer_search.cpp
    ├── baselines.hpp
    ├── gflops.cpp
    ├── loop_nest.cpp
    ├── loop_nest_baseline.hpp
    ├── loop_nest_bench.cpp
    ├── loop_nest_bench.hpp
    ├── loop_nest_fp16.cpp
    ├── loop_nest_stress_test.cpp
    ├── loop_nest_tensillica.cpp
    ├── loop_nest_test.hpp
    ├── loop_nest_tests.cpp
    ├── loop_tree.cpp
    ├── serialization.cpp
    ├── tensillica_play.cpp
    ├── transposer.cpp
    ├── transposer_baseline.hpp
    ├── transposer_bench.hpp
    └── transposer_stress_test.cpp
├── assets
    └── logo
    │   ├── fulllogo.jpg
    │   ├── fulllogo.png
    │   ├── fulllogo_nobuffer.jpg
    │   ├── fulllogo_nobuffer.png
    │   ├── fulllogo_transparent.png
    │   ├── fulllogo_transparent_nobuffer.png
    │   ├── grayscale.png
    │   ├── grayscale_nobuffer.png
    │   ├── grayscale_transparent.png
    │   ├── grayscale_transparent_nobuffer.png
    │   ├── icononly.png
    │   ├── icononly_nobuffer.png
    │   ├── icononly_transparent.png
    │   ├── icononly_transparent_nobuffer.png
    │   ├── icononly_transparent_nobuffer_padded.png
    │   ├── print.eps
    │   ├── print.pdf
    │   ├── print.svg
    │   ├── print_transparent.eps
    │   ├── print_transparent.pdf
    │   ├── print_transparent.svg
    │   ├── textonly.png
    │   └── textonly_nobuffer.png
├── cmake
    ├── aarch64
    │   └── dabun.cmake
    └── x86_64
    │   └── dabun.cmake
├── dabun_config.hpp.in
├── include
    └── dabun
    │   ├── aligned_vector.hpp
    │   ├── arithmetic_operation.hpp
    │   ├── arm
    │       ├── arithmetic_operation.hpp
    │       ├── configuration.hpp
    │       ├── elementwise_operation.hpp
    │       ├── loop_nest.hpp
    │       ├── loop_nest_fp16.hpp
    │       ├── meta_mnemonics.hpp
    │       ├── multi_vreg.hpp
    │       ├── peak_gflops.hpp
    │       ├── transposer.hpp
    │       └── xbyak.hpp
    │   ├── check.hpp
    │   ├── code_generator.hpp
    │   ├── code_generator
    │       ├── aot_fn.hpp
    │       ├── code_generator.hpp
    │       ├── memory_resource.hpp
    │       └── xbyak.hpp
    │   ├── common.hpp
    │   ├── configuration.hpp
    │   ├── core.hpp
    │   ├── elementwise_operation.hpp
    │   ├── hask
    │       └── apple.hpp
    │   ├── isa.hpp
    │   ├── loop_nest.hpp
    │   ├── loop_nest_descriptor.hpp
    │   ├── loop_tree
    │       ├── all_nodes.hpp
    │       ├── compiled_loop_nest_node.hpp
    │       ├── compiled_transpose_node.hpp
    │       ├── compute_node.hpp
    │       ├── for_loop_node.hpp
    │       ├── nested_for_loops_node.hpp
    │       ├── node.hpp
    │       ├── program.hpp
    │       ├── report.hpp
    │       ├── transpose_node.hpp
    │       ├── types.hpp
    │       └── utility.hpp
    │   ├── math.hpp
    │   ├── numeric.hpp
    │   ├── one_constant.hpp
    │   ├── peak_gflops.hpp
    │   ├── predef.hpp
    │   ├── random_vector.hpp
    │   ├── serialization.hpp
    │   ├── tensillica
    │       ├── cpp_intrinsics_code_generator.hpp
    │       ├── dl_compiled_fn.hpp
    │       ├── loop_nest.hpp
    │       ├── multi_vmm.hpp
    │       └── peak_gflops.hpp
    │   ├── transposer.hpp
    │   ├── utility
    │       ├── log.hpp
    │       ├── most_frequent_queue.hpp
    │       └── tmp_file_name.hpp
    │   └── x86
    │       ├── address_packer.hpp
    │       ├── arithmetic_operation.hpp
    │       ├── configuration.hpp
    │       ├── denormals.hpp
    │       ├── elementwise_operation.hpp
    │       ├── loop_nest.hpp
    │       ├── multi_vmm.hpp
    │       ├── peak_gflops.hpp
    │       ├── transposer.hpp
    │       └── xbyak.hpp
├── src
    ├── loop_nest.cpp
    ├── peak_gflops.cpp
    ├── transposer.cpp
    └── x86
    │   └── multi_vmm.cpp
├── tests
    ├── CMakeLists.txt
    ├── baseline
    │   ├── loop_nest_baseline.hpp
    │   └── matrix_transpose_baseline.hpp
    ├── catch2_main.cpp
    ├── handpicked_loop_nest_test.cpp
    ├── sentinel.cpp
    └── transpose_meta_mnemonics.cpp
└── utils.h


/.clang-format:
--------------------------------------------------------------------------------
 1 | BasedOnStyle: LLVM
 2 | IndentWidth: 4
 3 | ---
 4 | Language: Cpp
 5 | # Force pointers to the type for C++.
 6 | DerivePointerAlignment: false
 7 | PointerAlignment: Left
 8 | # QualifierAlignmentStyle: Right
 9 | # ReferenceAlignmentStyle: Left
10 | 
11 | UseTab: Never
12 | IndentWidth: 4
13 | BreakBeforeBraces: Allman
14 | AllowShortIfStatementsOnASingleLine: false
15 | IndentCaseLabels: false
16 | ColumnLimit: 80
17 | AccessModifierOffset: -4
18 | AlignConsecutiveAssignments: true
19 | AlignConsecutiveDeclarations: true
20 | AlignOperands: true
21 | BreakBeforeBraces: Allman
22 | AlwaysBreakTemplateDeclarations: true
23 | BreakConstructorInitializersBeforeComma: true
24 | IndentPPDirectives: AfterHash


--------------------------------------------------------------------------------
/.clang_complete:
--------------------------------------------------------------------------------
1 | -I.
2 | -I./xbyak


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Compiled Object files
 5 | *.slo
 6 | *.lo
 7 | *.o
 8 | *.obj
 9 | 
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 | 
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 | 
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 | 
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 | 
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 | loop_nest
34 | 
35 | build
36 | cmake-build-debug
37 | cmake-build-release
38 | *.asm
39 | tmp
40 | .idea
41 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "xbyak"]
 2 | 	path = extern/xbyak
 3 | 	url = https://github.com/herumi/xbyak
 4 | [submodule "xbyak_aarch64"]
 5 | 	path = extern/xbyak_aarch64
 6 | 	url = https://github.com/zlateski/xbyak_aarch64
 7 | 	branch = fjmaster
 8 | [submodule "extern/fmt"]
 9 | 	path = extern/fmt
10 | 	url = https://github.com/fmtlib/fmt
11 | [submodule "extern/Catch2"]
12 | 	path = extern/Catch2
13 | 	url = https://github.com/catchorg/Catch2
14 | 	branch = v2.x
15 | [submodule "extern/cpuinfo"]
16 | 	path = extern/cpuinfo
17 | 	url = https://github.com/pytorch/cpuinfo
18 | [submodule "extern/libsysml"]
19 | 	path = extern/libsysml
20 | 	url = git@github.com:facebookresearch/libsysml.git
21 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | cmake_minimum_required(VERSION 3.15.0)
  2 | 
  3 | add_subdirectory(extern/Catch2)
  4 | 
  5 | add_subdirectory(extern/libsysml/cpp)
  6 | 
  7 | 
  8 | # TODO: This is probably a very bad and non-standard solution, figure
  9 | # out what's standard and/or better
 10 | 
 11 | get_target_property(LIBSYSML_INCLUDES sysmlcpp INCLUDE_DIRECTORIES)
 12 | 
 13 | # foreach(dir ${LIBSYSML_INCLUDES})
 14 | #  message(STATUS "Including libsysml include dir: ${dir}")
 15 | #  include_directories(${dir})
 16 | #endforeach()
 17 | 
 18 | 
 19 | message(STATUS "${libsysml_INCLUDE_DIRS} ????")
 20 | include_directories(${libsysml_INCLUDE_DIRS})
 21 | 
 22 | # SET (CMAKE_CXX_COMPILER             "/usr/bin/clang++")
 23 | 
 24 | project(dabun
 25 |   LANGUAGES CXX
 26 |   VERSION 0.0.0)
 27 | 
 28 | configure_file(dabun_config.hpp.in config/dabun_config.hpp)
 29 | 
# Language standard used for every target declared after this point.
set(CMAKE_CXX_STANDARD 20)

##
## User-facing build options
##
option(DABUN_DEBUG "Set to ON to build debug version" OFF)
option(DABUN_DEBUG_WERROR "Set to ON to enable all warnings in debug mode" ON)
option(DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX "Set to ON to build apps for all supported extensions" ON)
option(DABUN_BUILD_TESTS_FOR_ALL_ARCH_VEX "Set to ON to build tests for all supported extensions" ON)

# Compiler flags are appended to CMAKE_CXX_FLAGS directly (CMAKE_BUILD_TYPE
# is not consulted here). Debug builds get -g, plus the warning set only when
# DABUN_DEBUG_WERROR is ON; release builds always get -DNDEBUG -O3 and the
# full warning set with -Werror.
if (DABUN_DEBUG)
  message(STATUS "Will compile libdabun in debug mode.")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
  if (DABUN_DEBUG_WERROR)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror -Wno-sign-compare")
  endif()
else()
  message(STATUS "Will compile libdabun in release mode.")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG -O3 -Wall -Wextra -Werror -Wno-sign-compare")
endif()

# Convenience paths shared with the included per-architecture scripts and
# the subdirectories.
set(DABUN_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(DABUN_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(DABUN_THIRDPARTY_DIR ${DABUN_BINARY_DIR}/extern)

##
## Find Boost
##
set(Boost_USE_STATIC_LIBS        ON)  # only find static libs
set(Boost_USE_DEBUG_LIBS        ${DABUN_DEBUG})  # debug libs only when DABUN_DEBUG is ON
set(Boost_USE_RELEASE_LIBS       ON)  # release libs are always searched
set(Boost_USE_MULTITHREADED      ON)
set(Boost_USE_STATIC_RUNTIME    OFF)
# No COMPONENTS are requested, so only the Boost headers are located; a
# missing Boost is turned into a hard error just below.
find_package(Boost 1.66.0)
if(Boost_FOUND)
  include_directories(${Boost_INCLUDE_DIRS})
else()
  message(FATAL_ERROR "Can't find boost libraries")
endif()

##
## Find Cpuinfo
##
# Skipped entirely when a `cpuinfo` target already exists (e.g. defined by an
# enclosing project).
if(NOT TARGET cpuinfo)
  # Default to the bundled submodule in extern/cpuinfo when CPUINFO_SOURCE_DIR
  # is not specified (nothing is downloaded here).
  if(NOT DEFINED CPUINFO_SOURCE_DIR)
    set(CPUINFO_SOURCE_DIR "${DABUN_SOURCE_DIR}/extern/cpuinfo"
      CACHE STRING "cpuinfo source directory from submodules")
  endif()

  # Build cpuinfo as a static library without its tests and benchmarks.
  set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL "Do not build cpuinfo unit tests")
  set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL "Do not build cpuinfo mock tests")
  set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL "Do not build cpuinfo benchmarks")
  set(CPUINFO_LIBRARY_TYPE static CACHE STRING "Set lib type to static")
  # Select static runtime, needed for static build for MSVC
  set(CPUINFO_RUNTIME_TYPE static CACHE STRING "Set runtime to static")

  add_subdirectory("${CPUINFO_SOURCE_DIR}" "${DABUN_BINARY_DIR}/cpuinfo")
  set_property(TARGET cpuinfo PROPERTY POSITION_INDEPENDENT_CODE ON)
endif()

# include_directories(${CPUINFO_SOURCE_DIR}/include)

# Makes the bundled libsysml headers available to every target.
include_directories(extern/libsysml/cpp/include)

##
## Detect host architecture
##
# NOTE(review): this list is not referenced again in this file; the actual
# support check is the if/elseif chain below.
set(DABUN_HOST_ARCHITECTURE_SUPPORTED x86_64 aarch64 arm64)

# Ask the build machine for its architecture; `tr` drops the trailing newline
# from the `uname -m` output.
execute_process(COMMAND uname -m
  COMMAND tr -d '\n'
  OUTPUT_VARIABLE DABUN_HOST_ARCHITECTURE)

# Treat "arm64" as an alias of "aarch64" so everything below only has to
# handle one spelling.
if("${DABUN_HOST_ARCHITECTURE}" STREQUAL "arm64")
  set(DABUN_HOST_ARCHITECTURE aarch64)
endif()


message(STATUS "Host architecture detected: ${DABUN_HOST_ARCHITECTURE}")

# Architecture-independent sources. NOTE(review): presumably consumed by the
# included cmake/<arch>/dabun.cmake scripts - confirm there.
set(DABUN_COMMON_SRC_CPP_FILES
  src/loop_nest.cpp
  src/transposer.cpp
  src/peak_gflops.cpp)

# Pull in the architecture-specific script; any other architecture aborts the
# configure step.
if("${DABUN_HOST_ARCHITECTURE}" STREQUAL "x86_64")
  include(cmake/x86_64/dabun.cmake)
elseif("${DABUN_HOST_ARCHITECTURE}" STREQUAL "aarch64")
  include(cmake/aarch64/dabun.cmake)
else()
  message(FATAL_ERROR "Host architecture ${DABUN_HOST_ARCHITECTURE} not supported.")
endif()

add_subdirectory(apps)
add_subdirectory(tests)
124 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to make participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
 9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |   advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies within all project spaces, and it also applies when
49 | an individual is representing the project or its community in public spaces.
50 | Examples of representing a project or community include using an official
51 | project e-mail address, posting via an official social media account, or acting
52 | as an appointed representative at an online or offline event. Representation of
53 | a project may be further defined and clarified by project maintainers.
54 | 
55 | This Code of Conduct also applies outside the project spaces when there is a
56 | reasonable belief that an individual's behavior may have a negative impact on
57 | the project or its community.
58 | 
59 | ## Enforcement
60 | 
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported by contacting the project team at <opensource-conduct@fb.com>. All
63 | complaints will be reviewed and investigated and will result in a response that
64 | is deemed necessary and appropriate to the circumstances. The project team is
65 | obligated to maintain confidentiality with regard to the reporter of an incident.
66 | Further details of specific enforcement policies may be posted separately.
67 | 
68 | Project maintainers who do not follow or enforce the Code of Conduct in good
69 | faith may face temporary or permanent repercussions as determined by other
70 | members of the project's leadership.
71 | 
72 | ## Attribution
73 | 
74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76 | 
77 | [homepage]: https://www.contributor-covenant.org
78 | 
79 | For answers to common questions about this code of conduct, see
80 | https://www.contributor-covenant.org/faq
81 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to loop_nest
 2 | We want to make contributing to this project as easy and transparent as
 3 | possible.
 4 | 
 5 | ## Pull Requests
 6 | We actively welcome your pull requests.
 7 | 
 8 | 1. Fork the repo and create your branch from `main`.
 9 | 2. If you've added code that should be tested, add tests.
10 | 3. If you've changed APIs, update the documentation.
11 | 4. Ensure the test suite passes.
12 | 5. Make sure your code lints.
13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA").
14 | 
15 | ## Contributor License Agreement ("CLA")
16 | In order to accept your pull request, we need you to submit a CLA. You only need
17 | to do this once to work on any of Facebook's open source projects.
18 | 
19 | Complete your CLA here: <https://code.facebook.com/cla>
20 | 
21 | ## Issues
22 | We use GitHub issues to track public bugs. Please ensure your description is
23 | clear and has sufficient instructions to be able to reproduce the issue.
24 | 
25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
26 | disclosure of security bugs. In those cases, please go through the process
27 | outlined on that page and do not file a public issue.
28 | 
29 | ## License
30 | By contributing to loop_nest, you agree that your contributions will be licensed
31 | under the LICENSE file in the root directory of this source tree.


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) Meta Platforms, Inc. and affiliates.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![dabun logo](/assets/logo/icononly_transparent_nobuffer.png)
2 | 


--------------------------------------------------------------------------------
/apps/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(DABUN_APPS_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 2 | 
 3 | function(dabun_extension_specific_app name vex float isa)
 4 |   message(STATUS "dabun_extension_specific_app ${name}.${vex}.${float} ${name}.cpp")
 5 |   add_executable(${name}.${vex}.${float} ${name}.cpp)
 6 |   target_link_libraries(${name}.${vex}.${float}
 7 |     PUBLIC dabun
 8 |     PUBLIC -ldl
 9 |     PUBLIC -lpthread)
10 |   target_compile_options(${name}.${vex}.${float}
11 |     PRIVATE "-DDABUN_ISA=${isa}"
12 |     PRIVATE "-DDABUN_VEX=extension::${vex}"
13 |     PRIVATE "-DDABUN_ARITHMETIC=dabun::${float}")
14 | endfunction(dabun_extension_specific_app)
15 | 
# Declares the executable <name>, built from <name>.cpp and linked against
# dabun, libdl and libpthread. Unlike dabun_extension_specific_app, no
# ISA/arithmetic definitions are injected.
function(dabun_app name)
  # Log "<function> <target> <source>", matching the format used by
  # dabun_extension_specific_app (the target is `name`, not `name.cpp`).
  message(STATUS "dabun_app ${name} ${name}.cpp")
  add_executable(${name} ${name}.cpp)
  target_link_libraries(${name}
    PUBLIC dabun
    PUBLIC -ldl
    PUBLIC -lpthread)
endfunction(dabun_app)
24 | 
# Declares every x86 application for one (vex, float, isa) combination.
# Targets are created in the order listed.
function(dabun_x86_apps vex float isa)
  set(x86_app_names
    address_packer_search
    gflops
    loop_nest
    loop_nest_bench
    loop_nest_stress_test
    loop_nest_tests
    loop_tree
    serialization
    transposer
    transposer_stress_test)

  foreach(app_name IN LISTS x86_app_names)
    dabun_extension_specific_app(${app_name} ${vex} ${float} ${isa})
  endforeach()
endfunction()
37 | 
# Declares every ARM application for one (vex, float, isa) combination.
# Targets are created in the order listed.
function(dabun_arm_apps vex float isa)
  # Currently disabled for ARM: loop_tree, serialization, transposer,
  # transposer_stress_test.
  set(arm_app_names
    gflops
    loop_nest
    loop_nest_bench
    loop_nest_stress_test
    loop_nest_tests)

  foreach(app_name IN LISTS arm_app_names)
    dabun_extension_specific_app(${app_name} ${vex} ${float} ${isa})
  endforeach()
endfunction()
49 | 
# Applications that are built regardless of the host architecture.
dabun_app(tensillica_play)
dabun_app(loop_nest_tensillica)

# Extension-specific applications. Each vector extension is built when either
# the catch-all DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX switch or its own
# DABUN_BUILD_APPS_FOR_<EXT> switch is set. Any other architecture gets no
# extension-specific applications.
if("${DABUN_HOST_ARCHITECTURE}" STREQUAL "x86_64")
  if(DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX OR DABUN_BUILD_APPS_FOR_AVX2)
    dabun_x86_apps(avx2 fp32_t avx2)
  endif()

  if(DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX OR DABUN_BUILD_APPS_FOR_AVX512)
    dabun_x86_apps(avx512 fp32_t avx512)
    dabun_extension_specific_app(gflops avx512_ymm fp32_t avx2_plus)
  endif()
elseif("${DABUN_HOST_ARCHITECTURE}" STREQUAL "aarch64" OR
       "${DABUN_HOST_ARCHITECTURE}" STREQUAL "arm64")
  # "aarch64" and "arm64" name the same target and get the same apps.
  if(DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX OR DABUN_BUILD_APPS_FOR_NEON)
    dabun_arm_apps(neon fp32_t aarch64)
  endif()

  if(DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX OR DABUN_BUILD_APPS_FOR_NEON_FP16)
    dabun_arm_apps(neon_fp16 fp16_t aarch64)
  endif()
endif()
77 | 


--------------------------------------------------------------------------------
/apps/address_packer_search.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #include <iostream>
 7 | 
 8 | #include "dabun/x86/xbyak.hpp"
 9 | 
10 | auto get_size(bool is_broadcast, int reg_idx)
11 | {
12 |     return [=](int off)
13 |     {
14 |         Xbyak::CodeGenerator cg;
15 |         if (is_broadcast)
16 |         {
17 |             cg.vfmadd231ps(cg.zmm0, cg.zmm1,
18 |                            cg.ptr_b[Xbyak::Reg64(reg_idx) + off * 0x4]);
19 |         }
20 |         else
21 |         {
22 |             cg.vfmadd231ps(cg.zmm0, cg.zmm1,
23 |                            cg.ptr[Xbyak::Reg64(reg_idx) + off * 0x40]);
24 |         }
25 |         return cg.getSize();
26 |     };
27 | }
28 | 
29 | auto get_size2(bool is_broadcast, int reg_idx)
30 | {
31 |     return [=](int off)
32 |     {
33 |         Xbyak::CodeGenerator cg;
34 |         if (is_broadcast)
35 |         {
36 |             cg.vmovups(cg.zmm0, cg.ptr_b[Xbyak::Reg64(reg_idx) + off * 0x4]);
37 |         }
38 |         else
39 |         {
40 |             cg.vmovups(cg.zmm0, cg.ptr[Xbyak::Reg64(reg_idx) + off * 0x40]);
41 |         }
42 |         return cg.getSize();
43 |     };
44 | }
45 | 
// Bisects the half-open range [begin, end) for the first argument at which
// `f` stops returning `s`.
//
// The result is only meaningful when f(x) == s holds for a (possibly empty)
// prefix of the range and for nothing after it. Returns `end` when every
// probed value still yields `s`, and `begin` when the range is empty.
template <class F>
int binary_search(F const& f, int begin, int end, int s)
{
    int lo = begin;
    int hi = end;

    while (lo != hi)
    {
        int const probe = lo + (hi - lo) / 2;

        if (f(probe) == s)
        {
            // Still inside the prefix: the boundary lies strictly above.
            lo = probe + 1;
        }
        else
        {
            // Already past the prefix: `probe` itself may be the boundary.
            hi = probe;
        }
    }

    return lo;
}
65 | 
// Prints f(0) and f(1) to std::cout, then bisects [1, 0xFFFFFF) for the
// first argument at which `f` no longer returns the same value as f(1).
template <class F>
int do_search(F const& f)
{
    std::cout << "F(0) = " << f(0);
    std::cout << "; F(1) = " << f(1) << "\n";

    // The value at 1 is the reference the search compares against.
    int const reference = f(1);
    return binary_search(f, 1, 0xFFFFFF, reference);
}
74 | 
75 | int main()
76 | {
77 |     auto fn = get_size2(true, 0);
78 | 
79 |     std::cout << "line ";
80 |     std::cout << fn(0) << ' ';
81 |     std::cout << fn(10) << ' ';
82 |     std::cout << do_search(fn) << ' ';
83 |     std::cout << std::endl;
84 | }
85 | 


--------------------------------------------------------------------------------
/apps/gflops.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #include "dabun/numeric.hpp"
 7 | #include "dabun/peak_gflops.hpp"
 8 | 
 9 | #include <iostream>
10 | 
11 | using namespace dabun;
12 | 
13 | #ifndef DABUN_ARITHMETIC
14 | #    define DABUN_ARITHMETIC float
15 | #endif
16 | 
17 | #ifndef DABUN_ISA
18 | #    define DABUN_ISA avx2
19 | #endif
20 | 
21 | int main()
22 | {
23 |     std::cout << measure_peak_gflops<DABUN_ISA, DABUN_ARITHMETIC>(1) << "\n";
24 | }
25 | 


--------------------------------------------------------------------------------
/apps/loop_nest_bench.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #include "loop_nest_bench.hpp"
  7 | #include "loop_nest_baseline.hpp"
  8 | 
// Runs loop_nest_bench for two 2D-convolution configurations (plain NCHW
// and the blocked NCHW16c layout) on the ISA selected through DABUN_ISA.
int main()
{

    using namespace dabun;

    // 2D convolution NCHW example:
    // O(c_out, o_h, o_w) = I(c_in, o_h + k_h, o_w + k_w) *
    //                      K(c_out, c_in, k_h, k_w)
    {
        int CIN  = 64;
        int COUT = 64;
        int OS   = 112; // output height/width
        int KS   = 3;   // kernel height/width
        int IS   = OS + KS - 1; // input height/width

        loop_nest_bench<DABUN_ISA>(
            // The first argument (`order`) is a list of (variable, int)
            // pairs. NOTE(review): presumably the loop order with a step
            // per loop - confirm against loop_nest_code_generator.
            {{"c_out", 16}, //
             {"o_h", 1},
             {"o_w", 28},
             {"c_in", 16},
             {"c_in", 1},
             {"c_out", 1}, //
             {"o_w", 1},   //
             {"k_h", 1},   //
             {"k_w", 1}},  //
            // The second argument is a map of the dimension sizes
            {{"c_out", COUT},
             {"o_w", OS},
             {"k_w", KS},
             {"c_in", CIN},
             {"o_h", OS},
             {"k_h", KS},
             {"i_w", IS},
             {"i_h", IS}},
            // Vars of C (other variables are reduction variables)
            {"c_out", "o_w", "o_h"},
            // Variables of A, note that i_w and i_h are not used
            {"c_in", "i_w", "i_h"},
            // Variables of B
            {"c_in", "c_out", "k_w", "k_h"},
            // C's strides for each variable
            {{"o_w", 1}, {"c_out", OS * OS}, {"o_h", OS}},
            // A's strides for each variable. Note how we provide strides
            // for o_h/k_h and o_w/k_w rather than i_h/i_w: this is because
            // the access to A is based on output and reduction variables.
            {{"o_w", 1},
             {"k_w", 1},
             {"c_in", IS * IS},
             {"o_h", IS},
             {"k_h", IS}},
            // B's strides for each variable
            {{"c_out", KS * KS * CIN},
             {"c_in", KS * KS},
             {"k_w", 1},
             {"k_h", KS}},
            // NOTE(review): this positional argument binds to the `alpha`
            // parameter of loop_nest_bench (default 0), not to
            // `max_unrolled_fmas` - confirm that is intended.
            64);
    }

    // 2D convolution on NCHW16c layout example:
    // O(g_out, c_out, o_h, o_w) = I(g_in, c_in, o_h + k_h, o_w + k_w) *
    //                             K(g_in, g_out, c_in, c_out, k_h, k_w)
    // if (0)
    {
        int GIN  = 128 / 16; // number of 16-wide input channel groups
        int CIN  = 16;
        int GOUT = 128 / 16; // number of 16-wide output channel groups
        int COUT = 16;
        int OS   = 56; // output height/width
        int KS   = 3;  // kernel height/width
        int IS   = OS + KS - 1; // input height/width

        loop_nest_bench<DABUN_ISA>(
            // The first argument (`order`), see the note in the example
            // above.
            {{"g_out", 1}, //
             {"o_w", 28},
             {"o_h", 1},
             {"g_in", 1},
             {"c_in", 1},
             {"o_w", 1}, //
             //{"o_w", 1},    //
             {"k_h", 1},    //
             {"k_w", 1},    //
             {"c_out", 1}}, //
            // The second argument is a map of the dimension sizes
            {{"g_out", GOUT},
             {"c_out", COUT},
             {"o_w", OS},
             {"k_w", KS},
             {"g_in", GIN},
             {"c_in", CIN},
             {"o_h", OS},
             {"k_h", KS},
             {"i_h", IS},
             {"i_w", IS}},
            // Vars of C (other variables are reduction variables)
            {"g_out", "c_out", "o_w", "o_h"},
            // Variables of A, note that i_w and i_h are not used
            {"g_in", "c_in", "i_w", "i_h"},
            // Variables of B
            {"g_out", "g_in", "c_in", "c_out", "k_w", "k_h"},
            // C's strides for each variable
            {{"g_out", OS * OS * COUT},
             {"o_h", OS * COUT},
             {"o_w", COUT},
             {"c_out", 1}},
            // A's strides for each variable. Note how we provide strides
            // for o_h/k_h and o_w/k_w rather than i_h/i_w: this is because
            // the access to A is based on output and reduction variables.
            {{"g_in", IS * IS * CIN},
             {"o_h", IS * CIN},
             {"k_h", IS * CIN},
             {"o_w", CIN},
             {"k_w", CIN},
             {"c_in", 1}},
            // B's strides for each variable
            {{"g_in", COUT * KS * KS * CIN * GOUT},
             {"g_out", COUT * KS * KS * CIN},
             {"c_in", COUT * KS * KS},
             {"k_h", COUT * KS},
             {"k_w", COUT},
             {"c_out", 1}});
    }
}
133 | 


--------------------------------------------------------------------------------
/apps/loop_nest_bench.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/arithmetic_operation.hpp"
 9 | #include "dabun/loop_nest.hpp"
10 | #include "dabun/random_vector.hpp"
11 | 
12 | #include <sysml/measure.hpp>
13 | 
14 | #include <functional>
15 | #include <map>
16 | #include <set>
17 | #include <string>
18 | #include <utility>
19 | #include <vector>
20 | 
21 | namespace dabun
22 | {
23 | 
24 | template <class ISA>
25 | void loop_nest_bench(std::vector<std::pair<std::string, int>> const& order,
26 |                      std::map<std::string, int> const&               sizes,
27 |                      std::set<std::string> const&                    C_formula,
28 |                      std::set<std::string> const&                    A_formula,
29 |                      std::set<std::string> const&                    B_formula,
30 |                      std::map<std::string, int> const&               C_strides,
31 |                      std::map<std::string, int> const&               A_strides,
32 |                      std::map<std::string, int> const& B_strides, int alpha = 0,
33 |                      int max_unrolled_fmas = 320, int total_iterations = 100)
34 | {
35 |     std::int64_t C_size = 1;
36 |     std::int64_t A_size = 1;
37 |     std::int64_t B_size = 1;
38 | 
39 |     double flops = 2.0;
40 | 
41 |     for (auto const& s : sizes)
42 |     {
43 |         if (C_strides.count(s.first))
44 |             C_size += (s.second - 1) * C_strides.at(s.first);
45 |         if (A_strides.count(s.first))
46 |             A_size += (s.second - 1) * A_strides.at(s.first);
47 |         if (B_strides.count(s.first))
48 |             B_size += (s.second - 1) * B_strides.at(s.first);
49 |         if (C_strides.count(s.first) || B_strides.count(s.first) ||
50 |             A_strides.count(s.first))
51 |             flops *= s.second;
52 |     }
53 | 
54 |     auto A  = get_random_vector<float>(A_size);
55 |     auto B  = get_random_vector<float>(B_size);
56 |     auto CN = get_random_vector<float>(C_size);
57 | 
58 |     auto jit_fn = loop_nest_code_generator<ISA>(
59 |                       order, sizes, C_formula, A_formula, B_formula, C_strides,
60 |                       A_strides, B_strides, dabun::fma, max_unrolled_fmas)
61 |                       .get_shared();
62 | 
63 |     jit_fn.save_to_file("zi.asm");
64 | 
65 |     auto secs = sysml::measure_fastest(
66 |         [&]() { jit_fn(CN.data(), A.data(), B.data(), alpha); },
67 |         total_iterations);
68 | 
69 |     double gflops = flops / 1000000000;
70 | 
71 |     std::cout << "gflops: " << gflops << "\n";
72 | 
73 |     std::cout << "GFLOPS: " << (gflops / secs) << "\n";
74 | }
75 | 
76 | } // namespace dabun
77 | 


--------------------------------------------------------------------------------
/apps/loop_nest_stress_test.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #include "baselines.hpp"
  7 | #include "loop_nest_baseline.hpp"
  8 | 
  9 | #include "dabun/arithmetic_operation.hpp"
 10 | #include "dabun/check.hpp"
 11 | #include "dabun/isa.hpp"
 12 | #include "dabun/loop_nest.hpp"
 13 | #include "dabun/random_vector.hpp"
 14 | 
 15 | #include <algorithm>
 16 | #include <cassert>
 17 | #include <chrono>
 18 | #include <iostream>
 19 | #include <map>
 20 | #include <numeric>
 21 | #include <optional>
 22 | #include <random>
 23 | #include <set>
 24 | #include <string>
 25 | #include <vector>
 26 | 
 27 | #ifndef DABUN_ISA
 28 | #define DABUN_ISA avx2
 29 | #endif
 30 | 
 31 | #ifndef DABUN_ARITHMETIC
 32 | #define DABUN_ARITHMETIC float
 33 | #endif
 34 | 
 35 | #ifndef DABUN_VEX
 36 | #if defined(DABUN_ARCH_AARCH64)
 37 | #define DABUN_VEX ::dabun::extension::neon
 38 | #else
 39 | #define DABUN_VEX ::dabun::extension::avx2
 40 | #endif
 41 | #endif
 42 | 
 43 | int main()
 44 | {
 45 |     using namespace dabun;
 46 | 
 47 |     using float_t = DABUN_ARITHMETIC;
 48 | 
 49 |     for (int rounds = 0; rounds < 1000000; ++rounds)
 50 |     {
 51 |         int ArCr              = (1 << rand() % 10) + rand() % 16;
 52 |         int AcBr              = (1 << rand() % 10) + rand() % 16;
 53 |         int BcCc              = (1 << rand() % 10) + rand() % 16;
 54 |         int max_fmas_unrolled = 1 << (rand() % 10);
 55 | 
 56 |         std::vector<std::pair<std::string, int>> order = {
 57 |             {"AcBr", 1}, {"BcCc", 1}, {"ArCr", 1}};
 58 | 
 59 |         std::vector<std::pair<std::string, int>> hyper_order = {
 60 |             {"AcBr", (rand() % AcBr) + 2},  // It's OK to go oversize
 61 |                                             // (tests whether it's
 62 |                                             // handled appropriately)
 63 |             {"BcCc", (rand() % BcCc) + 2},  // - || -
 64 |             {"ArCr", (rand() % ArCr) + 2}}; // - || -
 65 | 
 66 |         std::sort(hyper_order.begin(), hyper_order.end());
 67 | 
 68 |         do
 69 |         {
 70 |             std::sort(order.begin(), order.end());
 71 |             do
 72 |             {
 73 |                 auto full_order = hyper_order;
 74 |                 full_order.insert(full_order.end(), order.begin(), order.end());
 75 | 
 76 |                 std::cout << "DIF: ORDER: ";
 77 |                 for (auto& o : full_order)
 78 |                 {
 79 |                     if (o.first == full_order.back().first)
 80 |                     {
 81 |                         if (o.second != 1)
 82 |                         {
 83 |                             o.second = round_up(
 84 |                                 o.second,
 85 |                                 isa_traits<extension_to_deprecated_ISA_t<
 86 |                                         DABUN_VEX>>::vector_size *
 87 |                                     4 / sizeof(float_t));
 88 |                         }
 89 |                     }
 90 |                     std::cout << o.first << "=" << o.second << "  ";
 91 |                 }
 92 | 
 93 |                 std::cout << "ArCr=" << ArCr << " ";
 94 |                 std::cout << "AcBr=" << AcBr << " ";
 95 |                 std::cout << "BcCc=" << BcCc << " ";
 96 | 
 97 |                 std::cout << "MU=" << max_fmas_unrolled << std::endl;
 98 | 
 99 |                 auto fn =
100 |                     loop_nest_compiler<DABUN_VEX, float_t>(
101 |                         full_order, // The second argument is a map of the
102 |                                     // dimension sizes
103 |                         {{"AcBr", AcBr}, {"ArCr", ArCr}, {"BcCc", BcCc}},
104 |                         // Vars of C (other variables are reduction variables)
105 |                         {"ArCr", "BcCc"},
106 |                         // Variables of A
107 |                         {"ArCr", "AcBr"},
108 |                         // Variables of B
109 |                         {"AcBr", "BcCc"},
110 |                         // C's strides for each variable.
111 |                         {{"ArCr", BcCc}, {"BcCc", 1}},
112 |                         // A's strides for each variable
113 |                         {{"ArCr", AcBr}, {"AcBr", 1}},
114 |                         // B's strides for each variable
115 |                         {{"AcBr", BcCc}, {"BcCc", 1}}, dabun::fma,
116 |                         max_fmas_unrolled, nullptr)
117 |                         .get_shared();
118 | 
119 |                 auto A = get_random_vector<float_t>(AcBr * ArCr);
120 |                 auto B = get_random_vector<float_t>(AcBr * BcCc);
121 | 
122 |                 auto CN = get_random_vector<float_t>(ArCr * BcCc);
123 |                 auto CJ = CN;
124 | 
125 |                 baseline_MM(ArCr, AcBr, BcCc, AcBr, 1, BcCc, 1, BcCc, 1,
126 |                             A.data(), B.data(), CN.data(), 1);
127 | 
128 |                 // apply_relu(CN.data(), CN.data() + CN.size());
129 | 
130 |                 fn(CJ.data(), A.data(), B.data(), 1);
131 | 
132 |                 auto madiff = max_abs_difference(
133 |                     CJ.data(), CJ.data() + ArCr * BcCc, CN.data());
134 | 
135 |                 std::cout << "MAXABSDIFF: " << madiff << std::endl;
136 | 
137 |                 // assert(madiff < 0.001);
138 |             } while (std::next_permutation(order.begin(), order.end()));
139 |         } while (std::next_permutation(hyper_order.begin(), hyper_order.end()));
140 |     }
141 | }
142 | 


--------------------------------------------------------------------------------
/apps/loop_nest_tensillica.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #include <vector>
  7 | 
  8 | #include <sysml/measure.hpp>
  9 | 
 10 | #include "dabun/check.hpp"
 11 | #include "dabun/random_vector.hpp"
 12 | #include "dabun/tensillica/loop_nest.hpp"
 13 | 
 14 | #include "baselines.hpp"
 15 | 
 16 | int main()
 17 | {
 18 |     using namespace dabun;
 19 | 
 20 |     // Playing with weird schedules
 21 |     // Matrix-Matrix product
 22 |     // C(r, c) = A(r, k) * B(k, c)
 23 |     // if (0)
 24 |     {
 25 |         int ArCr = 324;
 26 |         int AcBr = 124;
 27 |         int BcCc = 54;
 28 | 
 29 |         auto gen_loop_nest = [&]()
 30 |         {
 31 |             return dabun::tensillica::loop_nest_code_generator(
 32 |                        // The first argument is the loop order in the form of
 33 |                        // {dimension, stride}.  For now the outer dimension
 34 |                        // has to divide the stride.  This is effectively the
 35 |                        // same as Halide's split into outer and inner
 36 |                        // variable, but can have arbitray number of splits.
 37 |                        { {"ArCr", 6},
 38 |                          {"BcCc", 16},
 39 |                          {"AcBr", 4},
 40 |                         {"AcBr", 1},
 41 |                         {"ArCr", 1},
 42 |                         {"BcCc", 1}},
 43 |                        // The second argument is a map of the dimension sizes
 44 |                        {{"AcBr", AcBr}, {"ArCr", ArCr}, {"BcCc", BcCc}},
 45 |                        // Vars of C (other variables are reduction variables)
 46 |                        {"ArCr", "BcCc"},
 47 |                        // Variables of A
 48 |                        {"ArCr", "AcBr"},
 49 |                        // Variables of B
 50 |                        {"AcBr", "BcCc"},
 51 |                        // C's strides for each variable.  Note that the
 52 |                        // strides data is a superset of the previous argument
 53 |                        // (variables of C).  I'm still deciding on the final
 54 |                        // design, possibly allowing for null strides that
 55 |                        // will just deduce them from the sizes, or some
 56 |                        // special structs indicating the layout (ie
 57 |                        // row-major, col-major).  In this case the vars have
 58 |                        // to be ordered though... Many decisions to make...
 59 |                        {{"ArCr", BcCc}, {"BcCc", 1}},
 60 |                        // A's strides for each variable
 61 |                        {{"ArCr", AcBr}, {"AcBr", 1}},
 62 |                        // B's strides for each variable
 63 |                        {{"AcBr", BcCc}, {"BcCc", 1}}, nullptr)
 64 |                 .get_shared();
 65 |         };
 66 | 
 67 |         auto compile_secs = sysml::measure_fastest(gen_loop_nest, 1);
 68 |         std::cout << "Compile: " << compile_secs << std::endl;
 69 | 
 70 |         auto fn = gen_loop_nest();
 71 |         // fn.save_to_file("zi.asm");
 72 |         // fn.register_perf("fn1");
 73 | 
 74 |         auto A = get_random_vector<float>(AcBr * ArCr + 1024);
 75 |         auto B = get_random_vector<float>(AcBr * BcCc + 1024);
 76 | 
 77 |         auto CN = get_random_vector<float>(ArCr * BcCc + 1024);
 78 |         auto CJ = CN;
 79 | 
 80 |         baseline_MM(ArCr, AcBr, BcCc, AcBr, BcCc, BcCc, A.data(), B.data(),
 81 |                     CN.data(), 1);
 82 | 
 83 |         fn(CJ.data(), A.data(), B.data(), 1);
 84 |         // apply_relu(CN.data(), CN.data() + CN.size());
 85 | 
 86 |         std::cout << "MAXABSDIFF: "
 87 |                   << max_abs_difference(CJ.data(), CJ.data() + ArCr * BcCc,
 88 |                                         CN.data())
 89 |                   << "\n";
 90 | 
 91 |         auto secs = sysml::measure_fastest(
 92 |             [&]() { fn(CJ.data(), A.data(), B.data(), 0); }, 10);
 93 | 
 94 |         double gflops = 1.0 * AcBr * ArCr * BcCc * 2 / 1000000000;
 95 | 
 96 |         std::cout << "GFLOPS: " << (gflops / secs) << "\n";
 97 | 
 98 |         // bench_implementation_fmas_per_cycle(
 99 |         //     fn, AcBr * ArCr, AcBr * BcCc, ArCr * BcCc,
100 |         //     1.0 * AcBr * ArCr * BcCc * 2, 10, 10);
101 |     }
102 | }
103 | 


--------------------------------------------------------------------------------
/apps/loop_nest_test.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/arithmetic_operation.hpp"
 9 | #include "dabun/check.hpp"
10 | #include "dabun/loop_nest.hpp"
11 | #include "dabun/random_vector.hpp"
12 | #include "loop_nest_baseline.hpp"
13 | 
14 | #include <functional>
15 | #include <map>
16 | #include <set>
17 | #include <string>
18 | #include <utility>
19 | #include <vector>
20 | 
21 | namespace dabun
22 | {
23 | 
24 | template <class ISA>
25 | void test_loop_nest_against_slow_baseline(
26 |     std::vector<std::pair<std::string, int>> const& order,
27 |     std::map<std::string, int> const&               sizes,
28 |     std::set<std::string> const&                    C_formula,
29 |     std::set<std::string> const&                    A_formula,
30 |     std::set<std::string> const&                    B_formula,
31 |     std::map<std::string, int> const&               C_strides,
32 |     std::map<std::string, int> const&               A_strides,
33 |     std::map<std::string, int> const& B_strides, int max_unrolled_fmas = 512,
34 |     int alpha = 1)
35 | {
36 |     std::int64_t C_size = 1;
37 |     std::int64_t A_size = 1;
38 |     std::int64_t B_size = 1;
39 | 
40 |     alpha = alpha ? 1 : 0;
41 | 
42 |     for (auto const& s : sizes)
43 |     {
44 |         if (C_strides.count(s.first))
45 |             C_size += (s.second - 1) * C_strides.at(s.first);
46 |         if (A_strides.count(s.first))
47 |             A_size += (s.second - 1) * A_strides.at(s.first);
48 |         if (B_strides.count(s.first))
49 |             B_size += (s.second - 1) * B_strides.at(s.first);
50 |     }
51 | 
52 |     auto A  = get_random_vector<float>(A_size);
53 |     auto B  = get_random_vector<float>(B_size);
54 |     auto CN = get_random_vector<float>(C_size);
55 |     auto CJ = CN;
56 | 
57 |     auto jit_fn = loop_nest_code_generator<ISA>(
58 |                       order, sizes, C_formula, A_formula, B_formula, C_strides,
59 |                       A_strides, B_strides, fma, max_unrolled_fmas)
60 |                       .get_shared();
61 | 
62 |     jit_fn.save_to_file("zi.asm");
63 | 
64 |     auto baseline_fn =
65 |         loop_nest_baseline(order, sizes, C_formula, A_formula, B_formula,
66 |                            C_strides, A_strides, B_strides, false);
67 | 
68 |     jit_fn(CJ.data(), A.data(), B.data(), alpha);
69 |     baseline_fn(CN.data(), A.data(), B.data(), alpha);
70 | 
71 |     std::cout << "MAXABSDIFF: ( " << C_size << " ) "
72 |               << max_abs_difference(CJ.data(), CJ.data() + C_size, CN.data())
73 |               << "\n";
74 | }
75 | 
76 | } // namespace dabun
77 | 


--------------------------------------------------------------------------------
/apps/serialization.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #include <iostream>
 7 | #include <map>
 8 | #include <string>
 9 | #include <vector>
10 | 
11 | #include "dabun/serialization.hpp"
12 | 
13 | int main()
14 | {
15 |     int  ArCr = 256;
16 |     int  AcBr = 256;
17 |     int  BcCc = 256;
18 |     auto s    = dabun::serialized_loop_nest_inputs(
19 |         // The first argument is the loop order in the form of
20 |         // {dimension, stride}.  For now the outer dimension
21 |         // has to divide the stride.  This is effectively the
22 |         // same as Halide's split into outer and inner
23 |         // variable, but can have arbitray number of splits.
24 |         {{"AcBr", 256},
25 |          {"ArCr", 3},
26 |          {"BcCc", 16},
27 |          {"AcBr", 1},
28 |          {"AcBr", 1},
29 |          {"ArCr", 1},
30 |          {"BcCc", 1}},
31 |         // The second argument is a map of the dimension sizes
32 |         {{"AcBr", AcBr}, {"ArCr", ArCr}, {"BcCc", BcCc}},
33 |         // Vars of C (other variables are reduction variables)
34 |         {"ArCr", "BcCc"},
35 |         // Variables of A
36 |         {"ArCr", "AcBr"},
37 |         // Variables of B
38 |         {"AcBr", "BcCc"},
39 |         // C's strides for each variable.  Note that the
40 |         // strides data is a superset of the previous argument
41 |         // (variables of C).  I'm still deciding on the final
42 |         // design, possibly allowing for null strides that
43 |         // will just deduce them from the sizes, or some
44 |         // special structs indicating the layout (ie
45 |         // row-major, col-major).  In this case the vars have
46 |         // to be ordered though... Many decisions to make...
47 |         {{"ArCr", BcCc}, {"BcCc", 1}},
48 |         // A's strides for each variable
49 |         {{"ArCr", AcBr}, {"AcBr", 1}},
50 |         // B's strides for each variable
51 |         {{"AcBr", BcCc}, {"BcCc", 1}}, 1024);
52 |     auto str_rep = s.str();
53 |     std::cout << str_rep << std::endl;
54 | 
55 |     auto s2 =
56 |         dabun::serialized_loop_nest_inputs::from_str(str_rep);
57 |     auto str_rep2 = s2.str();
58 |     std::cout << str_rep2 << std::endl;
59 | 
60 |     std::ofstream out("jose_test.txt");
61 |     out << str_rep2;
62 |     out.close();
63 | 
64 |     auto s3 = dabun::serialized_loop_nest_inputs::from_file(
65 |         "jose_test.txt");
66 |     auto str_rep3 = s3.str();
67 |     std::cout << str_rep3 << std::endl;
68 | }
69 | 


--------------------------------------------------------------------------------
/apps/tensillica_play.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #include "dabun/tensillica/peak_gflops.hpp"
 7 | #include "sysml/code_generator/code_generated_fn.hpp"
 8 | #include <cstdlib>
 9 | #include <fstream>
10 | #include <iostream>
11 | 
12 | #include <cstdio>
13 | #include <dlfcn.h>
14 | 
15 | #include <type_traits>
16 | 
17 | int main()
18 | {
19 |     std::cout << "zi";
20 |     std::cout << std::endl;
21 | 
22 |     auto fn = dabun::tensillica::peak_gflops().get_shared();
23 | 
24 |     if (fn)
25 |     {
26 | 
27 |         float in[20]  = {0.5f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f,
28 |                         0.5f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f};
29 |         float out[10] = {0.f};
30 | 
31 |         std::cout << fn(in, out, dabun::tensillica::dl_func_arg_cast<int>(3.14f)) << "\n";
32 | 
33 |         for (int i = 0; i < 10; ++i)
34 |         {
35 |             std::cout << out[i] << " --------------------\n";
36 |         }
37 |     }
38 |     else
39 |     {
40 |         std::cout << "Can't get fn_ptr\n";
41 |     }
42 | 
43 |     // dlclose(dlh);
44 | }
45 | 


--------------------------------------------------------------------------------
/apps/transposer.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #include "dabun/transposer.hpp"
 7 | #include "dabun/check.hpp"
 8 | #include "dabun/random_vector.hpp"
 9 | #include "transposer_baseline.hpp"
10 | #include "transposer_bench.hpp"
11 | 
12 | #ifndef DABUN_ARITHMETIC
13 | #define DABUN_ARITHMETIC float
14 | #endif
15 | 
16 | int main()
17 | {
18 |     using float_t = DABUN_ARITHMETIC;
19 | 
20 |     using namespace dabun;
21 | 
22 |     // for (int i = 0; i < 10; ++i)
23 |     {
24 | 
25 |         int R = 11;
26 |         int C = 13;
27 | 
28 |         auto A  = get_random_vector<float_t>(R * C);
29 |         auto B  = get_random_vector<float_t>(R * C);
30 |         auto BJ = get_random_vector<float_t>(R * C);
31 | 
32 |         // ArCr=12 AcBr=6 ORDER: ArCr,12 :: AcBr,5 :: AcBr,4 :: ArCr,4 :: AcBr,1
33 |         // :: ArCr,1 :: MU=32
34 | 
35 |         auto transpose = transposer_baseline<float_t>(
36 |             // Order
37 |             {{"C", 13}, {"R", 16}, {"C", 9}, {"R", 16}, {"C", 1}, {"R", 1}},
38 |             // Sizes
39 |             {{"R", R}, {"C", C}},
40 |             // Out Strides
41 |             {{"R", C}, {"C", 1}},
42 |             // In Strides
43 |             {{"R", 1}, {"C", R}});
44 | 
45 |         auto transpose_jit =
46 |             transposer_compiler<DABUN_VEX, float_t>(
47 |                 {{"C", 13}, {"R", 16}, {"C", 9}, {"R", 16}, {"C", 1}, {"R", 1}},
48 |                 // Sizes
49 |                 {{"R", R}, {"C", C}},
50 |                 // Out Strides
51 |                 {{"R", C}, {"C", 1}},
52 |                 // In Strides
53 |                 {{"R", 1}, {"C", R}}, 32)
54 |                 .get_shared();
55 | 
56 |         transpose(B.data(), A.data());
57 | 
58 |         transpose_jit.save_to_file("zi.asm");
59 | 
60 |         transpose_jit(BJ.data(), A.data());
61 | 
62 |         std::cout << "MAXABSDIFF: "
63 |                   << max_abs_difference(BJ.data(), BJ.data() + R * C, B.data())
64 |                   << "\n";
65 |     }
66 | }
67 | 


--------------------------------------------------------------------------------
/apps/transposer_baseline.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include <functional>
  9 | #include <map>
 10 | #include <set>
 11 | #include <string>
 12 | #include <utility>
 13 | #include <vector>
 14 | 
 15 | namespace dabun
 16 | {
 17 | 
 18 | template <class Arithmetic>
 19 | auto transposer_baseline(std::vector<std::pair<std::string, int>> const& order,
 20 |                          std::map<std::string, int> const&               sizes,
 21 |                          std::map<std::string, int> const& out_strides,
 22 |                          std::map<std::string, int> const& in_strides)
 23 | {
 24 |     // Just optimizing out the map lookups.
 25 | 
 26 |     std::map<std::string, int> var_to_id;
 27 | 
 28 |     int next = 0;
 29 |     for (auto const& s : sizes)
 30 |     {
 31 |         var_to_id[s.first] = next++;
 32 |     }
 33 | 
 34 |     std::vector<int> initial_limits(next);
 35 | 
 36 |     for (auto const& s : sizes)
 37 |     {
 38 |         initial_limits[var_to_id[s.first]] = s.second;
 39 |     }
 40 | 
 41 |     std::vector<int> order_ids(order.size());
 42 |     std::vector<int> order_delta(order.size());
 43 |     std::vector<int> order_in_strides(order.size());
 44 |     std::vector<int> order_out_strides(order.size());
 45 | 
 46 |     for (int i = 0; i < order.size(); ++i)
 47 |     {
 48 |         order_ids[i]   = var_to_id[order[i].first];
 49 |         order_delta[i] = order[i].second;
 50 | 
 51 |         order_in_strides[i] =
 52 |             in_strides.count(order[i].first)
 53 |                 ? in_strides.at(order[i].first) * order_delta[i]
 54 |                 : 0;
 55 |         order_out_strides[i] =
 56 |             out_strides.count(order[i].first)
 57 |                 ? out_strides.at(order[i].first) * order_delta[i]
 58 |                 : 0;
 59 |     }
 60 | 
 61 |     return [=](Arithmetic* out_ptr, Arithmetic const* in_ptr) {
 62 |         auto limits = initial_limits;
 63 | 
 64 |         std::function<void(Arithmetic*, Arithmetic const*, int)>
 65 |             recursive_compute =
 66 |                 [&](Arithmetic* out, Arithmetic const* in, int order_depth) {
 67 |                     if (order_depth == order_ids.size())
 68 |                     {
 69 |                         out[0] = in[0];
 70 |                     }
 71 |                     else
 72 |                     {
 73 |                         auto var   = order_ids[order_depth];
 74 |                         auto delta = order_delta[order_depth];
 75 |                         auto limit = limits[var];
 76 |                         auto full  = limit / delta;
 77 |                         auto rest  = limit % delta;
 78 | 
 79 |                         auto save = std::exchange(limits[var], delta);
 80 |                         for (int i = 0; i < full; ++i)
 81 |                         {
 82 |                             recursive_compute(out, in, order_depth + 1);
 83 |                             in += order_in_strides[order_depth];
 84 |                             out += order_out_strides[order_depth];
 85 |                         }
 86 |                         limits[var] = save;
 87 | 
 88 |                         if (rest)
 89 |                         {
 90 |                             int s = std::exchange(limits[var], rest);
 91 |                             recursive_compute(out, in, order_depth + 1);
 92 |                             limits[var] = s;
 93 |                         }
 94 |                     }
 95 |                 };
 96 | 
 97 |         recursive_compute(out_ptr, in_ptr, 0);
 98 |     };
 99 | }
100 | 
101 | } // namespace dabun
102 | 


--------------------------------------------------------------------------------
/apps/transposer_bench.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #include "dabun/transposer.hpp"
 7 | 
 8 | #include <sysml/measure.hpp>
 9 | 
10 | #include <algorithm>
11 | #include <cassert>
12 | #include <chrono>
13 | #include <iostream>
14 | #include <map>
15 | #include <numeric>
16 | #include <optional>
17 | #include <random>
18 | #include <set>
19 | #include <string>
20 | #include <vector>
21 | 
22 | namespace dabun
23 | {
24 | 
25 | template <class ISA>
26 | void transposer_bench(std::vector<std::pair<std::string, int>> const& order,
27 |                       std::map<std::string, int> const&               sizes,
28 |                       std::map<std::string, int> const& out_strides,
29 |                       std::map<std::string, int> const& in_strides,
30 |                       int max_unrolled_fmas = 320, int total_iterations = 100)
31 | 
32 | {
33 |     auto total_moved_bytes =
34 |         std::accumulate(sizes.begin(), sizes.end(), 1,
35 |                         [](auto a, auto b) { return a * b.second; }) *
36 |         4;
37 | 
38 |     std::int64_t in_size  = 1;
39 |     std::int64_t out_size = 1;
40 | 
41 |     for (auto const& s : sizes)
42 |     {
43 |         in_size += (s.second - 1) * in_strides.at(s.first);
44 |         out_size += (s.second - 1) * out_strides.at(s.first);
45 |     }
46 | 
47 |     auto A = get_random_vector<float>(in_size);
48 |     auto B = get_random_vector<float>(out_size);
49 | 
50 |     auto jit_fn = transposer_code_generator<ISA>(order, sizes, out_strides,
51 |                                                  in_strides, max_unrolled_fmas)
52 |                       .get_unique();
53 | 
54 |     jit_fn.save_to_file("zi.asm");
55 | 
56 |     auto secs = sysml::measure_fastest([&]() { jit_fn(B.data(), A.data()); },
57 |                                        total_iterations);
58 | 
59 |     double moved_gbytes = 1.0 * total_moved_bytes / 1000000000;
60 | 
61 |     std::cout << "GBPS: " << (moved_gbytes / secs) << "\n";
62 |     std::cout << "MSEC: " << (secs / 1000) << "\n";
63 | }
64 | 
65 | } // namespace dabun
66 | 


--------------------------------------------------------------------------------
/apps/transposer_stress_test.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #include "dabun/check.hpp"
  7 | #include "dabun/random_vector.hpp"
  8 | #include "dabun/transposer.hpp"
  9 | #include "transposer_baseline.hpp"
 10 | 
 11 | #include <algorithm>
 12 | #include <cassert>
 13 | #include <chrono>
 14 | #include <iostream>
 15 | #include <map>
 16 | #include <numeric>
 17 | #include <optional>
 18 | #include <random>
 19 | #include <set>
 20 | #include <string>
 21 | #include <vector>
 22 | 
 23 | #ifndef DABUN_ARITHMETIC
 24 | #define DABUN_ARITHMETIC float
 25 | #endif
 26 | 
  27 | int main() // Randomized stress test: each round picks a random transposer configuration,
  28 | { // runs transposer_baseline and the transposer_compiler result on the same input, and asserts that the two outputs agree.
  29 |     using namespace dabun;
  30 | 
  31 |     using float_t = DABUN_ARITHMETIC; // Element type under test; float unless DABUN_ARITHMETIC was defined before this file.
  32 | 
  33 |     srand(0); // Constant seed, so every run draws the same rand() sequence.
  34 | 
  35 |     for (int rounds = 0; rounds < 1000000; ++rounds)
  36 |     {
  37 |         int ArCr         = (1 << rand() % 2) + rand() % 16; // (1 or 2) + [0, 15], i.e. 1..17; `%` binds tighter than `<<`.
  38 |         int AcBr         = (1 << rand() % 2) + rand() % 16; // Drawn the same way as ArCr.
  39 |         int max_unrolled = 1 << (rand() % 8); // Power of two in [1, 128].
  40 | 
  41 |         std::vector<std::pair<std::string, int>> order = {{"AcBr", 1}, // (dimension name, int) pairs; the int is presumably the loop step — TODO confirm.
  42 |                                                           {"ArCr", 1}};
  43 | 
  44 |         std::vector<std::pair<std::string, int>> hyper_order = { // Four extra entries whose int is random in [2, dim + 1].
  45 |             {"AcBr", (rand() % AcBr) + 2},  // It's OK to go oversize
  46 |                                             // (tests whether it's
  47 |                                             // handled appropriately)
  48 |             {"ArCr", (rand() % ArCr) + 2},  // - || -
  49 |             {"AcBr", (rand() % AcBr) + 2},  // - || -
  50 |             {"ArCr", (rand() % ArCr) + 2}}; // - || -
  51 | 
  52 |         std::sort(hyper_order.begin(), hyper_order.end(), // Arrange the random entries by descending int value.
  53 |                   [](auto a, auto b) { return a.second > b.second; });
  54 | 
  55 |         {
  56 |             auto full_order = hyper_order; // full_order = sorted hyper_order followed by the two unit entries of order.
  57 |             full_order.insert(full_order.end(), order.begin(), order.end());
  58 | 
  59 |             std::cout << "DIF: ORDER: ";
  60 |             for (auto& o : full_order) // Non-const reference: entries may be adjusted in place before being printed.
  61 |             {
  62 |                 if (o.first == full_order.back().first) // Same dimension name as the last entry ("ArCr", since order ends with it).
  63 |                 {
  64 |                     if (o.second != 1) // The unit entries are left untouched.
  65 |                     {
  66 |                         o.second = round_up(o.second, // presumably rounds up to a multiple of the second argument — TODO confirm.
  67 |                                             isa_traits<DABUN_ISA>::vector_size *
  68 |                                                 4 / sizeof(float_t));
  69 |                     }
  70 |                 }
  71 |                 std::cout << o.first << "=" << o.second << "  ";
  72 |             }
  73 | 
  74 |             std::cout << "ArCr=" << ArCr << " "; // Configuration is logged before the functions are built and run.
  75 |             std::cout << "AcBr=" << AcBr << " ";
  76 | 
  77 |             std::cout << "MU=" << max_unrolled << std::endl;
  78 | 
  79 |             auto fn_baselome = transposer_baseline<float_t>( // Its output (CN) is what fn's output (CJ) is compared against below.
  80 |                 full_order, // The second argument is a
  81 |                             // map of the dimension sizes
  82 |                 {{"AcBr", AcBr}, {"ArCr", ArCr}},
  83 |                 // out's strides for each variable.
  84 |                 {{"ArCr", AcBr}, {"AcBr", 1}},
  85 |                 // in's strides for each variable
  86 |                 {{"ArCr", 1}, {"AcBr", ArCr}});
  87 | 
  88 |             auto fn = transposer_compiler<DABUN_VEX, float_t>( // Built from the same order, sizes and strides, plus max_unrolled.
  89 |                           full_order, // The second argument is a map of the
  90 |                                       // dimension sizes
  91 |                           {{"AcBr", AcBr}, {"ArCr", ArCr}},
  92 |                           // out's strides for each variable.
  93 |                           {{"ArCr", AcBr}, {"AcBr", 1}},
  94 |                           // in's strides for each variable
  95 |                           {{"ArCr", 1}, {"AcBr", ArCr}}, max_unrolled)
  96 |                           .get_shared();
  97 | 
  98 |             fn.save_to_file("zi.asm"); // Called every round with the same file name; presumably dumps the generated code — TODO confirm.
  99 | 
 100 |             auto A  = get_random_vector<float_t>(AcBr * ArCr); // Passed as the second argument to both functions (presumably the input).
 101 |             auto CN = get_random_vector<float_t>(ArCr * AcBr); // Passed as the first argument to fn_baselome.
 102 |             auto CJ = CN; // First argument to fn; starts as a copy of CN, so elements neither function writes still compare equal.
 103 | 
 104 |             fn_baselome(CN.data(), A.data());
 105 |             fn(CJ.data(), A.data());
 106 | 
 107 |             auto madiff = max_abs_difference(CJ.data(), CJ.data() + ArCr * AcBr, // presumably the largest |CJ[i] - CN[i]| over the range — TODO confirm.
 108 |                                              CN.data());
 109 | 
 110 |             std::cout << "ArCr=" << ArCr << " "; // Configuration is logged again next to the result.
 111 |             std::cout << "AcBr=" << AcBr << " ";
 112 | 
 113 |             std::cout << "ORDER: ";
 114 | 
 115 |             for (auto const& o : full_order)
 116 |             {
 117 |                 std::cout << o.first << ',' << o.second << " :: ";
 118 |             }
 119 | 
 120 |             std::cout << "\n";
 121 | 
 122 |             std::cout << "MU=" << max_unrolled << std::endl;
 123 |             std::cout << "MAXABSDIFF: " << madiff << std::endl;
 124 | 
 125 |             assert(madiff < 0.000001); // NOTE(review): assert is compiled out when NDEBUG is defined, so such builds verify nothing.
 126 |         }
 127 |     }
 128 | }
129 | 


--------------------------------------------------------------------------------
/assets/logo/fulllogo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/fulllogo.jpg


--------------------------------------------------------------------------------
/assets/logo/fulllogo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/fulllogo.png


--------------------------------------------------------------------------------
/assets/logo/fulllogo_nobuffer.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/fulllogo_nobuffer.jpg


--------------------------------------------------------------------------------
/assets/logo/fulllogo_nobuffer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/fulllogo_nobuffer.png


--------------------------------------------------------------------------------
/assets/logo/fulllogo_transparent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/fulllogo_transparent.png


--------------------------------------------------------------------------------
/assets/logo/fulllogo_transparent_nobuffer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/fulllogo_transparent_nobuffer.png


--------------------------------------------------------------------------------
/assets/logo/grayscale.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/grayscale.png


--------------------------------------------------------------------------------
/assets/logo/grayscale_nobuffer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/grayscale_nobuffer.png


--------------------------------------------------------------------------------
/assets/logo/grayscale_transparent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/grayscale_transparent.png


--------------------------------------------------------------------------------
/assets/logo/grayscale_transparent_nobuffer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/grayscale_transparent_nobuffer.png


--------------------------------------------------------------------------------
/assets/logo/icononly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/icononly.png


--------------------------------------------------------------------------------
/assets/logo/icononly_nobuffer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/icononly_nobuffer.png


--------------------------------------------------------------------------------
/assets/logo/icononly_transparent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/icononly_transparent.png


--------------------------------------------------------------------------------
/assets/logo/icononly_transparent_nobuffer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/icononly_transparent_nobuffer.png


--------------------------------------------------------------------------------
/assets/logo/icononly_transparent_nobuffer_padded.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/icononly_transparent_nobuffer_padded.png


--------------------------------------------------------------------------------
/assets/logo/print.eps:
--------------------------------------------------------------------------------
  1 | %!PS-Adobe-3.0 EPSF-3.0
  2 | %%Creator: cairo 1.16.0 (https://cairographics.org)
  3 | %%CreationDate: Fri Sep 24 21:07:31 2021
  4 | %%Pages: 1
  5 | %%DocumentData: Clean7Bit
  6 | %%LanguageLevel: 2
  7 | %%BoundingBox: 0 0 960 768
  8 | %%EndComments
  9 | %%BeginProlog
 10 | 50 dict begin
 11 | /q { gsave } bind def
 12 | /Q { grestore } bind def
 13 | /cm { 6 array astore concat } bind def
 14 | /w { setlinewidth } bind def
 15 | /J { setlinecap } bind def
 16 | /j { setlinejoin } bind def
 17 | /M { setmiterlimit } bind def
 18 | /d { setdash } bind def
 19 | /m { moveto } bind def
 20 | /l { lineto } bind def
 21 | /c { curveto } bind def
 22 | /h { closepath } bind def
 23 | /re { exch dup neg 3 1 roll 5 3 roll moveto 0 rlineto
 24 |       0 exch rlineto 0 rlineto closepath } bind def
 25 | /S { stroke } bind def
 26 | /f { fill } bind def
 27 | /f* { eofill } bind def
 28 | /n { newpath } bind def
 29 | /W { clip } bind def
 30 | /W* { eoclip } bind def
 31 | /BT { } bind def
 32 | /ET { } bind def
 33 | /BDC { mark 3 1 roll /BDC pdfmark } bind def
 34 | /EMC { mark /EMC pdfmark } bind def
 35 | /cairo_store_point { /cairo_point_y exch def /cairo_point_x exch def } def
 36 | /Tj { show currentpoint cairo_store_point } bind def
 37 | /TJ {
 38 |   {
 39 |     dup
 40 |     type /stringtype eq
 41 |     { show } { -0.001 mul 0 cairo_font_matrix dtransform rmoveto } ifelse
 42 |   } forall
 43 |   currentpoint cairo_store_point
 44 | } bind def
 45 | /cairo_selectfont { cairo_font_matrix aload pop pop pop 0 0 6 array astore
 46 |     cairo_font exch selectfont cairo_point_x cairo_point_y moveto } bind def
 47 | /Tf { pop /cairo_font exch def /cairo_font_matrix where
 48 |       { pop cairo_selectfont } if } bind def
 49 | /Td { matrix translate cairo_font_matrix matrix concatmatrix dup
 50 |       /cairo_font_matrix exch def dup 4 get exch 5 get cairo_store_point
 51 |       /cairo_font where { pop cairo_selectfont } if } bind def
 52 | /Tm { 2 copy 8 2 roll 6 array astore /cairo_font_matrix exch def
 53 |       cairo_store_point /cairo_font where { pop cairo_selectfont } if } bind def
 54 | /g { setgray } bind def
 55 | /rg { setrgbcolor } bind def
 56 | /d1 { setcachedevice } bind def
 57 | /cairo_data_source {
 58 |   CairoDataIndex CairoData length lt
 59 |     { CairoData CairoDataIndex get /CairoDataIndex CairoDataIndex 1 add def }
 60 |     { () } ifelse
 61 | } def
 62 | /cairo_flush_ascii85_file { cairo_ascii85_file status { cairo_ascii85_file flushfile } if } def
 63 | /cairo_image { image cairo_flush_ascii85_file } def
 64 | /cairo_imagemask { imagemask cairo_flush_ascii85_file } def
 65 | %%EndProlog
 66 | %%BeginSetup
 67 | %%EndSetup
 68 | %%Page: 1 1
 69 | %%BeginPageSetup
 70 | %%PageBoundingBox: 0 0 960 768
 71 | %%EndPageSetup
 72 | q 0 0 960 768 rectclip
 73 | 1 0 0 -1 0 768 cm q
 74 | 0 g
 75 | 0 0 960 768 rectfill
 76 | 0.6 0.831373 0.12549 rg
 77 | 291.887 406.699 m 433.641 347.023 l 433.641 207.727 l 251.422 168.02 l 
 78 | 291.887 406.699 l f*
 79 | 0.176471 0.478431 0.784314 rg
 80 | 433.641 126.5 m 280.879 157.719 l 433.641 191.004 l h
 81 | 433.641 126.5 m f*
 82 | 0 0.678431 0.937255 rg
 83 | 526.543 384.84 m 526.543 490.191 l 653.219 420.633 l h
 84 | 526.543 384.84 m f*
 85 | 1 0.94902 0 rg
 86 | 442.328 361.039 m 307.781 417.68 l 510.195 492.281 l 510.195 380.215 l 
 87 | 442.328 361.039 l f*
 88 | 0.662745 0.231373 0.639216 rg
 89 | 526.539 367.879 m 668.094 407.879 l 708.578 169.082 l 526.539 223.898 l
 90 |  526.539 367.879 l f*
 91 | 0.956863 0.447059 0.0862745 rg
 92 | 449.996 211.293 m 449.996 346.25 l 510.188 363.258 l 510.188 224.41 l 449.996
 93 |  211.293 l f*
 94 | 0.929412 0.109804 0.141176 rg
 95 | 681.43 160.199 m 449.988 125.977 l 449.988 194.57 l 518.039 209.402 l 681.43
 96 |  160.199 l f*
 97 | 0.968627 0.933333 0.0156863 rg
 98 | 357.324 624.074 m 330.984 624.074 l 330.984 551.641 l 357.324 551.641 l
 99 |  381.297 551.641 393.285 563.051 393.285 585.875 c 393.285 611.34 381.297
100 |  624.074 357.324 624.074 c h
101 | 340.348 559.234 m 340.348 616.48 l 357.324 616.48 l 375.047 616.48 383.906
102 |  606.277 383.906 585.875 c 383.906 568.113 375.047 559.234 357.324 559.234
103 |  c h
104 | 403.414 608.023 m 403.414 597.109 410.609 591.652 425 591.652 c 429.621
105 |  591.652 434.246 591.988 438.875 592.66 c 438.875 587.648 l 438.875 581.539
106 |  434.098 578.484 424.539 578.484 c 419.062 578.484 413.457 579.328 407.723
107 |  581.02 c 407.723 573.41 l 413.457 571.73 419.062 570.891 424.539 570.891
108 |  c 440.34 570.891 448.242 576.395 448.242 587.402 c 448.242 624.074 l 442.98
109 |  624.074 l 439.738 618.801 l 434.438 622.316 428.66 624.074 422.406 624.074
110 |  c 409.746 624.074 403.414 618.723 403.414 608.023 c h
111 | 412.781 607.863 m 412.781 613.605 415.988 616.48 422.406 616.48 c 428.891
112 |  616.48 434.379 614.754 438.875 611.309 c 438.875 600.27 l 434.246 599.586
113 |  429.621 599.246 425 599.246 c 416.855 599.246 412.781 602.117 412.781 607.863
114 |  c h
115 | 463.441 621.898 m 463.441 551.641 l 472.809 551.641 l 472.809 573.41 l 
116 | 476.832 571.73 481.141 570.891 485.734 570.891 c 502.551 570.891 510.961
117 |  579.395 510.961 596.406 c 510.961 614.852 502.129 624.074 484.465 624.074
118 |  c 476.828 624.074 469.82 623.348 463.441 621.898 c h
119 | 472.809 581.727 m 472.809 614.809 l 475.883 615.789 479.613 616.277 484.004
120 |  616.277 c 495.828 616.277 501.742 609.539 501.742 596.062 c 501.742 584.68
121 |  496.387 578.988 485.676 578.988 c 480.812 578.988 476.527 579.902 472.809
122 |  581.727 c h
123 | 523.629 604.262 m 523.629 570.891 l 532.992 570.891 l 532.992 604.418 l
124 |  532.992 612.324 536.91 616.277 544.75 616.277 c 550.121 616.277 555.234
125 |  614.031 560.094 609.535 c 560.094 570.891 l 569.461 570.891 l 569.461 624.074
126 |  l 563.035 624.074 l 561.406 617.285 l 554.961 621.812 548.477 624.074 541.957
127 |  624.074 c 529.738 624.074 523.629 617.469 523.629 604.262 c h
128 | 594.027 624.074 m 584.664 624.074 l 584.664 570.891 l 590.988 570.891 l
129 |  592.66 577.676 l 598.098 573.152 604.074 570.891 610.598 570.891 c 623.875
130 |  570.891 630.512 577.488 630.512 590.688 c 630.512 624.074 l 621.129 624.074
131 |  l 621.129 590.543 l 621.129 582.637 617.195 578.688 609.328 578.688 c 603.961
132 |  578.688 598.859 580.934 594.027 585.43 c h
133 | 594.027 624.074 m f
134 | Q Q
135 | showpage
136 | %%Trailer
137 | end
138 | %%EOF
139 | 


--------------------------------------------------------------------------------
/assets/logo/print.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/print.pdf


--------------------------------------------------------------------------------
/assets/logo/print.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="no" ?>
 2 | <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
 3 | <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" width="1280" height="1024" viewBox="0 0 1280 1024" xml:space="preserve">
 4 | <desc>Created with Fabric.js 4.4.0</desc>
 5 | <defs>
 6 | </defs>
 7 | <g transform="matrix(1 0 0 1 640 512)" id="background-logo"  >
 8 | <rect style="stroke: none; stroke-width: 0; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 10; fill: rgb(0,0,0); fill-rule: nonzero; opacity: 1;"  paint-order="stroke"  x="-640" y="-512" rx="0" ry="0" width="1280" height="1024" />
 9 | </g>
10 | <g transform="matrix(1.928550004267182 0 0 1.9285500042671826 639.9999759855768 412.1716590384616)" id="logo-logo"  >
11 | <g style=""  paint-order="stroke"   >
12 | 		<g transform="matrix(0.11306471756293264 0 0 -0.11306471756293264 -95.04139219940981 -15.049959880784456)"  >
13 | <path style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 10; fill: rgb(153,212,32); fill-rule: evenodd; opacity: 1;"  paint-order="stroke"  transform=" translate(-3765.62, -4385.835)" d="M 3455.95 3656.1 L 4322.74 4021.0099999999998 L 4322.74 4872.7699999999995 L 3208.5 5115.57 L 3455.95 3656.1" stroke-linecap="round" />
14 | </g>
15 | 		<g transform="matrix(0.11306471756293264 0 0 -0.11306471756293264 -84.85919216330615 -103.96435262174839)"  >
16 | <path style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 10; fill: rgb(45,122,200); fill-rule: evenodd; opacity: 1;"  paint-order="stroke"  transform=" translate(-3855.685, -5172.3150000000005)" d="M 4322.74 5369.53 L 3388.63 5178.65 L 4322.74 4975.1 Z" stroke-linecap="round" />
17 | </g>
18 | 		<g transform="matrix(0.11306471756293264 0 0 -0.11306471756293264 75.96705059294652 88.7621787423418)"  >
19 | <path style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 10; fill: rgb(0,173,239); fill-rule: evenodd; opacity: 1;"  paint-order="stroke"  transform=" translate(-5278.25, -3467.58)" d="M 4890.95 3789.69 L 4890.95 3145.4700000000003 L 5665.55 3570.82 z" stroke-linecap="round" />
20 | </g>
21 | 		<g transform="matrix(0.11306471756293264 0 0 -0.11306471756293264 -49.095776024992574 81.25707229195297)"  >
22 | <path style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 10; fill: rgb(255,242,0); fill-rule: evenodd; opacity: 1;"  paint-order="stroke"  transform=" translate(-4172.025, -3533.965)" d="M 4375.89 3935.23 L 3553.17 3588.87 L 4790.88 3132.7 L 4790.88 3817.97 L 4375.89 3935.23" stroke-linecap="round" />
23 | </g>
24 | 		<g transform="matrix(0.11306471756293264 0 0 -0.11306471756293264 95.1035777940694 -14.276537442759206)"  >
25 | <path style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 10; fill: rgb(169,59,163); fill-rule: evenodd; opacity: 1;"  paint-order="stroke"  transform=" translate(-5447.52, -4378.995)" d="M 4890.95 3893.48 L 5756.53 3648.9 L 6004.09 5109.09 L 4890.95 4773.9 L 4890.95 3893.4799999999996" stroke-linecap="round" />
26 | </g>
27 | 		<g transform="matrix(0.11306471756293264 0 0 -0.11306471756293264 0.06277084154561408 -15.108633994127672)"  >
28 | <path style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 10; fill: rgb(244,114,22); fill-rule: evenodd; opacity: 1;"  paint-order="stroke"  transform=" translate(-4606.85, -4386.355)" d="M 4422.82 4850.96 L 4422.82 4025.75 L 4790.88 3921.75 L 4790.88 4770.76 L 4422.82 4850.96" stroke-linecap="round" />
29 | </g>
30 | 		<g transform="matrix(0.11306471756293264 0 0 -0.11306471756293264 59.25538802170638 -97.78713399965369)"  >
31 | <path style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 10; fill: rgb(237,28,36); fill-rule: evenodd; opacity: 1;"  paint-order="stroke"  transform=" translate(-5130.43, -5117.674999999999)" d="M 5838.04 5163.46 L 4422.82 5372.74 L 4422.82 4953.29 L 4838.95 4862.61 L 5838.04 5163.46" stroke-linecap="round" />
32 | </g>
33 | </g>
34 | </g>
35 | <g transform="matrix(1.9211538461538462 0 0 1.9211538461538462 640.9962040103269 783.8088799601665)" id="text-logo-path"  >
36 | <path style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 4; fill: rgb(247,238,4); fill-rule: nonzero; opacity: 1;"  paint-order="stroke"  transform=" translate(-109.92000000000002, 25.135)" d="M 24.26 0 L 5.98 0 L 5.98 -50.27 L 24.26 -50.27 Q 49.22 -50.27 49.22 -26.51 L 49.22 -26.51 Q 49.22 0 24.26 0 L 24.26 0 Z M 12.48 -45 L 12.48 -5.27 L 24.26 -5.27 Q 42.71 -5.27 42.71 -26.51 L 42.71 -26.51 Q 42.71 -45 24.26 -45 L 24.26 -45 L 12.48 -45 Z M 56.25 -11.14 L 56.25 -11.14 Q 56.25 -22.5 71.23 -22.5 L 71.23 -22.5 Q 76.04 -22.5 80.86 -21.8 L 80.86 -21.8 L 80.86 -25.28 Q 80.86 -31.64 70.91 -31.64 L 70.91 -31.64 Q 65.21 -31.64 59.24 -29.88 L 59.24 -29.88 L 59.24 -35.16 Q 65.21 -36.91 70.91 -36.91 L 70.91 -36.91 Q 87.36 -36.91 87.36 -25.45 L 87.36 -25.45 L 87.36 0 L 83.71 0 L 81.46 -3.66 Q 75.94 0 69.43 0 L 69.43 0 Q 56.25 0 56.25 -11.14 Z M 62.75 -11.25 L 62.75 -11.25 Q 62.75 -5.27 69.43 -5.27 L 69.43 -5.27 Q 76.18 -5.27 80.86 -8.86 L 80.86 -8.86 L 80.86 -16.52 Q 76.04 -17.23 71.23 -17.23 L 71.23 -17.23 Q 62.75 -17.23 62.75 -11.25 Z M 97.91 -1.51 L 97.91 -1.51 L 97.91 -50.27 L 104.41 -50.27 L 104.41 -35.16 Q 108.6 -36.91 113.38 -36.91 L 113.38 -36.91 Q 130.89 -36.91 130.89 -19.2 L 130.89 -19.2 Q 130.89 0 112.5 0 L 112.5 0 Q 104.55 0 97.91 -1.51 Z M 104.41 -29.39 L 104.41 -6.43 Q 107.61 -5.41 112.18 -5.41 L 112.18 -5.41 Q 124.49 -5.41 124.49 -19.44 L 124.49 -19.44 Q 124.49 -31.29 113.34 -31.29 L 113.34 -31.29 Q 108.28 -31.29 104.41 -29.39 L 104.41 -29.39 Z M 139.68 -13.75 L 139.68 -13.75 L 139.68 -36.91 L 146.18 -36.91 L 146.18 -13.64 Q 146.18 -5.41 154.34 -5.41 L 154.34 -5.41 Q 159.93 -5.41 164.99 -10.09 L 164.99 -10.09 L 164.99 -36.91 L 171.49 -36.91 L 171.49 0 L 167.03 0 L 165.9 -4.71 Q 159.19 0 152.4 0 L 152.4 0 Q 139.68 0 139.68 -13.75 Z M 188.54 0 L 182.04 0 L 182.04 -36.91 L 186.43 -36.91 L 187.59 -32.2 Q 193.25 -36.91 200.04 -36.91 L 200.04 -36.91 Q 213.86 -36.91 
213.86 -23.17 L 213.86 -23.17 L 213.86 0 L 207.35 0 L 207.35 -23.27 Q 207.35 -31.5 199.16 -31.5 L 199.16 -31.5 Q 193.57 -31.5 188.54 -26.82 L 188.54 -26.82 L 188.54 0 Z" stroke-linecap="round" />
37 | </g>
38 | </svg>


--------------------------------------------------------------------------------
/assets/logo/print_transparent.eps:
--------------------------------------------------------------------------------
  1 | %!PS-Adobe-3.0 EPSF-3.0
  2 | %%Creator: cairo 1.16.0 (https://cairographics.org)
  3 | %%CreationDate: Fri Sep 24 21:07:36 2021
  4 | %%Pages: 1
  5 | %%DocumentData: Clean7Bit
  6 | %%LanguageLevel: 2
  7 | %%BoundingBox: 251 143 709 643
  8 | %%EndComments
  9 | %%BeginProlog
 10 | 50 dict begin
 11 | /q { gsave } bind def
 12 | /Q { grestore } bind def
 13 | /cm { 6 array astore concat } bind def
 14 | /w { setlinewidth } bind def
 15 | /J { setlinecap } bind def
 16 | /j { setlinejoin } bind def
 17 | /M { setmiterlimit } bind def
 18 | /d { setdash } bind def
 19 | /m { moveto } bind def
 20 | /l { lineto } bind def
 21 | /c { curveto } bind def
 22 | /h { closepath } bind def
 23 | /re { exch dup neg 3 1 roll 5 3 roll moveto 0 rlineto
 24 |       0 exch rlineto 0 rlineto closepath } bind def
 25 | /S { stroke } bind def
 26 | /f { fill } bind def
 27 | /f* { eofill } bind def
 28 | /n { newpath } bind def
 29 | /W { clip } bind def
 30 | /W* { eoclip } bind def
 31 | /BT { } bind def
 32 | /ET { } bind def
 33 | /BDC { mark 3 1 roll /BDC pdfmark } bind def
 34 | /EMC { mark /EMC pdfmark } bind def
 35 | /cairo_store_point { /cairo_point_y exch def /cairo_point_x exch def } def
 36 | /Tj { show currentpoint cairo_store_point } bind def
 37 | /TJ {
 38 |   {
 39 |     dup
 40 |     type /stringtype eq
 41 |     { show } { -0.001 mul 0 cairo_font_matrix dtransform rmoveto } ifelse
 42 |   } forall
 43 |   currentpoint cairo_store_point
 44 | } bind def
 45 | /cairo_selectfont { cairo_font_matrix aload pop pop pop 0 0 6 array astore
 46 |     cairo_font exch selectfont cairo_point_x cairo_point_y moveto } bind def
 47 | /Tf { pop /cairo_font exch def /cairo_font_matrix where
 48 |       { pop cairo_selectfont } if } bind def
 49 | /Td { matrix translate cairo_font_matrix matrix concatmatrix dup
 50 |       /cairo_font_matrix exch def dup 4 get exch 5 get cairo_store_point
 51 |       /cairo_font where { pop cairo_selectfont } if } bind def
 52 | /Tm { 2 copy 8 2 roll 6 array astore /cairo_font_matrix exch def
 53 |       cairo_store_point /cairo_font where { pop cairo_selectfont } if } bind def
 54 | /g { setgray } bind def
 55 | /rg { setrgbcolor } bind def
 56 | /d1 { setcachedevice } bind def
 57 | /cairo_data_source {
 58 |   CairoDataIndex CairoData length lt
 59 |     { CairoData CairoDataIndex get /CairoDataIndex CairoDataIndex 1 add def }
 60 |     { () } ifelse
 61 | } def
 62 | /cairo_flush_ascii85_file { cairo_ascii85_file status { cairo_ascii85_file flushfile } if } def
 63 | /cairo_image { image cairo_flush_ascii85_file } def
 64 | /cairo_imagemask { imagemask cairo_flush_ascii85_file } def
 65 | %%EndProlog
 66 | %%BeginSetup
 67 | %%EndSetup
 68 | %%Page: 1 1
 69 | %%BeginPageSetup
 70 | %%PageBoundingBox: 251 143 709 643
 71 | %%EndPageSetup
 72 | q 251 143 458 500 rectclip
 73 | 1 0 0 -1 0 768 cm q
 74 | 0.6 0.831373 0.12549 rg
 75 | 291.887 406.699 m 433.641 347.023 l 433.641 207.727 l 251.422 168.02 l 
 76 | 291.887 406.699 l f*
 77 | 0.176471 0.478431 0.784314 rg
 78 | 433.641 126.5 m 280.879 157.719 l 433.641 191.004 l h
 79 | 433.641 126.5 m f*
 80 | 0 0.678431 0.937255 rg
 81 | 526.543 384.84 m 526.543 490.191 l 653.219 420.633 l h
 82 | 526.543 384.84 m f*
 83 | 1 0.94902 0 rg
 84 | 442.328 361.039 m 307.781 417.68 l 510.195 492.281 l 510.195 380.215 l 
 85 | 442.328 361.039 l f*
 86 | 0.662745 0.231373 0.639216 rg
 87 | 526.539 367.879 m 668.094 407.879 l 708.578 169.082 l 526.539 223.898 l
 88 |  526.539 367.879 l f*
 89 | 0.956863 0.447059 0.0862745 rg
 90 | 449.996 211.293 m 449.996 346.25 l 510.188 363.258 l 510.188 224.41 l 449.996
 91 |  211.293 l f*
 92 | 0.929412 0.109804 0.141176 rg
 93 | 681.43 160.199 m 449.988 125.977 l 449.988 194.57 l 518.039 209.402 l 681.43
 94 |  160.199 l f*
 95 | 0.968627 0.933333 0.0156863 rg
 96 | 357.324 624.074 m 330.984 624.074 l 330.984 551.641 l 357.324 551.641 l
 97 |  381.297 551.641 393.285 563.051 393.285 585.875 c 393.285 611.34 381.297
 98 |  624.074 357.324 624.074 c h
 99 | 340.348 559.234 m 340.348 616.48 l 357.324 616.48 l 375.047 616.48 383.906
100 |  606.277 383.906 585.875 c 383.906 568.113 375.047 559.234 357.324 559.234
101 |  c h
102 | 403.414 608.023 m 403.414 597.109 410.609 591.652 425 591.652 c 429.621
103 |  591.652 434.246 591.988 438.875 592.66 c 438.875 587.648 l 438.875 581.539
104 |  434.098 578.484 424.539 578.484 c 419.062 578.484 413.457 579.328 407.723
105 |  581.02 c 407.723 573.41 l 413.457 571.73 419.062 570.891 424.539 570.891
106 |  c 440.34 570.891 448.242 576.395 448.242 587.402 c 448.242 624.074 l 442.98
107 |  624.074 l 439.738 618.801 l 434.438 622.316 428.66 624.074 422.406 624.074
108 |  c 409.746 624.074 403.414 618.723 403.414 608.023 c h
109 | 412.781 607.863 m 412.781 613.605 415.988 616.48 422.406 616.48 c 428.891
110 |  616.48 434.379 614.754 438.875 611.309 c 438.875 600.27 l 434.246 599.586
111 |  429.621 599.246 425 599.246 c 416.855 599.246 412.781 602.117 412.781 607.863
112 |  c h
113 | 463.441 621.898 m 463.441 551.641 l 472.809 551.641 l 472.809 573.41 l 
114 | 476.832 571.73 481.141 570.891 485.734 570.891 c 502.551 570.891 510.961
115 |  579.395 510.961 596.406 c 510.961 614.852 502.129 624.074 484.465 624.074
116 |  c 476.828 624.074 469.82 623.348 463.441 621.898 c h
117 | 472.809 581.727 m 472.809 614.809 l 475.883 615.789 479.613 616.277 484.004
118 |  616.277 c 495.828 616.277 501.742 609.539 501.742 596.062 c 501.742 584.68
119 |  496.387 578.988 485.676 578.988 c 480.812 578.988 476.527 579.902 472.809
120 |  581.727 c h
121 | 523.629 604.262 m 523.629 570.891 l 532.992 570.891 l 532.992 604.418 l
122 |  532.992 612.324 536.91 616.277 544.75 616.277 c 550.121 616.277 555.234
123 |  614.031 560.094 609.535 c 560.094 570.891 l 569.461 570.891 l 569.461 624.074
124 |  l 563.035 624.074 l 561.406 617.285 l 554.961 621.812 548.477 624.074 541.957
125 |  624.074 c 529.738 624.074 523.629 617.469 523.629 604.262 c h
126 | 594.027 624.074 m 584.664 624.074 l 584.664 570.891 l 590.988 570.891 l
127 |  592.66 577.676 l 598.098 573.152 604.074 570.891 610.598 570.891 c 623.875
128 |  570.891 630.512 577.488 630.512 590.688 c 630.512 624.074 l 621.129 624.074
129 |  l 621.129 590.543 l 621.129 582.637 617.195 578.688 609.328 578.688 c 603.961
130 |  578.688 598.859 580.934 594.027 585.43 c h
131 | 594.027 624.074 m f
132 | Q Q
133 | showpage
134 | %%Trailer
135 | end
136 | %%EOF
137 | 


--------------------------------------------------------------------------------
/assets/logo/print_transparent.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/print_transparent.pdf


--------------------------------------------------------------------------------
/assets/logo/print_transparent.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="no" ?>
 2 | <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
 3 | <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" width="1280" height="1024" viewBox="0 0 1280 1024" xml:space="preserve">
 4 | <desc>Created with Fabric.js 4.4.0</desc>
 5 | <defs>
 6 | </defs>
 7 | <g transform="matrix(1 0 0 1 640 512)" id="background-logo"  >
 8 | <rect style="stroke: none; stroke-width: 0; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 10; fill: rgb(255,255,255); fill-opacity: 0; fill-rule: nonzero; opacity: 1;"  paint-order="stroke"  x="-640" y="-512" rx="0" ry="0" width="1280" height="1024" />
 9 | </g>
10 | <g transform="matrix(1.928550004267182 0 0 1.9285500042671826 639.9999759855768 412.1716590384616)" id="logo-logo"  >
11 | <g style=""  paint-order="stroke"   >
12 | 		<g transform="matrix(0.11306471756293264 0 0 -0.11306471756293264 -95.04139219940981 -15.049959880784456)"  >
13 | <path style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 10; fill: rgb(153,212,32); fill-rule: evenodd; opacity: 1;"  paint-order="stroke"  transform=" translate(-3765.62, -4385.835)" d="M 3455.95 3656.1 L 4322.74 4021.0099999999998 L 4322.74 4872.7699999999995 L 3208.5 5115.57 L 3455.95 3656.1" stroke-linecap="round" />
14 | </g>
15 | 		<g transform="matrix(0.11306471756293264 0 0 -0.11306471756293264 -84.85919216330615 -103.96435262174839)"  >
16 | <path style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 10; fill: rgb(45,122,200); fill-rule: evenodd; opacity: 1;"  paint-order="stroke"  transform=" translate(-3855.685, -5172.3150000000005)" d="M 4322.74 5369.53 L 3388.63 5178.65 L 4322.74 4975.1 Z" stroke-linecap="round" />
17 | </g>
18 | 		<g transform="matrix(0.11306471756293264 0 0 -0.11306471756293264 75.96705059294652 88.7621787423418)"  >
19 | <path style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 10; fill: rgb(0,173,239); fill-rule: evenodd; opacity: 1;"  paint-order="stroke"  transform=" translate(-5278.25, -3467.58)" d="M 4890.95 3789.69 L 4890.95 3145.4700000000003 L 5665.55 3570.82 z" stroke-linecap="round" />
20 | </g>
21 | 		<g transform="matrix(0.11306471756293264 0 0 -0.11306471756293264 -49.095776024992574 81.25707229195297)"  >
22 | <path style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 10; fill: rgb(255,242,0); fill-rule: evenodd; opacity: 1;"  paint-order="stroke"  transform=" translate(-4172.025, -3533.965)" d="M 4375.89 3935.23 L 3553.17 3588.87 L 4790.88 3132.7 L 4790.88 3817.97 L 4375.89 3935.23" stroke-linecap="round" />
23 | </g>
24 | 		<g transform="matrix(0.11306471756293264 0 0 -0.11306471756293264 95.1035777940694 -14.276537442759206)"  >
25 | <path style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 10; fill: rgb(169,59,163); fill-rule: evenodd; opacity: 1;"  paint-order="stroke"  transform=" translate(-5447.52, -4378.995)" d="M 4890.95 3893.48 L 5756.53 3648.9 L 6004.09 5109.09 L 4890.95 4773.9 L 4890.95 3893.4799999999996" stroke-linecap="round" />
26 | </g>
27 | 		<g transform="matrix(0.11306471756293264 0 0 -0.11306471756293264 0.06277084154561408 -15.108633994127672)"  >
28 | <path style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 10; fill: rgb(244,114,22); fill-rule: evenodd; opacity: 1;"  paint-order="stroke"  transform=" translate(-4606.85, -4386.355)" d="M 4422.82 4850.96 L 4422.82 4025.75 L 4790.88 3921.75 L 4790.88 4770.76 L 4422.82 4850.96" stroke-linecap="round" />
29 | </g>
30 | 		<g transform="matrix(0.11306471756293264 0 0 -0.11306471756293264 59.25538802170638 -97.78713399965369)"  >
31 | <path style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 10; fill: rgb(237,28,36); fill-rule: evenodd; opacity: 1;"  paint-order="stroke"  transform=" translate(-5130.43, -5117.674999999999)" d="M 5838.04 5163.46 L 4422.82 5372.74 L 4422.82 4953.29 L 4838.95 4862.61 L 5838.04 5163.46" stroke-linecap="round" />
32 | </g>
33 | </g>
34 | </g>
35 | <g transform="matrix(1.9211538461538462 0 0 1.9211538461538462 640.9962040103269 783.8088799601665)" id="text-logo-path"  >
36 | <path style="stroke: none; stroke-width: 1; stroke-dasharray: none; stroke-linecap: butt; stroke-dashoffset: 0; stroke-linejoin: miter; stroke-miterlimit: 4; fill: rgb(247,238,4); fill-rule: nonzero; opacity: 1;"  paint-order="stroke"  transform=" translate(-109.92000000000002, 25.135)" d="M 24.26 0 L 5.98 0 L 5.98 -50.27 L 24.26 -50.27 Q 49.22 -50.27 49.22 -26.51 L 49.22 -26.51 Q 49.22 0 24.26 0 L 24.26 0 Z M 12.48 -45 L 12.48 -5.27 L 24.26 -5.27 Q 42.71 -5.27 42.71 -26.51 L 42.71 -26.51 Q 42.71 -45 24.26 -45 L 24.26 -45 L 12.48 -45 Z M 56.25 -11.14 L 56.25 -11.14 Q 56.25 -22.5 71.23 -22.5 L 71.23 -22.5 Q 76.04 -22.5 80.86 -21.8 L 80.86 -21.8 L 80.86 -25.28 Q 80.86 -31.64 70.91 -31.64 L 70.91 -31.64 Q 65.21 -31.64 59.24 -29.88 L 59.24 -29.88 L 59.24 -35.16 Q 65.21 -36.91 70.91 -36.91 L 70.91 -36.91 Q 87.36 -36.91 87.36 -25.45 L 87.36 -25.45 L 87.36 0 L 83.71 0 L 81.46 -3.66 Q 75.94 0 69.43 0 L 69.43 0 Q 56.25 0 56.25 -11.14 Z M 62.75 -11.25 L 62.75 -11.25 Q 62.75 -5.27 69.43 -5.27 L 69.43 -5.27 Q 76.18 -5.27 80.86 -8.86 L 80.86 -8.86 L 80.86 -16.52 Q 76.04 -17.23 71.23 -17.23 L 71.23 -17.23 Q 62.75 -17.23 62.75 -11.25 Z M 97.91 -1.51 L 97.91 -1.51 L 97.91 -50.27 L 104.41 -50.27 L 104.41 -35.16 Q 108.6 -36.91 113.38 -36.91 L 113.38 -36.91 Q 130.89 -36.91 130.89 -19.2 L 130.89 -19.2 Q 130.89 0 112.5 0 L 112.5 0 Q 104.55 0 97.91 -1.51 Z M 104.41 -29.39 L 104.41 -6.43 Q 107.61 -5.41 112.18 -5.41 L 112.18 -5.41 Q 124.49 -5.41 124.49 -19.44 L 124.49 -19.44 Q 124.49 -31.29 113.34 -31.29 L 113.34 -31.29 Q 108.28 -31.29 104.41 -29.39 L 104.41 -29.39 Z M 139.68 -13.75 L 139.68 -13.75 L 139.68 -36.91 L 146.18 -36.91 L 146.18 -13.64 Q 146.18 -5.41 154.34 -5.41 L 154.34 -5.41 Q 159.93 -5.41 164.99 -10.09 L 164.99 -10.09 L 164.99 -36.91 L 171.49 -36.91 L 171.49 0 L 167.03 0 L 165.9 -4.71 Q 159.19 0 152.4 0 L 152.4 0 Q 139.68 0 139.68 -13.75 Z M 188.54 0 L 182.04 0 L 182.04 -36.91 L 186.43 -36.91 L 187.59 -32.2 Q 193.25 -36.91 200.04 -36.91 L 200.04 -36.91 Q 213.86 -36.91 
213.86 -23.17 L 213.86 -23.17 L 213.86 0 L 207.35 0 L 207.35 -23.27 Q 207.35 -31.5 199.16 -31.5 L 199.16 -31.5 Q 193.57 -31.5 188.54 -26.82 L 188.54 -26.82 L 188.54 0 Z" stroke-linecap="round" />
37 | </g>
38 | </svg>


--------------------------------------------------------------------------------
/assets/logo/textonly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/textonly.png


--------------------------------------------------------------------------------
/assets/logo/textonly_nobuffer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/facebookresearch/loop_nest/f1a231f1f7ddc9bd1654d17f7890901a9627c069/assets/logo/textonly_nobuffer.png


--------------------------------------------------------------------------------
/cmake/aarch64/dabun.cmake:
--------------------------------------------------------------------------------
 1 | option(DABUN_BUILD_APPS_FOR_NEON "Set to ON to build apps for NEON extension" OFF)
 2 | option(DABUN_BUILD_APPS_FOR_NEON_FP16 "Set to ON to build apps for NEON FP16 extension" OFF)
 3 | 
 4 | add_library(dabun
 5 |   ${DABUN_COMMON_SRC_CPP_FILES})
 6 | 
 7 | target_include_directories(${PROJECT_NAME}
 8 |   PUBLIC ${PROJECT_BINARY_DIR})
 9 | 
10 | target_include_directories(${PROJECT_NAME}
11 |   PUBLIC include)
12 | 
13 | target_include_directories(${PROJECT_NAME}
14 |   PUBLIC extern/xbyak_aarch64)
15 | 
16 | target_include_directories(${PROJECT_NAME}
17 |   PUBLIC ${Boost_INCLUDE_DIRS})
18 | 
19 | target_compile_options(dabun
20 |   PRIVATE "-DDABUN_COMPILING_LIBDABUN")
21 | 


--------------------------------------------------------------------------------
/cmake/x86_64/dabun.cmake:
--------------------------------------------------------------------------------
 1 | option(DABUN_BUILD_APPS_FOR_AVX2 "Set to ON to build apps for AVX2 extension" OFF)
 2 | option(DABUN_BUILD_APPS_FOR_AVX2_PLUS "Set to ON to build apps for AVX512 extension using AVX512 instructions but only AVX2 (YMM) registers" OFF)
 3 | option(DABUN_BUILD_APPS_FOR_AVX512 "Set to ON to build apps for AVX512 extension" OFF)
 4 | option(DABUN_BUILD_APPS_FOR_AMX "Set to ON to build apps for AMX extension" OFF)
 5 | 
 6 | add_library(dabun
 7 |   ${DABUN_COMMON_SRC_CPP_FILES})
 8 | 
 9 | target_include_directories(${PROJECT_NAME}
10 |   PUBLIC ${PROJECT_BINARY_DIR})
11 | 
12 | target_include_directories(${PROJECT_NAME}
13 |   PUBLIC include)
14 | 
15 | # target_include_directories(${PROJECT_NAME}
16 | #   PUBLIC extern/xbyak)
17 | 
18 | target_compile_options(dabun
19 |   PRIVATE "-DDABUN_COMPILING_LIBDABUN")
20 | 


--------------------------------------------------------------------------------
/dabun_config.hpp.in:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #define DABUN_VERSION_MAJOR @dabun_VERSION_MAJOR@
 9 | #define DABUN_VERSION_MINOR @dabun_VERSION_MINOR@
10 | #define DABUN_VERSION_PATCH @dabun_VERSION_PATCH@
11 | 


--------------------------------------------------------------------------------
/include/dabun/aligned_vector.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include <sysml/memory.hpp>
 9 | 
10 | #include <vector>
11 | 
namespace dabun
{

// Alias for a std::vector of T whose memory is obtained through
// sysml::aligned_allocator<T, 64>.
// NOTE(review): the 64 is presumably the alignment in bytes -- confirm
// against sysml/memory.hpp.
template <typename T>
using aligned_vector = std::vector<T, sysml::aligned_allocator<T, 64>>;

} // namespace dabun
19 | 


--------------------------------------------------------------------------------
/include/dabun/arithmetic_operation.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/isa.hpp"
 9 | 
10 | #if defined(DABUN_ARCH_AARCH64)
11 | #    include "dabun/arm/arithmetic_operation.hpp"
12 | #else
13 | #    include "dabun/x86/arithmetic_operation.hpp"
14 | #endif
15 | 
namespace dabun
{

// Re-export the backend's operation descriptors into namespace dabun.
// The backend namespace is whatever DABUN_ISA_NAMESPACE expands to.
// NOTE(review): that macro is not defined in this header; presumably it
// comes from dabun/isa.hpp -- confirm.

// Predefined operation pairs.
using DABUN_ISA_NAMESPACE ::fma;
using DABUN_ISA_NAMESPACE ::multiply_max;
using DABUN_ISA_NAMESPACE ::multiply_min;
using DABUN_ISA_NAMESPACE ::non_fused_ma;
using DABUN_ISA_NAMESPACE ::plus_max;

// Types the pairs above are built from.
using DABUN_ISA_NAMESPACE ::operation_pair;
using DABUN_ISA_NAMESPACE ::operation_pair_base;

// Individual operation types, grouped under dabun::op.
namespace op
{
using DABUN_ISA_NAMESPACE ::basic_multiplies;
using DABUN_ISA_NAMESPACE ::basic_plus;
using DABUN_ISA_NAMESPACE ::duplicate_base_plus;
using DABUN_ISA_NAMESPACE ::max;
using DABUN_ISA_NAMESPACE ::min;
} // namespace op

} // namespace dabun
38 | 


--------------------------------------------------------------------------------
/include/dabun/arm/arithmetic_operation.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include <memory>
 9 | 
namespace dabun::arm
{

// Stateless base type. In this header every predefined operation pair
// below is a distinct heap instance of exactly this type.
class operation_pair_base
{
};

// Stateless tag parameterised on a "plus" and a "multiplies" type.
template <class PlusType, class MultipliesType>
class operation_pair : public operation_pair_base
{
};

// Stateless operation tag types.
class basic_plus
{
};

class duplicate_base_plus
{
};

class max
{
};

class min
{
};

class basic_multiplies
{
};

// Predefined operation pairs. Each is its own operation_pair_base object,
// so the pointers compare unequal to one another.
inline const std::shared_ptr<operation_pair_base> fma =
    std::make_shared<operation_pair_base>();

// exclusively here to test non-fused operations as base case
inline const std::shared_ptr<operation_pair_base> non_fused_ma =
    std::make_shared<operation_pair_base>();

inline const std::shared_ptr<operation_pair_base> multiply_max =
    std::make_shared<operation_pair_base>();

inline const std::shared_ptr<operation_pair_base> multiply_min =
    std::make_shared<operation_pair_base>();

inline const std::shared_ptr<operation_pair_base> plus_max =
    std::make_shared<operation_pair_base>();

} // namespace dabun::arm
62 | 


--------------------------------------------------------------------------------
/include/dabun/arm/configuration.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | namespace dabun
 9 | {
10 | namespace arm
11 | {
12 | 
13 | class OptimizationConfiguration
14 | {
15 | };
16 | 
17 | inline OptimizationConfiguration all_optims;
18 | 
19 | inline OptimizationConfiguration no_optims;
20 | 
21 | } // namespace arm
22 | } // namespace dabun
23 | 


--------------------------------------------------------------------------------
/include/dabun/arm/elementwise_operation.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include <memory>
 9 | 
namespace dabun::arm
{

// Descriptor of an elementwise operation. The only state it keeps is
// whether the operation is a ReLU.
template <class ISA>
class elementwise_operation
{
public:
    // b: true when this descriptor stands for ReLU.
    explicit elementwise_operation(bool b)
        : is_relu_{b}
    {
    }

    // Returns the flag given at construction.
    bool is_relu() const { return is_relu_; }

private:
    bool is_relu_ = false;
};

// Empty placeholder types.
template <class ISA>
class relu_elementwise_operation
{
};

template <class ISA>
class single_tensor_elementwise_operation
{
};

// Predefined descriptors, one object per instantiation T.
// Only elementwise_relu reports is_relu() == true.
template <class T>
inline std::shared_ptr<elementwise_operation<T>> elementwise_relu =
    std::make_shared<elementwise_operation<T>>(true);

template <class T>
inline std::shared_ptr<elementwise_operation<T>> elementwise_bias =
    std::make_shared<elementwise_operation<T>>(false);

template <class T>
inline std::shared_ptr<elementwise_operation<T>> elementwise_multiply =
    std::make_shared<elementwise_operation<T>>(false);

} // namespace dabun::arm
53 | 


--------------------------------------------------------------------------------
/include/dabun/arm/multi_vreg.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include "dabun/numeric.hpp"
  9 | 
 10 | #include <cassert>
 11 | #include <type_traits>
 12 | 
 13 | namespace dabun
 14 | {
 15 | namespace arm
 16 | {
 17 | 
 18 | // The main usage of the multi_vreg class is to increase the amount of
 19 | // independent operations when accumulating to a single vector
 20 | // register.  This is accomplished by using multiple vector registers
 21 | // which are reduced to a single one at the end.  Each of the size_
 22 | // registers is independent of all the other ones.
 23 | 
 24 | template <class VReg, class SReg, class HReg>
 25 | class multi_vreg
 26 | {
 27 | private:
 28 |     int size_          = 0;
 29 |     int first_         = 0;
 30 |     int current_       = 0;
 31 |     int vlen_          = 4;
 32 |     int original_size_ = 0;
 33 | 
 34 | public:
 35 |     multi_vreg() {}
 36 | 
 37 |     multi_vreg(int s, int f)
 38 |         : size_(s)
 39 |         , first_(f)
 40 |         , current_(0)
 41 |         , original_size_(s)
 42 |     {
 43 |         assert(s > 0);
 44 |     }
 45 | 
 46 |     void reset()
 47 |     {
 48 |         size_    = original_size_;
 49 |         current_ = 0;
 50 |     }
 51 | 
 52 |     multi_vreg(multi_vreg const&) = delete;
 53 |     multi_vreg& operator=(multi_vreg const&) = delete;
 54 | 
 55 |     multi_vreg(multi_vreg&& o) { *this = std::move(o); }
 56 | 
 57 |     multi_vreg& operator=(multi_vreg&& o)
 58 |     {
 59 |         assert(o.size_ > 0);
 60 |         size_          = o.size_;
 61 |         first_         = o.first_;
 62 |         current_       = o.current_;
 63 |         original_size_ = o.original_size_;
 64 |         return *this;
 65 |     }
 66 | 
 67 |     int size() const { return size_; }
 68 | 
 69 |     VReg operator++(int)
 70 |     {
 71 |         int c    = current_;
 72 |         current_ = (current_ + 1) % size_;
 73 |         return VReg(first_ + c);
 74 |     }
 75 | 
 76 |     VReg operator[](int s) const
 77 |     {
 78 |         assert(s < size_);
 79 |         return VReg(first_ + s);
 80 |     }
 81 | 
 82 |     VReg operator++()
 83 |     {
 84 |         current_ = (current_ + 1) % size_;
 85 |         return VReg(first_ + current_);
 86 |     }
 87 | 
 88 |     VReg current() const { return VReg(first_ + current_); }
 89 | 
 90 |     VReg first() const { return VReg(first_); }
 91 | 
 92 |     template <class Float, class Code_Generator>
 93 |     void half(Code_Generator& code_generator)
 94 |     {
 95 |         static_assert(std::is_same_v<Float, float> ||
 96 |                       std::is_same_v<Float, fp16_t>);
 97 | 
 98 |         int h = (size_ + 1) / 2;
 99 |         for (int i = 0; i + h < size_; ++i)
100 |         {
101 |             if constexpr (std::is_same_v<Float, float>)
102 |             {
103 |                 code_generator.fadd(VReg(first_ + i).s4, VReg(first_ + i).s4,
104 |                                     VReg(first_ + i + h).s4);
105 |             }
106 |             else if constexpr (std::is_same_v<Float, fp16_t>)
107 |             {
108 |                 code_generator.fadd(VReg(first_ + i).h8, VReg(first_ + i).h8,
109 |                                     VReg(first_ + i + h).h8);
110 |             }
111 |         }
112 |         size_    = h;
113 |         current_ = 0;
114 |     }
115 | 
116 |     template <class Float, class Code_Generator>
117 |     void reduce(Code_Generator& code_generator)
118 |     {
119 |         static_assert(std::is_same_v<Float, float> ||
120 |                       std::is_same_v<Float, fp16_t>);
121 | 
122 |         while (size_ > 1)
123 |         {
124 |             half<Float>(code_generator);
125 |         }
126 |     }
127 | 
128 |     template <class Float, class Code_Generator>
129 |     void full_reduce(Code_Generator& code_generator, int mask = 4,
130 |                      int zero_vector = 0)
131 |     {
132 |         static_assert(std::is_same_v<Float, float> ||
133 |                       std::is_same_v<Float, fp16_t>);
134 | 
135 |         reduce<Float>(code_generator);
136 |         assert(size_ == 1);
137 | 
138 |         if constexpr (std::is_same_v<Float, float>)
139 |         {
140 |             if (mask == 3)
141 |             {
142 |                 // x4/w4 is zero reg by convention in the loop_nest.hpp
143 |                 code_generator.ins(VReg(first_).s4[3], code_generator.w4);
144 |             }
145 |             if (mask > 2)
146 |             {
147 |                 code_generator.faddp(VReg(first_).s4, VReg(first_).s4,
148 |                                      VReg(first_).s4);
149 |             }
150 |             if (mask > 1)
151 |             {
152 |                 code_generator.faddp(SReg(first_), VReg(first_).s2);
153 |             }
154 |         }
155 |         else if constexpr (std::is_same_v<Float, fp16_t>)
156 |         {
157 |             switch (mask)
158 |             {
159 |             case 3:
160 |                 code_generator.ins(VReg(first_).h8[3], VReg(zero_vector).h8[3]);
161 |                 break;
162 |             case 5:
163 |                 code_generator.ins(VReg(first_).h8[5], VReg(zero_vector).h8[5]);
164 |                 // fallthrough
165 |             case 6:
166 |                 code_generator.ins(VReg(first_).s4[3], code_generator.w4);
167 |                 break;
168 |             case 7:
169 |                 code_generator.ins(VReg(first_).h8[7], VReg(zero_vector).h8[7]);
170 |                 break;
171 |             default:
172 |                 break;
173 |             }
174 | 
175 |             if (mask > 4)
176 |             {
177 |                 code_generator.faddp(VReg(first_).h8, VReg(first_).h8,
178 |                                      VReg(first_).h8);
179 |             }
180 |             if (mask > 2)
181 |             {
182 |                 code_generator.faddp(VReg(first_).h4, VReg(first_).h4,
183 |                                      VReg(first_).h4);
184 |             }
185 |             if (mask > 1)
186 |             {
187 |                 // TO DO HERE.
188 |                 code_generator.faddp(HReg(first_), VReg(first_).h2);
189 |             }
190 |         }
191 |     }
192 | };
193 | 
194 | } // namespace arm
195 | } // namespace dabun
196 | 


--------------------------------------------------------------------------------
/include/dabun/arm/peak_gflops.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include "dabun/isa.hpp"
  9 | #ifdef DABUN_ARCH_AARCH64
 10 | 
 11 | #    include "dabun/code_generator/code_generator.hpp"
 12 | #    include "dabun/isa.hpp"
 13 | #    include "dabun/math.hpp"
 14 | #    include "dabun/numeric.hpp"
 15 | 
 16 | #    include <sysml/measure.hpp>
 17 | 
 18 | #    include <utility>
 19 | 
namespace dabun
{
namespace arm
{

// Micro-benchmark that runs a JIT-generated loop made only of vector
// fused multiply-adds.  Only the aarch64 specialization below is defined.
template <class ISA, class Arithmetic = float>
struct bench_gflops;

// Arithmetic == float emits .s4 (4 x 32-bit) fmla instructions; any other
// Arithmetic takes the .h8 (8 x 16-bit) path.
template <class Arithmetic>
struct bench_gflops<aarch64, Arithmetic>
{
private:
    static constexpr int vector_size = isa_traits<aarch64>::vector_size;

    // Code generator for a void(int) function: the argument is the number
    // of loop iterations to run.
    class test : public code_generator<void(int)>
    {
    private:
        Reg64 ZeroReg_ = x4;

    public:
        test()
        {
            // Zero x4 and use it to clear both 64-bit halves of the two
            // source registers v0 and v1.
            eor(ZeroReg_, ZeroReg_, ZeroReg_);
            ins(v0.d[0], ZeroReg_);
            ins(v0.d[1], ZeroReg_);
            ins(v1.d[0], ZeroReg_);
            ins(v1.d[1], ZeroReg_);

            auto loopLabel = make_label();
            L_aarch64(*loopLabel);

            // Loop body: 10 repetitions of 6 + 16 = 22 fmla instructions,
            // accumulating into v2..v7 and v16..v31.
            // NOTE(review): v8..v15 are skipped, presumably because they
            // are callee-saved in the AArch64 procedure call standard --
            // confirm.
            for (int i = 0; i < 10; ++i)
            {
                if constexpr (std::is_same_v<Arithmetic, float>)
                {
                    for (int r = 2; r < 8; ++r)
                    {
                        fmla(VReg(r).s4, v0.s4, v1.s4);
                    }
                    for (int r = 16; r < 32; ++r)
                    {
                        fmla(VReg(r).s4, v0.s4, v1.s4);
                    }
                }
                else
                {
                    for (int r = 2; r < 8; ++r)
                    {
                        fmla(VReg(r).h8, v0.h8, v1.h8);
                    }
                    for (int r = 16; r < 32; ++r)
                    {
                        fmla(VReg(r).h8, v0.h8, v1.h8);
                    }
                }
            }

            // Count the argument down to zero.  The counter is decremented
            // before it is tested, so an argument of 0 wraps around instead
            // of exiting immediately.
            // NOTE(review): the argument is a 32-bit int but the full
            // 64-bit x0 is used here; this relies on the caller leaving
            // the upper 32 bits zero -- confirm.
            sub(x0, x0, 1);
            cbnz(x0, *loopLabel);

            ret();
        }
    };

public:
    // Generates the loop, times it, and returns {gflops, secs}.
    //
    // secs is the result of sysml::measure_fastest over the generated
    // function (NOTE(review): presumably the fastest of 100 runs --
    // confirm against sysml/measure.hpp).
    //
    // Despite its name, `gflops` is not divided by secs: it is the number
    // of floating-point operations of one run, in units of 1e9.  The
    // formula is 2 (per multiply-add lane) * iterations * 10 repetitions *
    // 22 fmla instructions * lanes per instruction.
    // NOTE(review): the lane count vector_size * 4 / sizeof(Arithmetic)
    // assumes vector_size is expressed in 32-bit elements -- confirm.
    static std::pair<double, double> do_bench(int iterations = 10000000)
    {
        auto fn = test().get_shared();

        double secs = sysml::measure_fastest([&]() { fn(iterations); }, 100);

        double gflops = 2.0 * iterations * 10 * (16 + 6) *
                        (vector_size * 4 / sizeof(Arithmetic)) / 1000000000;

        return {gflops, secs};
    }
};

#    ifndef DABUN_HEADER_ONLY

// When the library is not header-only, these two instantiations are
// declared extern here, so they must be provided by another translation
// unit.
extern template struct bench_gflops<aarch64, fp32_t>;
extern template struct bench_gflops<aarch64, fp16_t>;

#    endif

} // namespace arm
} // namespace dabun
107 | 
108 | #endif
109 | 


--------------------------------------------------------------------------------
/include/dabun/arm/xbyak.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include "dabun/isa.hpp"
  9 | #ifdef DABUN_ARCH_AARCH64
 10 | 
 11 | #    include "xbyak_aarch64/xbyak_aarch64.h"
 12 | 
 13 | #    include "dabun/core.hpp"
 14 | 
// Element type of the code buffer on this backend.
// NOTE(review): presumably 32 bits wide because A64 instructions are
// fixed-size 32-bit words -- confirm where the alias is consumed.
using xbyak_buffer_type = std::uint32_t;

// Makes the xbyak_aarch64 API reachable through namespace Xbyak and gives
// the AArch64 classes the shorter alias names below.
// NOTE(review): presumably so that code written against the x86 Xbyak
// names also compiles on this backend -- confirm.
namespace Xbyak
{
using namespace Xbyak_aarch64;
using CodeArray     = CodeArrayAArch64;
using Allocator     = AllocatorAArch64;
using CodeGenerator = CodeGeneratorAArch64;
using Reg64         = XReg;
using Label         = LabelAArch64;
} // namespace Xbyak
 26 | 
 27 | namespace dabun
 28 | {
 29 | 
 30 | template <unsigned ElementSize, unsigned NumElements = 16 / ElementSize>
 31 | struct vreg_view
 32 | {
 33 | private:
 34 |     static_assert(ElementSize == 1 || ElementSize == 2 || ElementSize == 4 ||
 35 |                   ElementSize == 8);
 36 | 
 37 | public:
 38 |     decltype(auto) operator()(Xbyak::VReg const& r)
 39 |     {
 40 |         if constexpr (ElementSize == 1)
 41 |         {
 42 |             if constexpr (NumElements == 4)
 43 |             {
 44 |                 return r.b4;
 45 |             }
 46 |             else if constexpr (NumElements == 8)
 47 |             {
 48 |                 return r.b8;
 49 |             }
 50 |             else if constexpr (NumElements == 16)
 51 |             {
 52 |                 return r.b16;
 53 |             }
 54 |             else
 55 |             {
 56 |                 strong_assert(false);
 57 |                 return nullptr;
 58 |             }
 59 |         }
 60 |         else if constexpr (ElementSize == 2)
 61 |         {
 62 |             if constexpr (NumElements == 2)
 63 |             {
 64 |                 return r.h2;
 65 |             }
 66 |             else if constexpr (NumElements == 4)
 67 |             {
 68 |                 return r.h4;
 69 |             }
 70 |             else if constexpr (NumElements == 8)
 71 |             {
 72 |                 return r.h8;
 73 |             }
 74 |             else
 75 |             {
 76 |                 strong_assert(false);
 77 |                 return nullptr;
 78 |             }
 79 |         }
 80 |         else if constexpr (ElementSize == 4)
 81 |         {
 82 |             // if constexpr (NumElements == 1)
 83 |             // {
 84 |             //     return r.s1;
 85 |             // }
 86 |             //else
 87 |             if constexpr (NumElements == 2)
 88 |             {
 89 |                 return r.s2;
 90 |             }
 91 |             else if constexpr (NumElements == 4)
 92 |             {
 93 |                 return r.s4;
 94 |             }
 95 |             else
 96 |             {
 97 |                 strong_assert(false);
 98 |                 return nullptr;
 99 |             }
100 |         }
101 |         else if constexpr (ElementSize == 8)
102 |         {
103 |             if constexpr (NumElements == 1)
104 |             {
105 |                 return r.d1;
106 |             }
107 |             else if constexpr (NumElements == 2)
108 |             {
109 |                 return r.d2;
110 |             }
111 |             else
112 |             {
113 |                 strong_assert(false);
114 |                 return nullptr;
115 |             }
116 |         }
117 |         else
118 |         {
119 |             strong_assert(false);
120 |             return nullptr;
121 |         }
122 |     }
123 | };
124 | 
125 | } // namespace dabun
126 | 
127 | #endif
128 | 


--------------------------------------------------------------------------------
/include/dabun/check.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/numeric.hpp"
 9 | #include "sysml/math.hpp"
10 | 
11 | #include <cmath>
12 | #include <iostream>
13 | 
14 | namespace dabun
15 | {
16 | 
17 | template <class Float>
18 | void apply_relu(Float* Begin, Float* End)
19 | {
20 |     for (; Begin != End; ++Begin)
21 |     {
22 |         if constexpr (std::is_same_v<Float, fp16_t>)
23 |         {
24 |             *Begin = static_cast<fp16_t>(
25 |                 std::max(static_cast<float>(0), static_cast<float>(*Begin)));
26 |         }
27 |         else
28 |         {
29 |             *Begin = std::max(static_cast<Float>(0), *Begin);
30 |         }
31 |     }
32 | }
33 | 
34 | template <class Float>
35 | auto max_abs_difference(Float const* LBegin, Float const* LEnd,
36 |                         Float const* RBegin)
37 | {
38 |     decltype(sysml::absolute_difference(*LBegin, *RBegin)) res = 0;
39 | 
40 |     for (; LBegin != LEnd; ++LBegin, ++RBegin)
41 |     {
42 |         res = std::max(res, sysml::absolute_difference(*LBegin, *RBegin));
43 |     }
44 |     return res;
45 | }
46 | 
47 | template <class Float>
48 | Float max_abs_difference_verbose(Float const* LBegin, Float const* LEnd,
49 |                                  Float const* RBegin)
50 | {
51 |     int   off = 0;
52 |     Float res = 0;
53 |     for (; LBegin != LEnd; ++LBegin, ++RBegin)
54 |     {
55 |         if constexpr (std::is_same_v<Float, fp16_t>)
56 |         {
57 |             std::cout << off++ << " : " << *LBegin << " "
58 |                       << static_cast<float>(*RBegin) << " "
59 |                       << std::abs(static_cast<float>(*LBegin - *RBegin))
60 |                       << "\n";
61 |             res = static_cast<fp16_t>(
62 |                 std::max(static_cast<float>(res),
63 |                          std::abs(static_cast<float>(*LBegin) -
64 |                                   static_cast<float>(*RBegin))));
65 |         }
66 |         else
67 |         {
68 |             std::cout << off++ << " : " << *LBegin << " " << *RBegin << " "
69 |                       << std::abs(*LBegin - *RBegin) << "\n";
70 |             res = std::max(res, std::abs(*LBegin - *RBegin));
71 |         }
72 |     }
73 |     return res;
74 | }
75 | 
// Returns the largest absolute difference between corresponding elements
// of [LBegin, LEnd) and the range starting at RBegin (0 for an empty
// range).  Each pair whose difference exceeds delta is also reported on
// std::cout as "<index> : <left> <right> <abs difference>".
template <class Float>
Float max_abs_difference_verbose(Float const* LBegin, Float const* LEnd,
                                 Float const* RBegin, float delta)
{
    Float worst = 0;
    for (int index = 0; LBegin != LEnd; ++LBegin, ++RBegin, ++index)
    {
        auto const gap = std::abs(*LBegin - *RBegin);
        if (gap > delta)
        {
            std::cout << index << " : " << *LBegin << " " << *RBegin << " "
                      << gap << "\n";
        }
        worst = std::max(worst, gap);
    }
    return worst;
}
94 | 
95 | } // namespace dabun
96 | 


--------------------------------------------------------------------------------
/include/dabun/code_generator.hpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
2 | //
3 | // This source code is licensed under the MIT license found in the
4 | // LICENSE file in the root directory of this source tree.
5 | 
6 | #pragma once
7 | 
8 | #include "dabun/code_generator/code_generator.hpp"
9 | 


--------------------------------------------------------------------------------
/include/dabun/code_generator/aot_fn.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include <sysml/code_generator/code_generated_fn.hpp>
 9 | #include <sysml/code_generator/protect.hpp>
10 | 
// Re-exports selected names from ::sysml::code_generator into namespace
// dabun, so that client code can spell them as dabun::<name>.
namespace dabun
{

// using ::sysml::code_generator::code_generated_fn_ref;
using ::sysml::code_generator::observed_dynamic_fn;
using ::sysml::code_generator::shared_dynamic_fn;
using ::sysml::code_generator::unique_dynamic_fn;
using ::sysml::code_generator::weak_dynamic_fn;

using ::sysml::code_generator::dynamic_fn_cast;

} // namespace dabun
23 | 


--------------------------------------------------------------------------------
/include/dabun/code_generator/code_generator.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include <sysml/code_generator/code_generated_fn.hpp>
 9 | #include <sysml/code_generator/code_generator.hpp>
10 | #include <sysml/code_generator/memory_resource.hpp>
11 | 
12 | #include "dabun/code_generator/xbyak.hpp"
13 | 
// Re-exports the code generator class templates and helpers declared in
// ::sysml::code_generator under namespace dabun.
namespace dabun
{
using ::sysml::code_generator::allocator_adapter_base;
using ::sysml::code_generator::basic_code_generator;
using ::sysml::code_generator::code_generator;
using ::sysml::code_generator::with_signature;
} // namespace dabun
21 | 


--------------------------------------------------------------------------------
/include/dabun/code_generator/memory_resource.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include <sysml/code_generator/memory_resource.hpp>
 9 | 
// Re-exports the memory resource types declared in ::sysml::code_generator
// under namespace dabun.
namespace dabun
{

using ::sysml::code_generator::inplace_memory_resource;
using ::sysml::code_generator::malloc_memory_resource;
using ::sysml::code_generator::memory_resource;
using ::sysml::code_generator::mmap_memory_resource;

} // namespace dabun
19 | 


--------------------------------------------------------------------------------
/include/dabun/code_generator/xbyak.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/isa.hpp"
 9 | 
10 | #if defined(DABUN_ARCH_AARCH64)
11 | 
12 | #    include "dabun/arm/xbyak.hpp"
13 | 
14 | #else
15 | 
16 | #    include "dabun/x86/xbyak.hpp"
17 | 
18 | #endif
19 | 


--------------------------------------------------------------------------------
/include/dabun/common.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include "dabun/code_generator/xbyak.hpp"
  9 | #include "dabun/core.hpp"
 10 | 
 11 | #include <cassert>
 12 | #include <cstdint>
 13 | #include <cstring>
 14 | #include <map>
 15 | #include <string>
 16 | #include <type_traits>
 17 | #include <utility>
 18 | 
 19 | namespace dabun
 20 | {
 21 | 
// Integer bit-flag constants: skip_postop is bit 1, alpha_1 is bit 0 and
// alpha_0 is the value with neither bit set.
// NOTE(review): presumably these are OR-ed into the alpha / post-op argument
// of generated kernels -- confirm against the call sites.
static inline constexpr int skip_postop = 0b10;
static inline constexpr int alpha_1     = 0b01;
static inline constexpr int alpha_0     = 0b00;
 25 | 
 26 | enum access_kind
 27 | {
 28 |     SCALAR,
 29 |     VECTOR_PACKED,
 30 |     VECTOR_STRIDED
 31 | };
 32 | 
 33 | inline std::string to_string(access_kind akind)
 34 | {
 35 |     switch (akind)
 36 |     {
 37 |     case SCALAR:
 38 |         return "scalar";
 39 |     case VECTOR_PACKED:
 40 |         return "vector_packed";
 41 |     case VECTOR_STRIDED:
 42 |         return "vector_strided";
 43 |     }
 44 |     return "unknown";
 45 | }
 46 | 
// Describes one loop by the name of its variable plus two integers.
// NOTE(review): presumably `end` is the loop bound and `delta` the step of
// the loop variable -- confirm against the code generators that consume it.
struct loop_descriptor
{
    std::string var;
    int         end;
    int         delta;
};
 53 | 
// Per-tensor information: its name, how it is accessed, a 64-bit register
// and a label associated with it, plus two integer properties.
// Only `reg` has a default member initializer; `stridesLabel`,
// `innermost_stride` and `access_len` hold indeterminate values unless the
// creator of the object sets them.
// NOTE(review): presumably `reg` holds the tensor's base pointer and
// `stridesLabel` points at a table of strides -- confirm.
struct tensor_traits
{
    std::string   name;
    access_kind   access;
    Xbyak::Reg64  reg = Xbyak::Reg64(0);
    Xbyak::Label* stridesLabel;
    int           innermost_stride;
    int           access_len;
};
 63 | 
 64 | template <int vector_size>
 65 | struct memory_argument_type
 66 | {
 67 |     int                        offset;
 68 |     tensor_traits const*       traits;
 69 |     int                        mask;
 70 |     std::map<std::string, int> coordinates;
 71 | 
 72 |     memory_argument_type(int offset, tensor_traits const* traits, int mask,
 73 |                          std::map<std::string, int> coordinates = {})
 74 |         : offset(offset)
 75 |         , traits(traits)
 76 |         , mask(mask)
 77 |         , coordinates(coordinates){};
 78 |     // We are not comparing the mask
 79 | 
 80 |     bool operator<(memory_argument_type const& o) const
 81 |     {
 82 |         return std::tie(offset, mask, traits->name) <
 83 |                std::tie(o.offset, mask, o.traits->name);
 84 |     }
 85 | 
 86 |     bool operator==(memory_argument_type const& o) const
 87 |     {
 88 |         return std::tie(offset, mask, traits->name) ==
 89 |                std::tie(o.offset, mask, o.traits->name);
 90 |     }
 91 | 
 92 |     std::string readable() const
 93 |     {
 94 |         assert(traits);
 95 |         return traits->name + "[" + std::to_string(offset) + ":" +
 96 |                std::to_string(traits->access == SCALAR ? 1 : vector_size) +
 97 |                "]{" + std::to_string(traits->innermost_stride) + "}{" +
 98 |                std::to_string(mask) + "}";
 99 |     }
100 | };
101 | 
// Associates a name with a 64-bit register and a map from string keys to
// integer strides.
// NOTE(review): presumably the register holds a pointer into the named
// tensor and `strides` is keyed by loop variable -- confirm.
struct in_register_tensor_pointer_type
{
    std::string                name;
    Xbyak::Reg64               reg;
    std::map<std::string, int> strides;
};
108 | 
// Returns the sum over all entries of `strides` of
// coordinate(key) * stride, where a key that is absent from `coordinates`
// counts as coordinate 0. Entries of `coordinates` that have no stride are
// ignored.
//
// Both maps are only read: they are taken by const reference and looked up
// with find(), which avoids copying them and avoids inserting missing keys.
inline int get_cursor_offset(std::map<std::string, int> const& coordinates,
                             std::map<std::string, int> const& strides)
{
    int off = 0;
    for (auto const& [var, stride] : strides)
    {
        auto const it = coordinates.find(var);
        if (it != coordinates.end())
        {
            off += it->second * stride;
        }
    }
    return off;
}
119 | 
120 | } // namespace dabun
121 | 


--------------------------------------------------------------------------------
/include/dabun/configuration.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/isa.hpp"
 9 | 
10 | #if defined(DABUN_ARCH_AARCH64)
11 | #    include "dabun/arm/configuration.hpp"
12 | #else
13 | #    include "dabun/x86/configuration.hpp"
14 | #endif
15 | 
// Makes the optimization configuration names of the active ISA namespace
// (DABUN_ISA_NAMESPACE) available directly in namespace dabun.
namespace dabun
{

using DABUN_ISA_NAMESPACE ::all_optims;
using DABUN_ISA_NAMESPACE ::no_optims;
using DABUN_ISA_NAMESPACE ::OptimizationConfiguration;

} // namespace dabun
24 | 


--------------------------------------------------------------------------------
/include/dabun/core.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include <cstring>
  9 | #include <stdexcept>
 10 | #include <type_traits>
 11 | #include <utility>
 12 | 
// Two-level stringification: DABUN_STRINGIFY macro-expands its argument
// before DABUN_STRINGIFY_0 turns the result into a string literal.
#define DABUN_STRINGIFY_0(s) #s
#define DABUN_STRINGIFY(s) DABUN_STRINGIFY_0(s)
 15 | 
// Build-mode selection. Neither output macro may be defined by the user.
//
//  * DABUN_HEADER_ONLY: DABUN_REQUIES_TEMPLATE_DEFINITION is defined and
//    DABUN_MAYBE_EXTN_TPL_INSTNTON is left undefined; combining it with
//    DABUN_COMPILING_LIBDABUN is an error.
//  * DABUN_COMPILING_LIBDABUN: DABUN_REQUIES_TEMPLATE_DEFINITION is defined
//    and DABUN_MAYBE_EXTN_TPL_INSTNTON expands to `template`.
//  * otherwise: DABUN_MAYBE_EXTN_TPL_INSTNTON expands to `extern template`.
#if defined(DABUN_REQUIES_TEMPLATE_DEFINITION) ||                              \
    defined(DABUN_MAYBE_EXTN_TPL_INSTNTON)
#    error The macros above cannot be defined at this stage!
#endif

#if defined(DABUN_HEADER_ONLY)
#    if defined(DABUN_COMPILING_LIBDABUN)
#        error Unsupported combination of defines
#    else
#        define DABUN_REQUIES_TEMPLATE_DEFINITION
#    endif
#else
#    if defined(DABUN_COMPILING_LIBDABUN)
#        define DABUN_REQUIES_TEMPLATE_DEFINITION
#        define DABUN_MAYBE_EXTN_TPL_INSTNTON template
#    else
#        define DABUN_MAYBE_EXTN_TPL_INSTNTON extern template
#    endif
#endif
 35 | 
 36 | #define strong_assert(condition)                                               \
 37 |     if (!(condition))                                                          \
 38 |     {                                                                          \
 39 |         throw std::runtime_error(                                              \
 40 |             DABUN_STRINGIFY(condition) " failed file: " __FILE__               \
 41 |                                        " line: " DABUN_STRINGIFY((__LINE__))); \
 42 |     }                                                                          \
 43 |     static_cast<void>(0)
 44 | 
 45 | namespace dabun
 46 | {
 47 | 
// Compile-time flag mirroring NDEBUG: true when NDEBUG is not defined,
// false when it is.
#ifndef NDEBUG
inline constexpr bool compiled_in_debug_mode = true;
#else
inline constexpr bool compiled_in_debug_mode = false;
#endif
 53 | 
 54 | // FROM: https://en.cppreference.com/w/cpp/utility/variant/visit
 55 | 
// Aggregates several callables into a single type that inherits from all of
// them and exposes every one of their operator() overloads.
template <class... Ts>
struct overloaded : Ts...
{
    using Ts::operator()...;
};

// explicit deduction guide (not needed as of C++20)
template <class... Ts>
overloaded(Ts...) -> overloaded<Ts...>;
 65 | 
// Identity metafunction: identity_type<T>::type is T.
template <class T>
struct identity_type
{
    using type = T;
};

// Shorthand for identity_type<T>::type.
template <class T>
using identity_type_t = typename identity_type<T>::type;
 74 | 
 75 | // Sourced from https://en.cppreference.com/w/cpp/numeric/bit_cast
 76 | // to enable bit_cast from C++20
 77 | template <class To, class From>
 78 | typename std::enable_if_t<sizeof(To) == sizeof(From) &&
 79 |                               std::is_trivially_copyable_v<From> &&
 80 |                               std::is_trivially_copyable_v<To>,
 81 |                           To>
 82 | // constexpr support needs compiler magic
 83 | bit_cast(const From& src) noexcept
 84 | {
 85 |     static_assert(std::is_trivially_constructible_v<To>,
 86 |                   "This implementation additionally requires destination type "
 87 |                   "to be trivially constructible");
 88 | 
 89 |     To dst;
 90 |     std::memcpy(&dst, &src, sizeof(To));
 91 |     return dst;
 92 | }
 93 | 
// DABUN_OP_RESULT_TYPE(OP, T1, T2): the type of the expression `a OP b`
// where a and b are values of the decayed types T1 and T2.
#define DABUN_OP_RESULT_TYPE(OP, T1, T2)                                       \
    decltype(std::declval<std::decay_t<T1>>()                                  \
                 OP std::declval<std::decay_t<T2>>())

// Function specifier that combines `inline` with the always_inline
// attribute, spelled in the GNU __attribute__ syntax.
#define DABUN_ALWAYS_INLINE __attribute__((always_inline)) inline
 99 | 
100 | } // namespace dabun
101 | 


--------------------------------------------------------------------------------
/include/dabun/elementwise_operation.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/isa.hpp"
 9 | 
10 | #if defined(DABUN_ARCH_AARCH64)
11 | #    include "dabun/arm/elementwise_operation.hpp"
12 | #else
13 | #    include "dabun/x86/elementwise_operation.hpp"
14 | #endif
15 | 
// Makes the elementwise operation names of the active ISA namespace
// (DABUN_ISA_NAMESPACE) available directly in namespace dabun.
namespace dabun
{

using DABUN_ISA_NAMESPACE ::elementwise_bias;
using DABUN_ISA_NAMESPACE ::elementwise_multiply;
using DABUN_ISA_NAMESPACE ::elementwise_relu;

using DABUN_ISA_NAMESPACE ::elementwise_operation;
using DABUN_ISA_NAMESPACE ::relu_elementwise_operation;
using DABUN_ISA_NAMESPACE ::single_tensor_elementwise_operation;

} // namespace dabun
28 | 


--------------------------------------------------------------------------------
/include/dabun/hask/apple.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #if defined(__APPLE__)
 9 | 
10 | #    include <cstddef>
11 | #    include <cstdlib>
12 | #    include <sys/sysctl.h>
13 | 
14 | #    ifndef MAP_JIT
15 | #        define MAP_JIT 0x800
16 | #    endif
17 | 
18 | namespace dabun::hask
19 | {
20 | 
// NOTE(review): presumably the major number that get_macOS_version() reports
// on macOS Mojave -- confirm.
inline constexpr int mojave_version = 18;
22 | 
23 | inline int get_macOS_version()
24 | {
25 |     static const int version = []()
26 |     {
27 |         char        buffer[64];
28 |         std::size_t size = sizeof(buffer);
29 | 
30 |         if (auto err =
31 |                 sysctlbyname("kern.osrelease", buffer, &size, nullptr, 0);
32 |             err != 0)
33 |         {
34 |             return 0;
35 |         }
36 | 
37 |         char* endp = nullptr;
38 | 
39 |         int ver_major = std::strtol(buffer, &endp, 10);
40 | 
41 |         if (*endp != '.')
42 |         {
43 |             return 0;
44 |         }
45 |         return ver_major;
46 |     }();
47 | 
48 |     return version;
49 | }
50 | 
51 | } // namespace dabun::hask
52 | 
53 | #endif
54 | 


--------------------------------------------------------------------------------
/include/dabun/isa.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include <sysml/predef.hpp>
  9 | 
// Default for DABUN_ISA when the build does not provide one: avx512 when
// __AVX512F__ is defined, aarch64 when __aarch64__ is defined, and avx2 in
// every other case (no error is raised for unknown targets here).
#ifndef DABUN_ISA

#    if defined(__AVX512F__)
#        define DABUN_ISA avx512
#    elif defined(__aarch64__)
#        define DABUN_ISA aarch64
#    else // default to avx2
// #elif defined(__AVX2__)
#        define DABUN_ISA avx2
// #error "ISA not supported"
#    endif

#endif
 23 | 
 24 | #ifndef DABUN_VEX
 25 | #    ifdef DABUN_ARITHMETIC
 26 | #        undef DABUN_ARITHMETIC
 27 | #    endif
 28 | #
 29 | #    if defined(SYSML_ARCH_AMD64)
 30 | #        define DABUN_VEX extension::avx2
 31 | #        define DABUN_ARITHMETIC dabun::float
 32 | #    elif defined(SYSML_ARCH_ARM64)
 33 | #        define DABUN_VEX extension::aarch64
 34 | #        define DABUN_ARITHMETIC dabun::float
 35 | #    else
 36 | #        error "Not supported"
 37 | #    endif
 38 | #endif
 39 | 
namespace dabun
{

// Empty tag types naming the instruction sets; they are used as the
// template argument of isa_traits below.
struct avx2
{
};
struct avx512
{
};
struct avx2_plus
{
};
struct aarch64
{
};

// Per-ISA compile-time constants. Only the specializations below are
// defined; the primary template is intentionally left incomplete.
// NOTE(review): vector_size presumably counts 32-bit float lanes per vector
// register and vector_register_mask the number of vector registers reserved
// for masking -- confirm against the code generators.
template <class>
struct isa_traits;

template <>
struct isa_traits<avx2>
{
    static constexpr int total_vector_registers = 16;
    static constexpr int vector_register_mask   = 1;
    static constexpr int vector_size            = 8;
};

template <>
struct isa_traits<avx512>
{
    static constexpr int total_vector_registers = 32;
    static constexpr int vector_register_mask   = 0;
    static constexpr int vector_size            = 16;
};

template <>
struct isa_traits<avx2_plus>
{
    static constexpr int total_vector_registers = 16;
    static constexpr int vector_register_mask   = 0;
    static constexpr int vector_size            = 8;
};

// NOTE(review): fp16_vector_size (2) is smaller than vector_size (4); if
// both are lane counts of the same register one would expect the
// half-precision count to be larger -- confirm the intended meaning.
template <>
struct isa_traits<aarch64>
{
    static constexpr int total_vector_registers = 32;
    static constexpr int vector_register_mask   = 0;
    static constexpr int vector_size            = 4;
    static constexpr int fp16_vector_size       = 2;
};

} // namespace dabun
 93 | 
 94 | // Copyright 2004-present Facebook. All Rights Reserved.
 95 | 
 96 | // #pragma once
 97 | 
 98 | // For deprecated APIs
 99 | // #include "dabun/isa.hpp"
100 | 
101 | #include <cstddef>
102 | #include <cstdint>
103 | 
// Target architecture detection from compiler predefined macros: defines
// exactly one of DABUN_ARCH_X86_64 / DABUN_ARCH_AARCH64 and stops the build
// for any other target.
#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) ||           \
    defined(__x86_64)
#    define DABUN_ARCH_X86_64
#elif defined(__aarch64__)
#    define DABUN_ARCH_AARCH64
#else
#    error "Unknown target architecture"
#endif
112 | 
113 | namespace dabun
114 | {
115 | 
// CPU architecture families, with explicit integer values.
enum class architecture_kind : int
{
    unknown = 0,
    x86_64  = 1,
    aarch64 = 2
};
122 | 
// Vector instruction set extensions. All enumerators are declared on every
// target (the architecture guards are commented out); the x86-64 ones use
// the 1000 range and the AArch64 ones the 2000 range.
enum class extension : int
{
    unknown = 0,

    // #if defined(DABUN_ARCH_X86_64)
    avx        = 1001,
    avx2       = 1002,
    avx512_ymm = 1003,
    avx512     = 1004,
    // #elif defined(DABUN_ARCH_AARCH64)
    neon      = 2001,
    neon_fp16 = 2002
    // #endif
};
137 | 
// Compile-time properties of an extension. Only declared here; the
// specializations for the current target architecture follow below.
template <extension E>
struct extension_traits;

// TODO(zi) deprecate the following two
// Maps an extension to one of the tag types avx2 / avx2_plus / avx512 /
// aarch64 through its nested `type`.
template <extension E>
struct extension_to_deprecated_ISA;

template <extension E>
using extension_to_deprecated_ISA_t =
    typename extension_to_deprecated_ISA<E>::type;
148 | 
149 | #if defined(DABUN_ARCH_X86_64)
150 | 
151 | template <>
152 | struct extension_traits<extension::avx2>
153 | {
154 |     static constexpr architecture_kind architecture = architecture_kind::x86_64;
155 |     static constexpr int               vector_register_bits = 256;
156 |     static constexpr int               vector_size          = 32;
157 |     static constexpr bool              has_mask_register    = false;
158 |     static constexpr int               num_vector_registers = 32;
159 | };
160 | 
// avx512_ymm: 256-bit vector registers (vector_size is the register width
// in bytes), with mask registers and 32 vector registers.
template <>
struct extension_traits<extension::avx512_ymm>
{
    static constexpr architecture_kind architecture = architecture_kind::x86_64;
    static constexpr int               vector_register_bits = 256;
    static constexpr int               vector_size          = 32;
    static constexpr bool              has_mask_register    = true;
    static constexpr int               num_vector_registers = 32;
};
170 | 
171 | template <>
172 | struct extension_traits<extension::avx512>
173 | {
174 |     static constexpr architecture_kind architecture = architecture_kind::x86_64;
175 |     static constexpr int               vector_register_bits = 512;
176 |     static constexpr int               vector_size          = 64;
177 |     static constexpr bool              has_mask_register    = false;
178 |     static constexpr int               num_vector_registers = 32;
179 | };
180 | 
// TODO(zi) deprecate
// x86-64 mapping from extension enumerators to the tag types:
// avx2 -> avx2, avx512_ymm -> avx2_plus, avx512 -> avx512.
template <>
struct extension_to_deprecated_ISA<extension::avx2>
{
    using type = avx2;
};
template <>
struct extension_to_deprecated_ISA<extension::avx512_ymm>
{
    using type = avx2_plus;
};
template <>
struct extension_to_deprecated_ISA<extension::avx512>
{
    using type = avx512;
};
197 | 
198 | #elif defined(DABUN_ARCH_AARCH64)
199 | 
// neon: 128-bit vector registers (vector_size is the register width in
// bytes), no mask registers, 32 vector registers.
template <>
struct extension_traits<extension::neon>
{
    static constexpr architecture_kind architecture =
        architecture_kind::aarch64;
    static constexpr int  vector_register_bits = 128;
    static constexpr int  vector_size          = 16;
    static constexpr bool has_mask_register    = false;
    static constexpr int  num_vector_registers = 32;
};
210 | 
// neon_fp16: same register properties as the neon specialization.
template <>
struct extension_traits<extension::neon_fp16>
{
    static constexpr architecture_kind architecture =
        architecture_kind::aarch64;
    static constexpr int  vector_register_bits = 128;
    static constexpr int  vector_size          = 16;
    static constexpr bool has_mask_register    = false;
    static constexpr int  num_vector_registers = 32;
};
221 | 
// TODO(zi) deprecate
// AArch64 mapping: both neon and neon_fp16 map to the aarch64 tag type.
template <>
struct extension_to_deprecated_ISA<extension::neon>
{
    using type = aarch64;
};
template <>
struct extension_to_deprecated_ISA<extension::neon_fp16>
{
    using type = aarch64;
};
233 | 
234 | #endif
235 | 
236 | } // namespace dabun
237 | 
// Name of the nested namespace that holds the architecture specific
// implementations: `arm` on AArch64, `x86` otherwise.
#if defined(DABUN_ARCH_AARCH64)
#    define DABUN_ISA_NAMESPACE arm
#else
#    define DABUN_ISA_NAMESPACE x86
#endif
243 | 


--------------------------------------------------------------------------------
/include/dabun/loop_nest.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/isa.hpp"
 9 | #include "dabun/numeric.hpp"
10 | 
11 | #if defined(DABUN_ARCH_AARCH64)
12 | #    include "dabun/arm/loop_nest.hpp"
13 | #else
14 | #    include "dabun/x86/loop_nest.hpp"
15 | #endif
16 | 
namespace dabun
{

// The loop nest code generator of the active ISA namespace.
using DABUN_ISA_NAMESPACE ::loop_nest_code_generator;

#if defined(DABUN_ARCH_AARCH64)

using DABUN_ISA_NAMESPACE ::loop_nest_fp16_code_generator;

// AArch64: selects loop_nest_code_generator<ISA, false> when Arithmetic is
// float and loop_nest_code_generator<ISA, true> for every other type, where
// ISA is the deprecated tag type that VEX maps to.
template <extension VEX, class Arithmetic>
using loop_nest_compiler = std::conditional_t<
    std::is_same_v<Arithmetic, float>,
    loop_nest_code_generator<extension_to_deprecated_ISA_t<VEX>, false>,
    loop_nest_code_generator<extension_to_deprecated_ISA_t<VEX>, true>>;

#else

// Other targets: loop_nest_code_generator<ISA> when Arithmetic is float,
// and void for every other arithmetic type.
template <extension VEX, class Arithmetic>
using loop_nest_compiler = std::conditional_t<
    std::is_same_v<Arithmetic, float>,
    loop_nest_code_generator<extension_to_deprecated_ISA_t<VEX>>, void>;

#endif

} // namespace dabun
42 | 


--------------------------------------------------------------------------------
/include/dabun/loop_tree/all_nodes.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/loop_tree/compiled_loop_nest_node.hpp"
 9 | #include "dabun/loop_tree/compiled_transpose_node.hpp"
10 | #include "dabun/loop_tree/compute_node.hpp"
11 | #include "dabun/loop_tree/for_loop_node.hpp"
12 | #include "dabun/loop_tree/transpose_node.hpp"
13 | 


--------------------------------------------------------------------------------
/include/dabun/loop_tree/compiled_transpose_node.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include "dabun/loop_tree/node.hpp"
  9 | #include "dabun/utility/tmp_file_name.hpp"
 10 | 
 11 | namespace dabun
 12 | {
 13 | namespace loop_tree
 14 | {
 15 | 
 16 | template <extension VEX, class Arithmetic>
 17 | class compiled_transpose_node : public node<VEX, Arithmetic>
 18 | {
 19 | private:
 20 |     using ISA = typename extension_to_deprecated_ISA<VEX>::type;
 21 | 
 22 |     using super_type = node<VEX, Arithmetic>;
 23 | 
 24 |     std::string                              input;
 25 |     std::string                              output;
 26 |     std::vector<std::pair<std::string, int>> order;
 27 |     strides_map_type                         strides;
 28 |     std::optional<int>                       unroll_limit;
 29 | 
 30 | public:
 31 |     std::string dump(formulas_map_type const& /* formulas */,
 32 |                      std::map<std::string, int> const& sizes,
 33 |                      std::string const&                indent) const override
 34 |     {
 35 |         std::ostringstream ss;
 36 |         ss << indent << "AOT_tranpose" << std::endl;
 37 |         ss << utility::dump_order(order, indent);
 38 |         ss << utility::dump_sizes(sizes, indent);
 39 |         ss << indent << "Input: " << input << std::endl;
 40 |         ss << indent << "Output: " << output << std::endl;
 41 |         ss << utility::dump_strides(strides, indent);
 42 |         return ss.str();
 43 |     }
 44 | 
 45 | public:
 46 |     compiled_transpose_node(
 47 |         std::string const& input, std::string const& output,
 48 |         std::vector<std::pair<std::string, int>> const& order,
 49 |         strides_map_type const&                         strides,
 50 |         std::optional<int> unroll_limit = std::nullopt)
 51 |         : super_type(node_kind::compiled_transpose)
 52 |         , input(input)
 53 |         , output(output)
 54 |         , order(order)
 55 |         , strides(strides)
 56 |         , unroll_limit(unroll_limit)
 57 |     {
 58 |     }
 59 | 
 60 |     compiled_transpose_node(
 61 |         const compiled_transpose_node<VEX, Arithmetic>& other) = default;
 62 | 
 63 |     // creates initial transpose nest
 64 |     compiled_transpose_node(
 65 |         std::shared_ptr<for_loop_node<VEX, Arithmetic>> const&  for_node,
 66 |         std::shared_ptr<transpose_node<VEX, Arithmetic>> const& transpose_node)
 67 |         : compiled_transpose_node(
 68 |               transpose_node->get_input(), transpose_node->get_output(),
 69 |               {{for_node->get_var(), for_node->get_delta()}},
 70 |               transpose_node->get_tensor_strides(),
 71 |               transpose_node->get_unroll_limit())
 72 |     {
 73 |     }
 74 | 
 75 |     // extends the tranpose nest
 76 |     compiled_transpose_node(
 77 |         std::shared_ptr<for_loop_node<VEX, Arithmetic>> const& for_node,
 78 |         std::shared_ptr<compiled_transpose_node<VEX, Arithmetic>> const&
 79 |             transpose_compiler)
 80 |         : compiled_transpose_node(*transpose_compiler)
 81 |     {
 82 |         order.insert(order.begin(),
 83 |                      {for_node->get_var(), for_node->get_delta()});
 84 |     }
 85 | 
 86 |     std::pair<loop_tree_fn_type<Arithmetic>, report_vector>
 87 |     get_fn(std::map<std::string, int> const& tensors_idx,
 88 |            std::map<std::string, int> const& sizes,
 89 |            std::map<std::string, int> const&,
 90 |            formulas_map_type const& /* formulas */,
 91 |            bool spit_asm) const override
 92 |     {
 93 |         auto aot_fn =
 94 |             transposer_compiler<
 95 |                 (VEX == extension::avx512 ? extension::avx512_ymm : VEX),
 96 |                 Arithmetic>(order, sizes, strides.at(output), strides.at(input),
 97 |                             64 /* unroll_limit */)
 98 |                 .get_shared();
 99 | 
100 |         // aot_fn.save_to_file("transpose.asm");
101 | 
102 |         std::string asm_dump = "n/a";
103 | 
104 |         if (spit_asm)
105 |         {
106 |             asm_dump = ::dabun::utility::get_temporary_file_name(".asm");
107 |             aot_fn.save_to_file(asm_dump);
108 |         }
109 | 
110 |         std::string extra_string =
111 |             std::string("output_idx: ") +
112 |             std::to_string(tensors_idx.at(output)) +
113 |             ", input_idx: " + std::to_string(tensors_idx.at(input));
114 | 
115 |         compiled_transpose_node_info info{0, 0, extra_string};
116 | 
117 |         return {[aot_fn, output_idx = tensors_idx.at(output),
118 |                  input_idx = tensors_idx.at(input)](
119 |                     std::vector<Arithmetic*>& tensors, std::vector<int>&)
120 |                 { aot_fn(tensors[output_idx], tensors[input_idx]); },
121 |                 {std::make_shared<node_report>(info)}};
122 |     }
123 | 
124 |     std::set<std::string> get_tensors_used() const override
125 |     {
126 |         return {input, output};
127 |     }
128 | 
129 |     std::set<std::string> get_output_tensors() const override
130 |     {
131 |         return {output};
132 |     }
133 | 
134 |     strides_map_type const& get_tensor_strides() const override
135 |     {
136 |         return strides;
137 |     }
138 | };
139 | 
140 | } // namespace loop_tree
141 | } // namespace dabun
142 | 


--------------------------------------------------------------------------------
/include/dabun/loop_tree/nested_for_loops_node.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include "dabun/loop_tree/node.hpp"
  9 | 
 10 | namespace dabun
 11 | {
 12 | namespace loop_tree
 13 | {
 14 | 
 15 | template <extension VEX, class Arithmetic>
 16 | class for_loop_node : public node<VEX, Arithmetic>
 17 | {
 18 | private:
 19 |     using super_type = node<VEX, Arithmetic>;
 20 | 
 21 |     std::string var;
 22 |     int         delta;
 23 | 
 24 |     std::set<std::string> in_scope_tensor_names;
 25 |     std::set<std::string> in_scope_output_tensor_names;
 26 |     strides_map_type      in_scope_tensor_strides;
 27 | 
 28 | public:
 29 |     std::string dump(formulas_map_type const& /* formulas */,
 30 |                      std::map<std::string, int> const& /* sizes */,
 31 |                      std::string const& indent) const override
 32 |     {
 33 |         std::ostringstream ss;
 34 |         ss << indent << "Interpreted For Node" << std::endl;
 35 |         ss << indent << "Var=" << var << ", delta=" << delta << std::endl;
 36 |         return ss.str();
 37 |     }
 38 | 
 39 | private:
 40 |     void set_in_scope_tensor_info()
 41 |     {
 42 |         for (auto c : this->get_children())
 43 |         {
 44 |             auto node_tensor_names = c->get_tensors_used();
 45 |             in_scope_tensor_names.insert(node_tensor_names.begin(),
 46 |                                          node_tensor_names.end());
 47 | 
 48 |             auto node_output_tensor_names = c->get_output_tensors();
 49 |             in_scope_output_tensor_names.insert(
 50 |                 node_output_tensor_names.begin(),
 51 |                 node_output_tensor_names.end());
 52 | 
 53 |             auto node_tensor_strides = c->get_tensor_strides();
 54 | 
 55 |             in_scope_tensor_strides.insert(node_tensor_strides.begin(),
 56 |                                            node_tensor_strides.end());
 57 |         }
 58 |     }
 59 | 
 60 |     std::function<void(std::vector<Arithmetic*>&, int)>
 61 |     get_tensor_advancer(std::map<std::string, int> const& tensors_idx,
 62 |                         std::set<std::string> const&      tensor_names) const
 63 |     {
 64 |         std::vector<std::pair<int, std::int64_t>> to_advance;
 65 | 
 66 |         for (auto const& name : tensor_names)
 67 |         {
 68 |             if (in_scope_tensor_strides.at(name).count(var))
 69 |             {
 70 |                 std::int64_t offset =
 71 |                     in_scope_tensor_strides.at(name).at(var) * delta;
 72 |                 if (offset != 0)
 73 |                 {
 74 |                     int idx = tensors_idx.at(name);
 75 |                     to_advance.push_back({idx, offset});
 76 |                 }
 77 |             }
 78 |         }
 79 | 
 80 |         return [to_advance](std::vector<Arithmetic*>& tensors, int delta = 1)
 81 |         {
 82 |             for (auto const& p : to_advance)
 83 |             {
 84 |                 tensors[p.first] += p.second * delta;
 85 |             }
 86 |         };
 87 |     }
 88 | 
 89 |     std::function<void(std::vector<int>&, int)>
 90 |     get_alpha_offsets_adjuster(std::map<std::string, int> const& tensors_idx,
 91 |                                std::set<std::string> const& output_tensor_names,
 92 |                                formulas_map_type const&     formulas) const
 93 |     {
 94 | 
 95 |         std::vector<int> to_adjust;
 96 |         for (auto const& name : output_tensor_names)
 97 |         {
 98 |             if (formulas.count(name) && formulas.at(name).count(var) == 0)
 99 |             {
100 |                 // reduction variable, so adjust the tensor's alpha
101 |                 to_adjust.push_back(tensors_idx.at(name));
102 |             }
103 |         }
104 | 
105 |         return [to_adjust](std::vector<int>& alpha_offsets, int adjustment)
106 |         {
107 |             for (auto const& idx : to_adjust)
108 |             {
109 |                 alpha_offsets[idx] += adjustment;
110 |             }
111 |         };
112 |     }
113 | 
114 | public:
115 |     std::string const& get_var() const { return var; }
116 |     int                get_delta() const { return delta; }
117 | 
118 |     for_loop_node(std::string var, int delta,
119 |                   std::vector<node_ptr<VEX, Arithmetic>> const& children)
120 |         : super_type(node_kind::for_loop)
121 |         , var(var)
122 |         , delta(delta)
123 |     {
124 |         this->set_children(children);
125 |         set_in_scope_tensor_info();
126 |     }
127 | 
128 |     std::set<std::string> get_tensors_used() const override
129 |     {
130 |         return in_scope_tensor_names;
131 |     }
132 | 
133 |     std::set<std::string> get_output_tensors() const override
134 |     {
135 |         return in_scope_output_tensor_names;
136 |     }
137 | 
138 |     strides_map_type const& get_tensor_strides() const override
139 |     {
140 |         return in_scope_tensor_strides;
141 |     }
142 | 
143 |     std::pair<loop_tree_fn_type<Arithmetic>, report_vector>
144 |     get_fn(std::map<std::string, int> const& tensors_idx,
145 |            std::map<std::string, int> const& sizes,
146 |            std::map<std::string, int> const& outer_iteration_depths,
147 |            formulas_map_type const& formulas, bool debug_mode) const override
148 |     {
149 |         auto var      = this->var;
150 |         auto delta    = this->delta;
151 |         auto children = this->get_children();
152 |         auto limit    = sizes.at(var);
153 | 
154 |         auto const [full, rest] = full_rest(limit, delta);
155 | 
156 |         report_vector report = {
157 |             std::make_shared<node_report>(for_loop_node_info{
158 |                 1, 1, var, full + (rest ? 1 : 0), delta, limit})};
159 | 
160 |         std::vector<loop_tree_fn_type<Arithmetic>> full_fns, tail_fns;
161 | 
162 |         auto iteration_depths = outer_iteration_depths;
163 | 
164 |         int last_iteration = full + (rest ? 1 : 0) - 1;
165 |         strong_assert(last_iteration >= 0);
166 | 
167 |         iteration_depths[var] += last_iteration;
168 | 
169 |         for (auto c : children)
170 |         {
171 |             auto inner_sizes = sizes;
172 | 
173 |             if (full)
174 |             {
175 |                 inner_sizes[var] = delta;
176 |                 auto [fn, rep] =
177 |                     c->get_fn(tensors_idx, inner_sizes, iteration_depths,
178 |                               formulas, debug_mode);
179 |                 full_fns.push_back(fn);
180 |                 report[0]->children.insert(report[0]->children.end(),
181 |                                            rep.begin(), rep.end());
182 |             }
183 |             if (rest)
184 |             {
185 |                 inner_sizes[var] = rest;
186 |                 auto [fn, rep] =
187 |                     c->get_fn(tensors_idx, inner_sizes, iteration_depths,
188 |                               formulas, debug_mode);
189 |                 tail_fns.push_back(fn);
190 |                 report.insert(report.end(), rep.begin(), rep.end());
191 |             }
192 |         }
193 | 
194 |         auto tensor_advancer =
195 |             get_tensor_advancer(tensors_idx, get_tensors_used());
196 |         auto alpha_offsets_adjuster = get_alpha_offsets_adjuster(
197 |             tensors_idx, get_output_tensors(), formulas);
198 | 
199 |         LN_LOG(DEBUG) << "loop_tree: Executing interpreted for(" << var << ","
200 |                       << delta << ")\n";
201 | 
202 |         return {[full, full_fns, tensor_advancer, alpha_offsets_adjuster,
203 |                  tail_fns](std::vector<Arithmetic*>& tensors,
204 |                            std::vector<int>&         alpha_offsets)
205 |                 {
206 |                     for (int i = 0; i < full; ++i)
207 |                     {
208 |                         for (auto const& fn : full_fns)
209 |                         {
210 |                             fn(tensors, alpha_offsets);
211 |                         }
212 |                         tensor_advancer(tensors, 1);
213 |                         alpha_offsets_adjuster(alpha_offsets, 1);
214 |                     }
215 | 
216 |                     for (auto const& fn : tail_fns)
217 |                     {
218 |                         fn(tensors, alpha_offsets);
219 |                     }
220 | 
221 |                     tensor_advancer(tensors, -full);
222 |                     alpha_offsets_adjuster(alpha_offsets, -full);
223 |                 },
224 |                 report};
225 |     }
226 | };
227 | 
228 | template <extension VEX, class Arithmetic>
229 | node_ptr<VEX, Arithmetic>
230 | make_for_loop_node(std::string var, int delta,
231 |                    std::vector<node_ptr<VEX, Arithmetic>> const& children)
232 | {
233 |     return node_ptr<VEX, Arithmetic>(
234 |         new for_loop_node<VEX, Arithmetic>(var, delta, children));
235 | }
236 | 
237 | } // namespace loop_tree
238 | } // namespace dabun
239 | 


--------------------------------------------------------------------------------
/include/dabun/loop_tree/node.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include "dabun/code_generator/aot_fn.hpp"
  9 | #include "dabun/configuration.hpp"
 10 | #include "dabun/isa.hpp"
 11 | #include "dabun/loop_nest.hpp"
 12 | #include "dabun/loop_tree/report.hpp"
 13 | #include "dabun/loop_tree/types.hpp"
 14 | #include "dabun/loop_tree/utility.hpp"
 15 | #include "dabun/utility/log.hpp"
 16 | 
 17 | #include <map>
 18 | #include <string>
 19 | #include <vector>
 20 | 
 21 | namespace dabun
 22 | {
 23 | namespace loop_tree
 24 | {
 25 | 
 26 | enum class node_kind
 27 | {
 28 |     for_loop,
 29 |     compute,
 30 |     transpose,
 31 |     compiled_loop_nest,
 32 |     compiled_transpose
 33 | };
 34 | 
 35 | inline std::map<node_kind, std::string> const node_kind_to_str_map = {
 36 |     {node_kind::for_loop, "for_loop_node"},
 37 |     {node_kind::compute, "compute_node"},
 38 |     {node_kind::transpose, "transpose_node"},
 39 |     {node_kind::compiled_loop_nest, "compiled_loop_nest_node"},
 40 |     {node_kind::compiled_transpose, "compiled_transpose_node"}};
 41 | 
 42 | inline std::string const& node_kind_to_str(node_kind kind)
 43 | {
 44 |     return node_kind_to_str_map.at(kind);
 45 | }
 46 | 
 47 | template <extension VEX, class Arithmetic>
 48 | class node
 49 | {
 50 | 
 51 | private:
 52 |     node_kind                              kind_;
 53 |     std::vector<node_ptr<VEX, Arithmetic>> children_;
 54 | 
 55 | public:
 56 |     virtual ~node(){};
 57 | 
 58 |     explicit node(node_kind kind)
 59 |         : kind_(kind)
 60 |     {
 61 |     }
 62 | 
 63 |     std::vector<node_ptr<VEX, Arithmetic>> const& get_children() const
 64 |     {
 65 |         return children_;
 66 |     }
 67 | 
 68 |     void set_children(std::vector<node_ptr<VEX, Arithmetic>> const& children)
 69 |     {
 70 |         children_ = children;
 71 |     }
 72 | 
 73 |     void set_children(std::vector<node_ptr<VEX, Arithmetic>>&& children)
 74 |     {
 75 |         children_ = std::move(children);
 76 |     }
 77 | 
 78 |     node_kind kind() const { return kind_; }
 79 | 
 80 |     // tensor positions, dimension sizes, and tensor formulas
 81 |     virtual std::pair<loop_tree_fn_type<Arithmetic>, report_vector>
 82 |     get_fn(std::map<std::string, int> const&, std::map<std::string, int> const&,
 83 |            std::map<std::string, int> const&, formulas_map_type const&,
 84 |            bool) const = 0;
 85 | 
 86 |     virtual std::set<std::string> get_tensors_used() const = 0;
 87 | 
 88 |     virtual std::set<std::string> get_output_tensors() const = 0;
 89 | 
 90 |     virtual strides_map_type const& get_tensor_strides() const = 0;
 91 | 
 92 |     virtual std::string dump(formulas_map_type const&          formulas,
 93 |                              std::map<std::string, int> const& sizes,
 94 |                              std::string const& indent) const = 0;
 95 | };
 96 | 
 97 | template <extension VEX, class Arithmetic>
 98 | class compute_node;
 99 | 
100 | template <extension VEX, class Arithmetic>
101 | class compiled_loop_nest_node;
102 | 
103 | template <extension VEX, class Arithmetic>
104 | class transpose_node;
105 | 
106 | template <extension VEX, class Arithmetic>
107 | class compiled_transpose_node;
108 | 
109 | template <extension VEX, class Arithmetic>
110 | class for_loop_node;
111 | 
112 | template <extension VEX, class Arithmetic, std::size_t Cardinality>
113 | class nested_for_loops_node;
114 | 
115 | } // namespace loop_tree
116 | } // namespace dabun
117 | 


--------------------------------------------------------------------------------
/include/dabun/loop_tree/report.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include <memory>
  9 | #include <sstream>
 10 | #include <string>
 11 | #include <variant>
 12 | #include <vector>
 13 | 
 14 | #include "dabun/common.hpp"
 15 | 
 16 | namespace dabun
 17 | {
 18 | namespace loop_tree
 19 | {
 20 | 
 21 | struct program_node_info
 22 | {
 23 |     std::int64_t const flops           = 0;
 24 |     std::int64_t const effective_flops = 0;
 25 |     std::string const  extra           = "";
 26 | 
 27 |     std::string to_string() const
 28 |     {
 29 |         return std::string("program - FLOPs: ") + std::to_string(flops) +
 30 |                ", effective FLOPs: " + std::to_string(effective_flops) +
 31 |                ", extra\"" + extra + "\"";
 32 |     }
 33 | };
 34 | 
 35 | struct compute_node_info
 36 | {
 37 |     std::int64_t const flops           = 0;
 38 |     std::int64_t const effective_flops = 0;
 39 |     std::string const  extra           = "";
 40 | 
 41 |     std::string to_string() const
 42 |     {
 43 |         return std::string("compute node: 2 FLOPs") + ", extra\"" + extra +
 44 |                "\"";
 45 |     }
 46 | };
 47 | 
 48 | struct compiled_loop_nest_node_info
 49 | {
 50 |     std::int64_t const flops           = 0;
 51 |     std::int64_t const effective_flops = 0;
 52 |     std::string const  asm_dump        = "";
 53 | 
 54 |     access_kind A_access_kind;
 55 |     access_kind B_access_kind;
 56 |     access_kind C_access_kind;
 57 | 
 58 |     std::pair<int, int> register_blocking_info;
 59 | 
 60 |     std::string const extra = "";
 61 | 
 62 |     std::string to_string() const
 63 |     {
 64 |         return std::string("compiled loop_nest - FLOPs: ") +
 65 |                std::to_string(flops) +
 66 |                ", effective FLOPs: " + std::to_string(effective_flops) +
 67 |                ", A access: " + dabun::to_string(A_access_kind) +
 68 |                ", B access: " + dabun::to_string(B_access_kind) +
 69 |                ", C access: " + dabun::to_string(C_access_kind) +
 70 |                ", register blocking: " +
 71 |                std::to_string(register_blocking_info.first) + ":" +
 72 |                std::to_string(register_blocking_info.second) + ", extra\"" +
 73 |                extra + "\"";
 74 |     }
 75 | };
 76 | 
 77 | struct transpose_node_info
 78 | {
 79 |     std::int64_t const flops           = 0;
 80 |     std::int64_t const effective_flops = 0;
 81 |     std::string const  extra           = "";
 82 | 
 83 |     std::string to_string() const
 84 |     {
 85 |         return std::string("transpose_node") + ", extra\"" + extra + "\"";
 86 |     }
 87 | };
 88 | 
 89 | struct compiled_transpose_node_info
 90 | {
 91 |     std::int64_t const flops           = 0;
 92 |     std::int64_t const effective_flops = 0;
 93 |     std::string const  asm_dump        = "";
 94 |     std::string const  extra           = "";
 95 | 
 96 |     std::string to_string() const
 97 |     {
 98 |         return std::string("compiled_transpose_node") + ", extra\"" + extra +
 99 |                "\"";
100 |     }
101 | };
102 | 
// Report entry describing an interpreted for loop.
struct for_loop_node_info
{
    std::int64_t const flops           = 0;
    std::int64_t const effective_flops = 0;

    std::string const  var_name = "";
    std::int64_t const steps    = 0;
    std::int64_t const delta    = 0;
    std::int64_t const size     = 0;

    std::string const extra = "";

    // One-line description: FLOP counters, loop variable, step count, delta,
    // size and the quoted extra text.
    std::string to_string() const
    {
        std::string text = "for_loop - FLOPs: ";
        text += std::to_string(flops);
        text += ", effective FLOPs: ";
        text += std::to_string(effective_flops);
        text += ", var: ";
        text += var_name;
        text += ", steps: ";
        text += std::to_string(steps);
        text += ", delta: ";
        text += std::to_string(delta);
        text += ", size: ";
        text += std::to_string(size);
        text += ", extra\"";
        text += extra;
        text += "\"";
        return text;
    }
};
124 | 
125 | using node_info =
126 |     std::variant<compute_node_info, compiled_loop_nest_node_info,
127 |                  transpose_node_info, compiled_transpose_node_info,
128 |                  for_loop_node_info, program_node_info>;
129 | 
130 | struct node_report;
131 | 
132 | using report_vector = std::vector<std::shared_ptr<node_report>>;
133 | 
134 | struct node_report
135 | {
136 |     node_info     info;
137 |     report_vector children;
138 | 
139 |     node_report(node_info i)
140 |         : info(i)
141 |     {
142 |     }
143 | 
144 |     node_report(node_info i, report_vector&& c)
145 |         : info(i)
146 |         , children(std::move(c))
147 |     {
148 |     }
149 | };
150 | 
151 | inline void print_report_helper(std::ostringstream&  oss,
152 |                                 report_vector const& report, int indent = 0)
153 | {
154 |     for (auto const& r : report)
155 |     {
156 |         std::visit(
157 |             [&](auto const& i)
158 |             { oss << std::string(indent, '|') << i.to_string() << '\n'; },
159 |             r->info);
160 |         print_report_helper(oss, r->children, indent + 2);
161 |     }
162 | }
163 | 
164 | inline std::string print_report(report_vector const& report, int indent)
165 | {
166 |     std::ostringstream oss;
167 |     print_report_helper(oss, report, indent);
168 |     return oss.str();
169 | }
170 | 
171 | inline std::string print_report(std::shared_ptr<node_report> const& node,
172 |                                 int                                 indent = 0)
173 | {
174 |     std::ostringstream oss;
175 |     std::visit([&](auto const& i)
176 |                { oss << std::string(indent, '|') << i.to_string() << '\n'; },
177 |                node->info);
178 |     print_report_helper(oss, node->children, indent + 2);
179 |     return oss.str();
180 | }
181 | 
182 | } // namespace loop_tree
183 | } // namespace dabun
184 | 


--------------------------------------------------------------------------------
/include/dabun/loop_tree/transpose_node.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include "dabun/loop_tree/node.hpp"
  9 | 
 10 | namespace dabun
 11 | {
 12 | namespace loop_tree
 13 | {
 14 | 
 15 | template <extension VEX, class Arithmetic>
 16 | class transpose_node : public node<VEX, Arithmetic>
 17 | {
 18 | 
 19 | private:
 20 |     using super_type = node<VEX, Arithmetic>;
 21 | 
 22 |     std::string        input;
 23 |     std::string        output;
 24 |     strides_map_type   strides;
 25 |     std::optional<int> unroll_limit;
 26 | 
 27 | public:
 28 |     std::string dump(formulas_map_type const& /* formulas */,
 29 |                      std::map<std::string, int> const& /* sizes */,
 30 |                      std::string const& indent) const override
 31 |     {
 32 |         std::ostringstream ss;
 33 |         ss << indent << "Interpreted transpose" << std::endl;
 34 |         ss << indent << "Input: " << input << std::endl;
 35 |         ss << indent << "Output: " << output << std::endl;
 36 |         ss << utility::dump_strides(strides, indent);
 37 |         return ss.str();
 38 |     }
 39 | 
 40 | public:
 41 |     transpose_node(std::string const& input, std::string const& output,
 42 |                    strides_map_type const& strides,
 43 |                    std::optional<int>      unroll_limit = std::nullopt)
 44 |         : super_type(node_kind::transpose)
 45 |         , input(input)
 46 |         , output(output)
 47 |         , strides(strides)
 48 |         , unroll_limit(unroll_limit)
 49 |     {
 50 |     }
 51 | 
 52 |     std::string const& get_input() const { return input; }
 53 | 
 54 |     std::string const& get_output() const { return output; }
 55 | 
 56 |     std::optional<int> get_unroll_limit() const { return unroll_limit; }
 57 | 
 58 |     std::set<std::string> get_tensors_used() const override
 59 |     {
 60 |         return {input, output};
 61 |     }
 62 | 
 63 |     std::set<std::string> get_output_tensors() const override
 64 |     {
 65 |         return {output};
 66 |     }
 67 | 
 68 |     strides_map_type const& get_tensor_strides() const override
 69 |     {
 70 |         return strides;
 71 |     }
 72 | 
 73 |     std::pair<loop_tree_fn_type<Arithmetic>, report_vector>
 74 |     get_fn(std::map<std::string, int> const& tensors_idx,
 75 |            std::map<std::string, int> const&, std::map<std::string, int> const&,
 76 |            formulas_map_type const&, bool) const override
 77 |     {
 78 |         report_vector report = {
 79 |             std::make_shared<node_report>(transpose_node_info{})};
 80 | 
 81 |         return {[input = this->input, output = this->output,
 82 |                  input_idx  = tensors_idx.at(input),
 83 |                  output_idx = tensors_idx.at(output)](
 84 |                     std::vector<Arithmetic*>& tensors, std::vector<int>&)
 85 |                 {
 86 |                     strong_assert(tensors[input_idx]);
 87 |                     strong_assert(tensors[output_idx]);
 88 | 
 89 |                     Arithmetic* A = tensors[input_idx];
 90 |                     Arithmetic* C = tensors[output_idx];
 91 |                     C[0]          = A[0];
 92 |                 },
 93 |                 report};
 94 |     }
 95 | };
 96 | 
 97 | template <extension VEX, class Arithmetic>
 98 | node_ptr<VEX, Arithmetic>
 99 | make_transpose_node(std::string const& input, std::string const& output,
100 |                     strides_map_type const& strides,
101 |                     std::optional<int>      unroll_limit = std::nullopt)
102 | {
103 |     return node_ptr<VEX, Arithmetic>(new transpose_node<VEX, Arithmetic>(
104 |         input, output, strides, unroll_limit));
105 | }
106 | 
107 | } // namespace loop_tree
108 | } // namespace dabun
109 | 


--------------------------------------------------------------------------------
/include/dabun/loop_tree/types.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/elementwise_operation.hpp"
 9 | #include "dabun/isa.hpp"
10 | 
11 | #include <map>
12 | #include <memory>
13 | #include <set>
14 | #include <string>
15 | #include <vector>
16 | 
17 | namespace dabun
18 | {
19 | namespace loop_tree
20 | {
21 | 
22 | // forward declaration
23 | template <extension, class>
24 | class node;
25 | 
26 | // Type aliases for readability
27 | // void (vector of tensors, vector of alpha offsets)
28 | template <class Arithmetic>
29 | using loop_tree_fn_type =
30 |     std::function<void(std::vector<Arithmetic*>&, std::vector<int>&)>;
31 | 
32 | // map from name to map of strides
33 | using strides_map_type = std::map<std::string, std::map<std::string, int>>;
34 | 
35 | // map from name to set of dimensions
36 | using formulas_map_type = std::map<std::string, std::set<std::string>>;
37 | 
38 | template <class ISA>
39 | using elementwise_op_ptr = std::shared_ptr<elementwise_operation<ISA>>;
40 | 
41 | template <extension VEX, class Arithmetic>
42 | using node_ptr = std::shared_ptr<node<VEX, Arithmetic>>;
43 | 
// Arithmetic operations that can be selected by kind.
// Note: add further operations from dabun/arithmetic_operation.hpp here as
// they are needed.
enum class arithmetic_op_kind
{
    plus,       // addition
    multiplies, // multiplication
    max,        // maximum
    min         // minimum
};
53 | 
54 | } // namespace loop_tree
55 | } // namespace dabun
56 | 


--------------------------------------------------------------------------------
/include/dabun/loop_tree/utility.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include <map>
  9 | #include <sstream>
 10 | #include <string>
 11 | #include <utility>
 12 | #include <vector>
 13 | 
 14 | #include "dabun/arithmetic_operation.hpp"
 15 | #include "dabun/isa.hpp"
 16 | #include "dabun/loop_tree/types.hpp"
 17 | 
 18 | namespace dabun
 19 | {
 20 | namespace loop_tree
 21 | {
 22 | namespace utility
 23 | {
 24 | 
 25 | inline std::shared_ptr<operation_pair_base>
 26 | get_operation_pair(arithmetic_op_kind plus_op, arithmetic_op_kind multiplies_op)
 27 | {
 28 | 
 29 |     std::map<std::pair<arithmetic_op_kind, arithmetic_op_kind>,
 30 |              std::shared_ptr<operation_pair_base>>
 31 | #ifndef DABUN_ARCH_AARCH64
 32 |         op_map = {
 33 |             {{arithmetic_op_kind::plus, arithmetic_op_kind::multiplies},
 34 |              std::make_shared<
 35 |                  operation_pair<op::basic_plus, op::basic_multiplies>>()},
 36 |             {{arithmetic_op_kind::max, arithmetic_op_kind::multiplies},
 37 |              std::make_shared<operation_pair<op::max, op::basic_multiplies>>()},
 38 |             {{arithmetic_op_kind::min, arithmetic_op_kind::multiplies},
 39 |              std::make_shared<operation_pair<op::min, op::basic_multiplies>>()},
 40 |             {{arithmetic_op_kind::max, arithmetic_op_kind::plus},
 41 |              std::make_shared<operation_pair<op::max, op::basic_plus>>()}};
 42 | #else
 43 |         op_map = {{{arithmetic_op_kind::plus, arithmetic_op_kind::multiplies},
 44 |                    std::make_shared<operation_pair_base>()}};
 45 | #endif
 46 | 
 47 |     return op_map.at({plus_op, multiplies_op});
 48 | }
 49 | 
 50 | inline std::string dump_strides(strides_map_type const& strides,
 51 |                                 std::string const&      indent)
 52 | {
 53 |     std::ostringstream ss;
 54 |     ss << indent << "Strides: " << std::endl;
 55 |     for (auto const& tensor_strides : strides)
 56 |     {
 57 |         // tensor
 58 |         ss << indent << " " << tensor_strides.first << ": ";
 59 |         // strides
 60 |         for (auto const& entry : tensor_strides.second)
 61 |         {
 62 |             ss << entry.first << ":" << entry.second << " ";
 63 |         }
 64 |         ss << std::endl;
 65 |     }
 66 |     return ss.str();
 67 | }
 68 | 
 69 | inline std::string dump_formula(formulas_map_type const& formulas,
 70 |                                 std::string const&       indent)
 71 | {
 72 |     std::ostringstream ss;
 73 |     ss << indent << "Formulas: " << std::endl;
 74 |     for (auto const& tensor_formula : formulas)
 75 |     {
 76 |         // tensor
 77 |         ss << indent << " " << tensor_formula.first << ": ";
 78 |         // formula
 79 |         for (auto const& entry : tensor_formula.second)
 80 |         {
 81 |             ss << entry << " ";
 82 |         }
 83 |         ss << std::endl;
 84 |     }
 85 |     return ss.str();
 86 | }
 87 | 
 88 | inline std::string dump_tensors(std::vector<std::string> const& tensors,
 89 |                                 std::string const&              indent)
 90 | {
 91 |     std::ostringstream ss;
 92 |     for (auto const& i : tensors)
 93 |     {
 94 |         ss << indent << i << " ";
 95 |     }
 96 |     ss << std::endl;
 97 |     return ss.str();
 98 | }
 99 | 
// Formats a loop order as a single line: `indent`, "Order: ", then every
// "name:value" pair followed by a space, and a final newline.
inline std::string
dump_order(std::vector<std::pair<std::string, int>> const& order,
           std::string const&                              indent)
{
    std::ostringstream out;
    out << indent << "Order: ";
    for (auto const& [name, value] : order)
    {
        out << name << ":" << value << " ";
    }
    out << '\n';
    return out.str();
}
113 | 
// Formats dimension sizes as a single line: `indent`, "Sizes: ", then every
// "name:size" pair (in map order) followed by a space, and a final newline.
inline std::string dump_sizes(std::map<std::string, int> const& sizes,
                              std::string const&                indent)
{
    std::ostringstream out;
    out << indent << "Sizes: ";
    for (auto const& [name, size] : sizes)
    {
        out << name << ":" << size << " ";
    }
    out << '\n';
    return out.str();
}
126 | 
127 | } // namespace utility
128 | } // namespace loop_tree
129 | } // namespace dabun
130 | 


--------------------------------------------------------------------------------
/include/dabun/math.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/core.hpp"
 9 | 
10 | #include <tuple>
11 | #include <type_traits>
12 | 
13 | namespace dabun
14 | {
15 | 
16 | template <typename T>
17 | constexpr inline T ceil_div(T a, identity_type_t<T> b) noexcept
18 | {
19 |     return (a + b - 1) / b;
20 | }
21 | 
22 | template <typename T>
23 | constexpr inline T round_up(T a, identity_type_t<T> b) noexcept
24 | {
25 |     return ceil_div(a, b) * b;
26 | }
27 | 
28 | template <typename T>
29 | constexpr inline std::tuple<T, T> full_rest(T                  total,
30 |                                             identity_type_t<T> delta) noexcept
31 | {
32 |     return {total / delta, total % delta};
33 | }
34 | 
35 | // Equals to the number of iterations of the loop
36 | // for (T i = from; i < to; i += stride)
37 | // Assumes from <= to
38 | template <typename T>
39 | constexpr inline auto num_iterations(T from, identity_type_t<T> to,
40 |                                      identity_type_t<T> stride) noexcept
41 |     -> std::enable_if_t<std::is_integral_v<T>, T>
42 | {
43 |     return ceil_div(to - from, stride);
44 | }
45 | 
46 | } // namespace dabun
47 | 


--------------------------------------------------------------------------------
/include/dabun/numeric.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include <sysml/numeric.hpp>
 9 | 
10 | namespace dabun
11 | {
12 | 
13 | using sysml::fp16_t;
14 | using sysml::fp32_t;
15 | using sysml::fp64_t;
16 | 
17 | using sysml::ivec;
18 | 
19 | using namespace sysml::ivec_specializations;
20 | 
21 | } // namespace dabun
22 | 


--------------------------------------------------------------------------------
/include/dabun/one_constant.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | namespace dabun
 9 | {
10 | 
// The value 1 converted to `Float`; one object per type.
template <typename Float>
Float const one_actual_constant = static_cast<Float>(1);

// Constant pointer to one_actual_constant<Float>, for code that needs the
// address of a 1 rather than its value.
template <typename Float>
Float const* const one_constant = &one_actual_constant<Float>;
16 | 
17 | } // namespace dabun
18 | 


--------------------------------------------------------------------------------
/include/dabun/peak_gflops.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/isa.hpp"
 9 | 
10 | #if defined(DABUN_ARCH_AARCH64)
11 | #    include "dabun/arm/peak_gflops.hpp"
12 | #else
13 | #    include "dabun/x86/peak_gflops.hpp"
14 | #endif
15 | 
16 | namespace dabun::impl
17 | {
18 | 
19 | } // namespace dabun::impl
20 | 
21 | namespace dabun
22 | {
23 | 
24 | namespace detail
25 | {
26 | 
// Implementation holder for the peak GFLOPS measurements. The explicit
// instantiations in this header use an ISA tag for T (e.g. avx2, aarch64)
// and an arithmetic type for A (e.g. float, fp16_t).
template <class T, class A>
struct peak_gflops_impl
{
    // Runs one benchmark with the given iteration count.
    static double peak_gflops(int iterations = 1000000);

    // Repeats the benchmark with a doubling iteration count; see the
    // definition for the stopping rule.
    static double measure_peak_gflops(double secs,
                                      int    max_iterations = 1000000);
};
34 | 
35 | #if defined(DABUN_REQUIES_TEMPLATE_DEFINITION)
36 | 
37 | template <class T, class A>
38 | double peak_gflops_impl<T, A>::peak_gflops(int iterations)
39 | {
40 |     auto measurement =
41 |         DABUN_ISA_NAMESPACE ::bench_gflops<T, A>::do_bench(iterations);
42 |     return measurement.first / measurement.second;
43 | }
44 | 
45 | template <class T, class A>
46 | double peak_gflops_impl<T, A>::measure_peak_gflops(double secs,
47 |                                                    int    max_iterations)
48 | {
49 |     int  cur_it = 1;
50 |     auto measurement =
51 |         DABUN_ISA_NAMESPACE ::bench_gflops<T, A>::do_bench(cur_it);
52 | 
53 |     while (measurement.first < secs && cur_it <= max_iterations)
54 |     {
55 |         cur_it *= 2;
56 |         measurement =
57 |             DABUN_ISA_NAMESPACE ::bench_gflops<T, A>::do_bench(cur_it);
58 |     }
59 | 
60 |     return measurement.first / measurement.second;
61 | }
62 | 
63 | #endif
64 | 
// Explicit instantiation lines for the supported (ISA, floating-point type)
// combinations.  Each line is prefixed with DABUN_MAYBE_EXTN_TPL_INSTNTON,
// which presumably expands to `extern template` or `template` depending on
// the translation unit -- confirm where the macro is defined.
#if defined(DABUN_MAYBE_EXTN_TPL_INSTNTON)

#    if defined(DABUN_ARCH_AARCH64)

DABUN_MAYBE_EXTN_TPL_INSTNTON struct peak_gflops_impl<aarch64, fp32_t>;
DABUN_MAYBE_EXTN_TPL_INSTNTON struct peak_gflops_impl<aarch64, fp16_t>;

#    else

DABUN_MAYBE_EXTN_TPL_INSTNTON struct peak_gflops_impl<avx2, float>;
DABUN_MAYBE_EXTN_TPL_INSTNTON struct peak_gflops_impl<avx512, float>;
DABUN_MAYBE_EXTN_TPL_INSTNTON struct peak_gflops_impl<avx2_plus, float>;

#    endif

#endif
81 | 
82 | } // namespace detail
83 | 
84 | template <class T, class A>
85 | double peak_gflops(int iterations = 1000000)
86 | {
87 |     return detail::peak_gflops_impl<T, A>::peak_gflops(iterations);
88 | }
89 | 
90 | template <class T, class A>
91 | double measure_peak_gflops(double secs, int max_iterations = 1000000)
92 | {
93 |     return detail::peak_gflops_impl<T, A>::measure_peak_gflops(secs,
94 |                                                                max_iterations);
95 | }
96 | 
97 | } // namespace dabun
98 | 


--------------------------------------------------------------------------------
/include/dabun/predef.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include <sysml/predef.hpp>
 9 | 
10 | // Inspired by boost predef
11 | 
// Packs a (major, minor, patch) triple into a single value by delegating to
// SYSML_VERSION_NUMBER.
#define DABUN_VERSION_NUMBER(major, minor, patch)                              \
    SYSML_VERSION_NUMBER(major, minor, patch)

// Exactly one of DABUN_COMP_CLANG / DABUN_COMP_GNUC is defined, holding the
// packed compiler version.  __clang__ is tested first, so a compiler that
// defines both __clang__ and __GNUC__ is reported as clang only.
#if defined(__clang__)

#    define DABUN_COMP_CLANG                                                   \
        DABUN_VERSION_NUMBER(__clang_major__, __clang_minor__,                 \
                             __clang_patchlevel__)

#elif defined(__GNUC__)

#    define DABUN_COMP_GNUC                                                    \
        DABUN_VERSION_NUMBER(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__)

#else

// Any other compiler is rejected at preprocessing time.
#    error "Compiler not supported"

#endif
31 | 


--------------------------------------------------------------------------------
/include/dabun/random_vector.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include "dabun/aligned_vector.hpp"
  9 | #include "dabun/numeric.hpp"
 10 | #include "sysml/random.hpp"
 11 | 
 12 | #include <limits>
 13 | #include <random>
 14 | #include <type_traits>
 15 | 
 16 | namespace dabun
 17 | {
 18 | 
 19 | namespace detail
 20 | {
 21 | 
 22 | template <class T>
 23 | struct random_initalizer_helper
 24 | {
 25 | 
 26 |     template <class Float = T>
 27 |     static auto get_random_vector(unsigned size, unsigned extra_elements)
 28 |         -> std::enable_if_t<std::is_floating_point_v<Float> ||
 29 |                                 std::is_same_v<Float, fp16_t>,
 30 |                             aligned_vector<Float>>
 31 |     {
 32 |         aligned_vector<Float> res(size + extra_elements);
 33 | 
 34 |         std::random_device rd;
 35 |         std::mt19937       gen(0); // rd());
 36 | 
 37 |         sysml::uniform_distribution<double> dis(-1.0, 1.0);
 38 | 
 39 |         for (auto& f : res)
 40 |         {
 41 |             f = dis(gen);
 42 |         }
 43 | 
 44 |         return res;
 45 |     }
 46 | 
 47 |     template <class Integer = T>
 48 |     static auto get_random_vector(unsigned size, unsigned extra_elements)
 49 |         -> std::enable_if_t<std::is_integral_v<Integer>,
 50 |                             aligned_vector<Integer>>
 51 |     {
 52 |         aligned_vector<Integer> res(size + extra_elements);
 53 | 
 54 |         std::random_device rd;
 55 |         std::mt19937       gen(0); // rd());
 56 | 
 57 |         sysml::uniform_distribution<Integer> dis(
 58 |             std::numeric_limits<Integer>::min(),
 59 |             std::numeric_limits<Integer>::max());
 60 | 
 61 |         for (auto& f : res)
 62 |         {
 63 |             f = dis(gen);
 64 |         }
 65 | 
 66 |         return res;
 67 |     }
 68 | 
 69 |     template <class Float = T>
 70 |     static auto get_zero_vector(unsigned size, unsigned extra_elements)
 71 |         -> std::enable_if_t<std::is_floating_point_v<Float> ||
 72 |                                 std::is_same_v<Float, fp16_t>,
 73 |                             aligned_vector<Float>>
 74 |     {
 75 |         aligned_vector<Float> res(size + extra_elements);
 76 |         return res;
 77 |     }
 78 | 
 79 |     template <class Integer = T>
 80 |     static auto get_zero_vector(unsigned size, unsigned extra_elements)
 81 |         -> std::enable_if_t<std::is_integral_v<Integer>,
 82 |                             aligned_vector<Integer>>
 83 |     {
 84 |         aligned_vector<Integer> res(size + extra_elements);
 85 |         return res;
 86 |     }
 87 | };
 88 | 
 89 | } // namespace detail
 90 | 
 91 | template <class T>
 92 | decltype(auto) get_random_vector(unsigned size, unsigned extra_elements = 16)
 93 | {
 94 |     return detail::random_initalizer_helper<T>::get_random_vector(
 95 |         size, extra_elements);
 96 | }
 97 | 
 98 | template <class T>
 99 | decltype(auto) get_zero_vector(unsigned size, unsigned extra_elements = 16)
100 | {
101 |     return detail::random_initalizer_helper<T>::get_zero_vector(size,
102 |                                                                 extra_elements);
103 | }
104 | 
105 | template <class To, class From>
106 | auto aligned_vector_cast(aligned_vector<From> const& from)
107 |     -> std::enable_if_t<std::is_convertible_v<From, To>, aligned_vector<To>>
108 | {
109 |     aligned_vector<To> ret(from.size());
110 | 
111 |     for (std::size_t i = 0; i < from.size(); ++i)
112 |     {
113 |         ret[i] = static_cast<To>(from[i]);
114 |     }
115 | 
116 |     return ret;
117 | }
118 | 
119 | } // namespace dabun
120 | 


--------------------------------------------------------------------------------
/include/dabun/tensillica/dl_compiled_fn.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #include once
 7 | 
 8 | namespace dabun
 9 | {
10 | namespace tensillica
11 | {
12 | 
13 | template <typename Signature>
14 | class unique_dl_compiled_fn;
15 | 
16 | template <typename Signature>
17 | class shared_dl_compiled_fn;
18 | 
19 | template <typename Signature>
20 | class weak_dl_compiled_fn;
21 | 
22 | template <typename ReturnType, typename... Args>
23 | class unique_dl_compiled_fn<ReturnType(Args...)>
24 | {
25 | public:
26 |     using function_pointer_type = ReturnType (*)(Args...);
27 | };
28 | 
29 | } // namespace tensillica
30 | } // namespace dabun
31 | 


--------------------------------------------------------------------------------
/include/dabun/tensillica/multi_vmm.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include "dabun/numeric.hpp"
  9 | 
 10 | #include <cassert>
 11 | #include <type_traits>
 12 | 
 13 | namespace dabun
 14 | {
 15 | namespace tensillica
 16 | {
 17 | 
 18 | // The main usage of the multi_vmm class is to increase the amount of
 19 | // independent operations when accumulating to a single vector
 20 | // register.  This is accomplished by using multiple vector registers
 21 | // which are reduced to a single one at the end.  Each of the size_
 22 | // registers is independent of all the other ones.
 23 | 
 24 | template <class VReg, class SReg, class Reg32>
 25 | class multi_vmm
 26 | {
 27 | private:
 28 |     int size_          = 0;
 29 |     int first_         = 0;
 30 |     int current_       = 0;
 31 |     int vlen_          = 4;
 32 |     int original_size_ = 0;
 33 | 
 34 | public:
 35 |     multi_vmm() {}
 36 | 
 37 |     multi_vmm(int s, int f)
 38 |         : size_(s)
 39 |         , first_(f)
 40 |         , current_(0)
 41 |         , original_size_(s)
 42 |     {
 43 |         assert(s > 0);
 44 |     }
 45 | 
 46 |     void reset()
 47 |     {
 48 |         size_    = original_size_;
 49 |         current_ = 0;
 50 |     }
 51 | 
 52 |     multi_vmm(multi_vmm const&)            = delete;
 53 |     multi_vmm& operator=(multi_vmm const&) = delete;
 54 | 
 55 |     multi_vmm(multi_vmm&& o) { *this = std::move(o); }
 56 | 
 57 |     multi_vmm& operator=(multi_vmm&& o)
 58 |     {
 59 |         assert(o.size_ > 0);
 60 |         size_          = o.size_;
 61 |         first_         = o.first_;
 62 |         current_       = o.current_;
 63 |         original_size_ = o.original_size_;
 64 |         return *this;
 65 |     }
 66 | 
 67 |     int size() const { return size_; }
 68 | 
 69 |     VReg operator++(int)
 70 |     {
 71 |         int c    = current_;
 72 |         current_ = (current_ + 1) % size_;
 73 |         return VReg(first_ + c);
 74 |     }
 75 | 
 76 |     VReg operator[](int s) const
 77 |     {
 78 |         assert(s < size_);
 79 |         return VReg(first_ + s);
 80 |     }
 81 | 
 82 |     VReg operator++()
 83 |     {
 84 |         current_ = (current_ + 1) % size_;
 85 |         return VReg(first_ + current_);
 86 |     }
 87 | 
 88 |     VReg current() const { return VReg(first_ + current_); }
 89 | 
 90 |     VReg first() const { return VReg(first_); }
 91 | 
 92 |     template <class Code_Generator>
 93 |     void half(Code_Generator& code_generator)
 94 |     {
 95 |         int h = (size_ + 1) / 2;
 96 |         for (int i = 0; i + h < size_; ++i)
 97 |         {
 98 |             code_generator.fadd(VReg(first_ + i).s4, VReg(first_ + i).s4,
 99 |                                 VReg(first_ + i + h).s4);
100 |         }
101 |         size_    = h;
102 |         current_ = 0;
103 |     }
104 | 
105 |     template <class Code_Generator>
106 |     void reduce(Code_Generator& code_generator)
107 |     {
108 |         while (size_ > 1)
109 |         {
110 |             half(code_generator);
111 |         }
112 |     }
113 | 
114 |     template <class Code_Generator>
115 |     void full_reduce(Code_Generator& code_generator, int mask = 4,
116 |                      int zero_vector = 0)
117 |     {
118 |         reduce(code_generator);
119 |         assert(size_ == 1);
120 | 
121 |         {
122 |             if (mask == 3)
123 |             {
124 |                 // x4/w4 is zero reg by convention in the loop_nest.hpp
125 |                 code_generator.ins(VReg(first_).s4[3], Reg32(zero_vector));
126 |             }
127 |             if (mask > 2)
128 |             {
129 |                 code_generator.faddp(VReg(first_).s4, VReg(first_).s4,
130 |                                      VReg(first_).s4);
131 |             }
132 |             if (mask > 1)
133 |             {
134 |                 code_generator.faddp(SReg(first_), VReg(first_).s2);
135 |             }
136 |         }
137 |     }
138 | };
139 | 
140 | } // namespace tensillica
141 | } // namespace dabun
142 | 


--------------------------------------------------------------------------------
/include/dabun/tensillica/peak_gflops.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include <iostream>
 9 | 
10 | #include "dabun/tensillica/cpp_intrinsics_code_generator.hpp"
11 | 
12 | // #include "dabun/isa.hpp"
13 | // #ifdef DABUN_ARCH_AARCH64
14 | 
15 | // #    include "dabun/code_generator/code_generator.hpp"
16 | // #    include "dabun/isa.hpp"
17 | // #    include "dabun/math.hpp"
18 | // #    include "dabun/numeric.hpp"
19 | 
20 | // #    include <sysml/measure.hpp>
21 | 
22 | #include <utility>
23 | 
24 | namespace dabun
25 | {
26 | namespace tensillica
27 | {
28 | 
// Code generator for a function with signature
// std::uint64_t(float*, float*, std::uint64_t).  All code is emitted from the
// constructor, in the order written below.
//
// NOTE(review): despite the name, the emitted sequence looks like a small
// experiment rather than a peak-throughput kernel -- confirm intent.
struct peak_gflops : cpp_intrinsics_code_generator<std::uint64_t(float*, float*, std::uint64_t)>
{
    peak_gflops()
    {
        // Fully reduce an 8-register group that starts at register 0.
        multi_vmm<Vmm, SReg, Reg32> mvmm(8, 0);
        mvmm.full_reduce(*this, 4, 0);

        // Load a register pair through x0.
        ldp(vmm0.s4, vmm1.s4, pre_ptr(x0, 4));

        mov(vmm1.s4, vmm0.s4);
        fmla(vmm1.s4, vmm1.s4, vmm0.s[1]);


        // ins(vmm0.s[1], w2);
        // Store the register pair through x1.
        stp(vmm0.s4, vmm1.s4, ptr(x1));

        // Literal text appended to the generated code.
        custom_string("return x0 + x1 + x2;");
    }
};
48 | 
// Variant of the generator above: emits only the full reduction of an
// 8-register group starting at register 0, followed by a literal return
// statement that multiplies (instead of adds) x0, x1 and x2.
struct peak_gflopsw : cpp_intrinsics_code_generator<std::uint64_t(float*, float*, std::uint64_t)>
{
    peak_gflopsw()
    {
        multi_vmm<Vmm, SReg, Reg32> mvmm(8, 0);
        mvmm.full_reduce(*this, 4, 0);
        // Literal text appended to the generated code.
        custom_string("return x0 * x1 * x2;");
    }
};
58 | 
59 | 
60 | 
61 | } // namespace tensillica
62 | } // namespace dabun
63 | 


--------------------------------------------------------------------------------
/include/dabun/transposer.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/isa.hpp"
 9 | 
10 | #if defined(DABUN_ARCH_AARCH64)
11 | #include "dabun/arm/transposer.hpp"
12 | #else
13 | #include "dabun/x86/transposer.hpp"
14 | #endif
15 | 
16 | namespace dabun
17 | {
18 | 
// Bring the ISA-specific transposer_code_generator into namespace dabun.
using DABUN_ISA_NAMESPACE ::transposer_code_generator;

// transposer_compiler<VEX, Arithmetic> maps the `extension` enumerator VEX to
// the older ISA tag type via extension_to_deprecated_ISA_t and names the
// matching transposer_code_generator specialisation.
#if defined(DABUN_ARCH_AARCH64)

// On AArch64 the generator is additionally parameterised on Arithmetic.
template <extension VEX, class Arithmetic>
using transposer_compiler =
    transposer_code_generator<extension_to_deprecated_ISA_t<VEX>, Arithmetic>;

#else

// On other architectures Arithmetic is accepted for a uniform interface but
// is not forwarded to the generator.
template <extension VEX, class Arithmetic>
using transposer_compiler =
    transposer_code_generator<extension_to_deprecated_ISA_t<VEX>>;

#endif
34 | 
35 | } // namespace dabun
36 | 


--------------------------------------------------------------------------------
/include/dabun/utility/log.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include <fstream>
 9 | #include <iostream>
10 | 
11 | namespace dabun
12 | {
13 | 
// Compile-time logging switches: both are true unless NDEBUG is defined, in
// which case both are false.
#ifndef NDEBUG
static constexpr bool DEBUG = true;
static constexpr bool INFO  = true;
#else
static constexpr bool DEBUG = false;
static constexpr bool INFO  = false;
#endif
21 | 
22 | #if defined(DABUN_LOG_TO_FILE)
23 | 
// Stream-like logger that appends to the file "dabun_loop_nest.log" when
// enabled and silently discards its arguments otherwise.
class LN_LOG
{
private:
    bool print_ = false;

    // The single output stream shared by every insertion.
    //
    // It lives in a non-template function on purpose: a function-local static
    // inside the templated operator<< would be instantiated once per argument
    // type, and every such instance would open (and truncate) the same file
    // through its own buffer.  The file is opened on first use.
    static std::ofstream& log_file()
    {
        static std::ofstream fout("dabun_loop_nest.log");
        return fout;
    }

public:
    // `p` decides whether insertions are written (true) or dropped (false).
    explicit LN_LOG(bool p)
        : print_(p)
    {
    }

    // Writes `v` to the log file when enabled.  Returns *this so insertions
    // can be chained.
    template <class T>
    LN_LOG const& operator<<(T&& v) const
    {
        if (print_)
        {
            log_file() << v;
        }
        return *this;
    }
};
46 | 
47 | #else
48 | 
// Stream-like logger that forwards to std::cout when enabled and silently
// discards its arguments otherwise.
class LN_LOG
{
private:
    bool print_ = false;

public:
    // `p` decides whether insertions are printed (true) or dropped (false).
    explicit LN_LOG(bool p)
        : print_(p)
    {
    }

    // Prints `v` when enabled.  Returns *this so insertions can be chained.
    template <class T>
    LN_LOG const& operator<<(T&& v) const
    {
        if (!print_)
        {
            return *this;
        }

        std::cout << v;
        return *this;
    }
};
70 | 
71 | #endif
72 | 
73 | } // namespace dabun
74 | 


--------------------------------------------------------------------------------
/include/dabun/utility/most_frequent_queue.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include <cassert>
  9 | #include <cstddef>
 10 | #include <map>
 11 | #include <set>
 12 | #include <utility>
 13 | 
 14 | namespace dabun::utility
 15 | {
 16 | 
 17 | // A queue that holds values of type T and a count for each value.  We
 18 | // can queue the most abundant value, remove it, as well as increase
 19 | // or decrease instances of each value by 1.
 20 | 
 21 | template <class T>
 22 | struct most_frequent_queue
 23 | {
 24 | private:
 25 |     using map_t = std::map<std::size_t, std::set<T>>;
 26 | 
 27 |     map_t                    sorted_;
 28 |     map_t                    ignored_;
 29 |     std::map<T, std::size_t> counts_;
 30 | 
 31 |     bool was_read_ = false;
 32 | 
 33 |     std::pair<map_t*, std::size_t> remove_existing(T const& v)
 34 |     {
 35 |         assert(counts_.count(v) > 0);
 36 | 
 37 |         std::size_t s = counts_[v];
 38 | 
 39 |         if (sorted_.count(s) > 0 && sorted_[s].count(v))
 40 |         {
 41 |             auto& bucket = sorted_[s];
 42 | 
 43 |             assert(bucket.count(v) > 0);
 44 | 
 45 |             bucket.erase(v);
 46 |             if (bucket.size() == 0)
 47 |             {
 48 |                 sorted_.erase(s);
 49 |             }
 50 | 
 51 |             counts_.erase(v);
 52 |             return {&sorted_, s};
 53 |         }
 54 |         else
 55 |         {
 56 |             assert(ignored_.count(s) > 0);
 57 | 
 58 |             auto& bucket = ignored_[s];
 59 | 
 60 |             assert(bucket.count(v) > 0);
 61 | 
 62 |             bucket.erase(v);
 63 |             if (bucket.size() == 0)
 64 |             {
 65 |                 ignored_.erase(s);
 66 |             }
 67 | 
 68 |             counts_.erase(v);
 69 |             return {&ignored_, s};
 70 |         }
 71 |     }
 72 | 
 73 |     void add_with_count(map_t* where, T const& v, std::size_t s)
 74 |     {
 75 |         counts_[v] = s;
 76 |         assert(where->count(s) == 0 || (*where)[s].count(v) == 0);
 77 |         (*where)[s].insert(v);
 78 |     }
 79 | 
 80 |     void pop_or_skip(map_t* /* where */) {}
 81 | 
 82 | public:
 83 |     std::size_t size() const { return sorted_.size(); }
 84 | 
 85 |     T top() const
 86 |     {
 87 |         assert(sorted_.size());
 88 |         return *(sorted_.crbegin()->second.cbegin());
 89 |     }
 90 | 
 91 |     T get_top_then_pop()
 92 |     {
 93 |         was_read_ = true;
 94 |         T ret     = top();
 95 |         pop();
 96 |         return ret;
 97 |     }
 98 | 
 99 |     void pop()
100 |     {
101 |         was_read_ = true;
102 |         assert(sorted_.size() > 0);
103 |         auto& slot = sorted_.rbegin()->second;
104 | 
105 |         assert(slot.size() > 0);
106 |         auto const& v = *slot.begin();
107 | 
108 |         assert(counts_.count(v) > 0);
109 |         counts_.erase(v);
110 | 
111 |         slot.erase(slot.begin());
112 |         if (slot.size() == 0)
113 |         {
114 |             sorted_.erase(sorted_.rbegin()->first);
115 |         }
116 |     }
117 | 
118 |     void skip()
119 |     {
120 |         was_read_ = true;
121 |         assert(sorted_.size() > 0);
122 | 
123 |         auto& slot = sorted_.rbegin()->second;
124 | 
125 |         assert(slot.size() > 0);
126 |         auto const& v = *slot.begin();
127 | 
128 |         assert(counts_.count(v) > 0);
129 | 
130 |         slot.erase(slot.begin());
131 |         if (slot.size() == 0)
132 |         {
133 |             sorted_.erase(sorted_.rbegin()->first);
134 |         }
135 | 
136 |         ignored_[counts_[v]].insert(v);
137 |     }
138 | 
139 |     void inc(T const& v)
140 |     {
141 |         assert(!was_read_);
142 | 
143 |         if (counts_.count(v))
144 |         {
145 |             auto s = remove_existing(v);
146 |             add_with_count(s.first, v, s.second + 1);
147 |         }
148 |         else
149 |         {
150 |             add_with_count(&sorted_, v, 1);
151 |         }
152 |     }
153 | 
154 |     void dec(T const& v)
155 |     {
156 |         assert(was_read_);
157 |         assert(counts_.count(v));
158 | 
159 |         auto s = remove_existing(v);
160 |         if (--s.second > 0)
161 |         {
162 |             add_with_count(s.first, v, s.second);
163 |         }
164 |     }
165 | };
166 | 
167 | } // namespace dabun::utility
168 | 


--------------------------------------------------------------------------------
/include/dabun/utility/tmp_file_name.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include <atomic>
 9 | #include <sstream>
10 | #include <string>
11 | #include <thread>
12 | 
13 | namespace dabun::utility
14 | {
15 | 
// Builds a file name of the form "<dir>/<thread-id>_<n><suffix>", where n is
// a process-wide counter that starts at 0 and grows by one per call.  The
// file itself is not created.
inline std::string get_temporary_file_name(std::string const& suffix,
                                           std::string const& dir = "/tmp")
{
    static std::atomic<int> next_index(0);

    int const index = next_index.fetch_add(1);

    std::ostringstream name;
    name << dir;
    name << "/";
    name << std::this_thread::get_id();
    name << "_";
    name << index;
    name << suffix;
    return name.str();
}
26 | 
27 | } // namespace dabun::utility
28 | 


--------------------------------------------------------------------------------
/include/dabun/x86/configuration.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/isa.hpp"
 9 | #ifdef DABUN_ARCH_X86_64
10 | 
11 | namespace dabun
12 | {
13 | namespace x86
14 | {
15 | 
// Immutable set of three on/off switches for code-generation optimizations.
class OptimizationConfiguration
{
private:
    bool delay_innermost_operations_;
    bool split_vector_registers_;
    bool use_address_packer_;

public:
    // Enables every optimization.
    OptimizationConfiguration()
        : OptimizationConfiguration(true, true, true)
    {
    }

    // Sets each switch explicitly.
    OptimizationConfiguration(bool delay_innermost_operations,
                              bool split_vector_registers,
                              bool use_address_packer)
        : delay_innermost_operations_(delay_innermost_operations)
        , split_vector_registers_(split_vector_registers)
        , use_address_packer_(use_address_packer)
    {
    }

    bool delay_innermost_operations() const
    {
        return delay_innermost_operations_;
    }

    bool split_vector_registers() const
    {
        return split_vector_registers_;
    }

    bool use_address_packer() const
    {
        return use_address_packer_;
    }
};
50 | 
// Ready-made configuration with all three switches enabled.
inline OptimizationConfiguration all_optims(true, true, true);

// Ready-made configuration with all three switches disabled.
// technically no optimizations beyond output tensor register blocking
inline OptimizationConfiguration no_optims(false, false, false);
55 | 
56 | } // namespace x86
57 | } // namespace dabun
58 | 
59 | #endif
60 | 


--------------------------------------------------------------------------------
/include/dabun/x86/denormals.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/isa.hpp"
 9 | #ifdef DABUN_ARCH_X86_64
10 | 
11 | // Mostly from
12 | // https://en.wikipedia.org/wiki/Denormal_number#Disabling_denormal_floats_at_the_code_level
13 | 
14 | #include <xmmintrin.h>
15 | 
16 | #define LN_MM_DENORMALS_ZERO_MASK 0x0040
17 | #define LN_MM_DENORMALS_ZERO_ON 0x0040
18 | #define LN_MM_DENORMALS_ZERO_OFF 0x0000
19 | 
20 | #define LN_MM_FLUSH_ZERO_MASK 0x8000
21 | #define LN_MM_FLUSH_ZERO_ON 0x8000
22 | #define LN_MM_FLUSH_ZERO_OFF 0x0000
23 | 
24 | #define LN_MM_SET_DENORMALS_ZERO_MODE(mode)                                    \
25 |     _mm_setcsr((_mm_getcsr() & ~LN_MM_DENORMALS_ZERO_MASK) | (mode))
26 | 
27 | #define LN_MM_GET_DENORMALS_ZERO_MODE()                                        \
28 |     (_mm_getcsr() & LN_MM_DENORMALS_ZERO_MASK)
29 | 
30 | #define LN_MM_SET_FLUSH_ZERO_MODE(mode)                                        \
31 |     _mm_setcsr((_mm_getcsr() & ~LN_MM_FLUSH_ZERO_MASK) | (mode))
32 | 
33 | #define LN_MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & LN_MM_FLUSH_ZERO_MASK)
34 | 
35 | namespace dabun::detail
36 | {
37 | class denormals_disabler
38 | {
39 | private:
40 |     unsigned int previous_value;
41 | 
42 | public:
43 |     denormals_disabler()
44 |     {
45 |         previous_value = _mm_getcsr();
46 |         _mm_setcsr(previous_value | LN_MM_DENORMALS_ZERO_ON |
47 |                    LN_MM_FLUSH_ZERO_ON);
48 |     }
49 | 
50 |     ~denormals_disabler() { _mm_setcsr(previous_value); }
51 | 
52 |     denormals_disabler(denormals_disabler const&) = delete;
53 |     denormals_disabler& operator=(denormals_disabler const&) = delete;
54 | };
55 | 
// Namespace-scope instance: its constructor runs as part of program start-up
// in any program that includes this header, so the denormal-related MXCSR
// bits are set without the user calling anything.
inline denormals_disabler denormals_disabler_instance;
57 | 
58 | } // namespace dabun::detail
59 | 
60 | #undef LN_MM_DENORMALS_ZERO_MASK
61 | #undef LN_MM_DENORMALS_ZERO_ON
62 | #undef LN_MM_DENORMALS_ZERO_OFF
63 | 
64 | #undef LN_MM_SET_DENORMALS_ZERO_MODE
65 | #undef LN_MM_GET_DENORMALS_ZERO_MODE
66 | 
67 | #undef LN_MM_FLUSH_ZERO_MASK
68 | #undef LN_MM_FLUSH_ZERO_ON
69 | #undef LN_MM_FLUSH_ZERO_OFF
70 | 
71 | #undef LN_MM_SET_FLUSH_ZERO_MODE
72 | #undef LN_MM_GET_FLUSH_ZERO_MODE
73 | 
74 | #endif
75 | 


--------------------------------------------------------------------------------
/include/dabun/x86/multi_vmm.hpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #pragma once
  7 | 
  8 | #include "dabun/isa.hpp"
  9 | #ifdef DABUN_ARCH_X86_64
 10 | 
 11 | #include "dabun/x86/arithmetic_operation.hpp"
 12 | 
 13 | #include <cassert>
 14 | 
 15 | namespace dabun
 16 | {
 17 | namespace x86
 18 | {
 19 | 
 20 | // The main usage of the multi_vmm class is to increase the amount of
 21 | // independent operations when accumulating to a single vector
 22 | // register.  This is accomplished by using multiple vector registers
 23 | // which are reduced to a single one at the end.  Each of the size_
 24 | // registers is independent of all the other ones.
 25 | 
 26 | template <class Vmm>
 27 | class multi_vmm
 28 | {
 29 | private:
 30 |     int size_          = 0;
 31 |     int first_         = 0;
 32 |     int current_       = 0;
 33 |     int original_size_ = 0;
 34 |     int max_touched_   = 0;
 35 | 
 36 | public:
 37 |     multi_vmm() {}
 38 | 
 39 |     multi_vmm(int s, int f)
 40 |         : size_(s)
 41 |         , first_(f)
 42 |         , current_(0)
 43 |         , original_size_(s)
 44 |         , max_touched_(0)
 45 |     {
 46 |         assert(s > 0);
 47 |     }
 48 | 
 49 |     void reset()
 50 |     {
 51 |         size_    = original_size_;
 52 |         current_ = 0;
 53 |     }
 54 | 
 55 |     multi_vmm(multi_vmm const&) = delete;
 56 |     multi_vmm& operator=(multi_vmm const&) = delete;
 57 | 
 58 |     multi_vmm(multi_vmm&& o) { *this = std::move(o); }
 59 | 
 60 |     multi_vmm& operator=(multi_vmm&& o)
 61 |     {
 62 |         assert(o.size_ > 0);
 63 |         size_          = o.size_;
 64 |         first_         = o.first_;
 65 |         current_       = o.current_;
 66 |         original_size_ = o.original_size_;
 67 |         max_touched_   = o.max_touched_;
 68 |         return *this;
 69 |     }
 70 | 
 71 |     int size() const { return size_; }
 72 | 
 73 |     Vmm operator++(int)
 74 |     {
 75 |         int c        = current_;
 76 |         current_     = (current_ + 1) % size_;
 77 |         max_touched_ = std::max(max_touched_, current_);
 78 |         return Vmm(first_ + c);
 79 |     }
 80 | 
 81 |     Vmm operator[](int s) const
 82 |     {
 83 |         assert(s < size_);
 84 |         return Vmm(first_ + s);
 85 |     }
 86 | 
 87 |     Vmm operator++()
 88 |     {
 89 |         current_ = (current_ + 1) % size_;
 90 |         return Vmm(first_ + current_);
 91 |     }
 92 | 
 93 |     Vmm current() const { return Vmm(first_ + current_); }
 94 | 
 95 |     Vmm first() const { return Vmm(first_); }
 96 | 
 97 |     template <class Code_Generator>
 98 |     void half(Code_Generator&                      code_generator,
 99 |               std::shared_ptr<operation_pair_base> op_pair)
100 |     {
101 |         int h = (size_ + 1) / 2;
102 |         for (int i = 0; i + h < size_; ++i)
103 |         {
104 |             op_pair->plus(code_generator, Vmm(first_ + i), Vmm(first_ + i),
105 |                           Vmm(first_ + i + h));
106 |         }
107 |         size_    = h;
108 |         current_ = 0;
109 |     }
110 | 
111 |     template <class Code_Generator>
112 |     void reduce(Code_Generator&                      code_generator,
113 |                 std::shared_ptr<operation_pair_base> op_pair)
114 |     {
115 |         // size_ = max_touched_ + 1;
116 |         while (size_ > 1)
117 |         {
118 |             half(code_generator, op_pair);
119 |         }
120 |     }
121 | };
122 | 
123 | } // namespace x86
124 | } // namespace dabun
125 | 
126 | #endif
127 | 


--------------------------------------------------------------------------------
/include/dabun/x86/peak_gflops.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/isa.hpp"
 9 | 
10 | #ifdef DABUN_ARCH_X86_64
11 | 
12 | #    include "dabun/code_generator/code_generator.hpp"
13 | #    include "dabun/math.hpp"
14 | #    include "dabun/x86/xbyak.hpp"
15 | 
16 | #    include <sysml/measure.hpp>
17 | 
18 | #    include <utility>
19 | 
20 | namespace dabun
21 | {
22 | namespace x86
23 | {
24 | 
25 | template <class ISA, class Float = float>
26 | struct bench_gflops
27 | {
28 | private:
29 |     static_assert(std::is_same_v<ISA, avx2> || std::is_same_v<ISA, avx512> ||
30 |                   std::is_same_v<ISA, avx2_plus>);
31 |     static_assert(std::is_same_v<Float, float>);
32 | 
33 |     using Vmm =
34 |         std::conditional_t<std::is_same_v<ISA, avx512>, Xbyak::Zmm, Xbyak::Ymm>;
35 |     static constexpr int vector_size = isa_traits<ISA>::vector_size;
36 |     static constexpr int num_vector_regs =
37 |         isa_traits<ISA>::total_vector_registers;
38 | 
39 |     class test : public code_generator<void(float*)>
40 |     {
41 |     public:
42 |         test(int iterations)
43 |         {
44 |             Label loopLabel;
45 |             mov(rax, 0);
46 |             L(loopLabel);
47 | 
48 |             vbroadcastss(Vmm(num_vector_regs - 1), ptr[rdi]);
49 |             vbroadcastss(Vmm(num_vector_regs - 2), ptr[rdi]);
50 | 
51 |             for (int i = 0; i < 10; ++i)
52 |             {
53 |                 for (int j = 0; j < num_vector_regs - 2; ++j)
54 |                 {
55 |                     vfmadd231ps(Vmm(j), Vmm(num_vector_regs - 1),
56 |                                 Vmm(num_vector_regs - 2));
57 |                 }
58 |             }
59 | 
60 |             add(rax, 1);
61 |             cmp(rax, iterations);
62 |             jl(loopLabel);
63 |             ret();
64 |         }
65 |     };
66 | 
67 | public:
68 |     static std::pair<double, double> do_bench(int iterations = 10000000)
69 |     {
70 |         auto  fn      = test(iterations).get_shared();
71 |         float data[1] = {0};
72 | 
73 |         auto secs = sysml::measure_fastest([&]() { fn(data); }, 100);
74 | 
75 |         double gflops = 2.0 * iterations * 10 * (num_vector_regs - 2) *
76 |                         vector_size / 1000000000;
77 | 
78 |         return {gflops, secs};
79 |     }
80 | };
81 | 
82 | #    ifndef DABUN_HEADER_ONLY
83 | 
84 | extern template struct dabun::x86::bench_gflops<avx2, float>;
85 | extern template struct dabun::x86::bench_gflops<avx512, float>;
86 | extern template struct dabun::x86::bench_gflops<avx2_plus, float>;
87 | 
88 | #    endif
89 | 
90 | } // namespace x86
91 | } // namespace dabun
92 | 
93 | #endif
94 | 


--------------------------------------------------------------------------------
/include/dabun/x86/xbyak.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include "dabun/isa.hpp"
 9 | #include "dabun/predef.hpp"
10 | 
11 | #ifdef DABUN_ARCH_X86_64
12 | 
13 | #    if DABUN_COMP_GNUC
14 | #        if DABUN_COMP_GNUC >= DABUN_VERSION_NUMBER(11, 0, 0)
15 | #            define DABUN_XBYAK_SUPRESS_WARRAY_BOUNDS
16 | #        endif
17 | #    endif
18 | 
19 | #    if !defined(XBYAK_NO_OP_NAMES)
20 | #        define XBYAK_NO_OP_NAMES
21 | #    endif
22 | 
23 | #    ifdef DABUN_XBYAK_SUPRESS_WARRAY_BOUNDS
24 | #        pragma GCC diagnostic push
25 | #        pragma GCC diagnostic ignored "-Warray-bounds"
26 | #    endif
27 | 
28 | #    include "xbyak/xbyak.h"
29 | #    include "xbyak/xbyak_util.h"
30 | 
31 | #    ifdef DABUN_XBYAK_SUPRESS_WARRAY_BOUNDS
32 | #        pragma GCC diagnostic pop
33 | #        undef DABUN_XBYAK_SUPRESS_WARRAY_BOUNDS
34 | #    endif
35 | 
36 | using xbyak_buffer_type = Xbyak::uint8;
37 | 
38 | #endif
39 | 


--------------------------------------------------------------------------------
/src/loop_nest.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #ifndef DABUN_HEADER_ONLY
 7 | 
 8 | #include "dabun/loop_nest.hpp"
 9 | 
10 | namespace dabun
11 | {
12 | 
13 | #if defined(DABUN_ARCH_AARCH64)
14 | 
15 | namespace arm
16 | {
17 | 
18 | template class loop_nest_code_generator<aarch64, true>;
19 | template class loop_nest_code_generator<aarch64, false>;
20 | 
21 | } // namespace arm
22 | 
23 | #else
24 | 
25 | namespace x86
26 | {
27 | 
28 | template class loop_nest_code_generator<avx2>;
29 | template class loop_nest_code_generator<avx512>;
30 | // template struct dabun::x86::loop_nest_code_generator<dabun::avx2_plus>;
31 | 
32 | } // namespace x86
33 | 
34 | #endif
35 | 
36 | } // namespace dabun
37 | 
38 | #endif
39 | 


--------------------------------------------------------------------------------
/src/peak_gflops.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #ifndef DABUN_HEADER_ONLY
 7 | 
 8 | #    include "dabun/peak_gflops.hpp"
 9 | 
10 | namespace dabun
11 | {
12 | 
13 | #    if defined(DABUN_ARCH_AARCH64)
14 | 
15 | namespace arm
16 | {
17 | 
18 | template struct bench_gflops<aarch64, fp32_t>;
19 | template struct bench_gflops<aarch64, fp16_t>;
20 | 
21 | } // namespace arm
22 | 
23 | #    else
24 | 
25 | namespace x86
26 | {
27 | 
28 | template struct bench_gflops<avx2, float>;
29 | template struct bench_gflops<avx512, float>;
30 | template struct bench_gflops<avx2_plus, float>;
31 | 
32 | } // namespace x86
33 | 
34 | #    endif
35 | 
36 | } // namespace dabun
37 | 
38 | #endif
39 | 


--------------------------------------------------------------------------------
/src/transposer.cpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #ifndef DABUN_HEADER_ONLY
 7 | 
 8 | #    include "dabun/transposer.hpp"
 9 | 
10 | namespace dabun
11 | {
12 | 
13 | #    if defined(DABUN_ARCH_AARCH64)
14 | 
15 | namespace arm
16 | {
17 | 
18 | template class transposer_code_generator<aarch64, fp32_t>;
19 | template class transposer_code_generator<aarch64, fp16_t>;
20 | 
21 | } // namespace arm
22 | 
23 | #    else
24 | 
25 | namespace x86
26 | {
27 | 
28 | template class transposer_code_generator<avx2>;
29 | template class transposer_code_generator<avx2_plus>;
30 | template class transposer_code_generator<avx512>;
31 | 
32 | } // namespace x86
33 | 
34 | #    endif
35 | 
36 | } // namespace dabun
37 | 
38 | #endif
39 | 


--------------------------------------------------------------------------------
/src/x86/multi_vmm.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
2 | //
3 | // This source code is licensed under the MIT license found in the
4 | // LICENSE file in the root directory of this source tree.
5 | 


--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(DABUN_TESTS_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 2 | 
 3 | add_library(dabun_tests_catch2_main
 4 |   catch2_main.cpp)
 5 | 
 6 | target_link_libraries(dabun_tests_catch2_main
 7 |   PUBLIC Catch2::Catch2)
 8 | 
 9 | function(dabun_blanket_test name)
10 |   message(STATUS "dabun_blanket_test ${name}_blanket ${name}.cpp")
11 |   add_executable(${name}_blanket ${name}.cpp)
12 |   target_link_libraries(${name}_blanket
13 |     PUBLIC dabun
14 |     PUBLIC dabun_tests_catch2_main
15 |     PUBLIC -lpthread)
16 | endfunction(dabun_blanket_test)
17 | 
18 | dabun_blanket_test(sentinel)
19 | 
# Adds the executable <name>.<vex>.<float> built from <name>.cpp.
#
#   name   test source file name without the .cpp extension
#   vex    vector extension; exposed to the code as DABUN_VEX=extension::<vex>
#   float  arithmetic type; exposed as DABUN_ARITHMETIC=dabun::<float>
#   isa    instruction set tag; exposed as DABUN_ISA=<isa>
#
# The macros are attached with target_compile_definitions (rather than raw
# -D compile options) so CMake treats them as preprocessor definitions.
function(dabun_test name vex float isa)
  set(test_target ${name}.${vex}.${float})
  message(STATUS "dabun_test ${test_target} ${name}.cpp")
  add_executable(${test_target} ${name}.cpp)
  target_link_libraries(${test_target}
    PUBLIC dabun
    PUBLIC dabun_tests_catch2_main)
  target_compile_definitions(${test_target}
    PRIVATE "DABUN_ISA=${isa}"
    PRIVATE "DABUN_VEX=extension::${vex}"
    PRIVATE "DABUN_ARITHMETIC=dabun::${float}")
endfunction(dabun_test)
31 | 
# Tests that are built for every architecture / vector extension.
function(dabun_common_tests vex float isa)
  dabun_test(handpicked_loop_nest_test
    ${vex} ${float} ${isa})
endfunction()
35 | 
# x86 test set: currently just the common tests.
function(dabun_x86_tests vex float isa)
  dabun_common_tests(
    ${vex} ${float} ${isa})
endfunction()
39 | 
# ARM test set: currently just the common tests.
function(dabun_arm_tests vex float isa)
  dabun_common_tests(
    ${vex} ${float} ${isa})
endfunction()
43 | 
# Select the test executables for the host architecture.
# x86_64: one test set per enabled vector extension (AVX2 / AVX512).
if("${DABUN_HOST_ARCHITECTURE}" STREQUAL "x86_64")
  if (DABUN_BUILD_TESTS_FOR_ALL_ARCH_VEX OR DABUN_BUILD_TESTS_FOR_AVX2)
    dabun_x86_tests(avx2 fp32_t avx2)
  endif()
  if (DABUN_BUILD_TESTS_FOR_ALL_ARCH_VEX OR DABUN_BUILD_TESTS_FOR_AVX512)
    dabun_x86_tests(avx512 fp32_t avx512)
  endif()
# aarch64: NEON fp32 (plus the transpose meta-mnemonics test) and NEON fp16.
# NOTE(review): this branch is gated on DABUN_BUILD_APPS_* options while the
# x86_64 branch uses DABUN_BUILD_TESTS_* - confirm this is intended.
elseif("${DABUN_HOST_ARCHITECTURE}" STREQUAL "aarch64")
  if (DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX OR DABUN_BUILD_APPS_FOR_NEON)
    dabun_arm_tests(neon fp32_t aarch64)
    dabun_test(transpose_meta_mnemonics neon fp32_t aarch64)
  endif()
  if (DABUN_BUILD_APPS_FOR_ALL_SUPPORTED_VEX OR DABUN_BUILD_APPS_FOR_NEON_FP16)
    dabun_arm_tests(neon_fp16 fp16_t aarch64)
  endif()
endif()
60 | 


--------------------------------------------------------------------------------
/tests/baseline/matrix_transpose_baseline.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
 2 | //
 3 | // This source code is licensed under the MIT license found in the
 4 | // LICENSE file in the root directory of this source tree.
 5 | 
 6 | #pragma once
 7 | 
 8 | #include <cstddef>
 9 | 
10 | namespace dabun::tests::baseline
11 | {
12 | 
// Copies a rows x cols 2D array from `in` to `out`, with independent row and
// column strides (in elements) on both sides:
//
//   out[out_row_stride * r + out_col_stride * c] =
//       in[in_row_stride * r + in_col_stride * c]
//
// Index arithmetic is done in std::ptrdiff_t so large strides do not overflow
// int. `in` and `out` must not overlap in a way that makes reads observe
// earlier writes.
template <typename T>
void reorder_array2d(T* out, T const* in, int rows, int cols,
                     int in_row_stride, int in_col_stride, int out_row_stride,
                     int out_col_stride) noexcept
{
    for (int r = 0; r < rows; ++r)
    {
        for (int c = 0; c < cols; ++c)
        {
            auto const src = static_cast<std::ptrdiff_t>(in_row_stride) * r +
                             static_cast<std::ptrdiff_t>(in_col_stride) * c;
            auto const dst = static_cast<std::ptrdiff_t>(out_row_stride) * r +
                             static_cast<std::ptrdiff_t>(out_col_stride) * c;
            out[dst] = in[src];
        }
    }
}

// Overload for callers that give the input row stride (and optionally the
// input column stride and output row stride). The output column stride
// defaults to `rows`; a default argument cannot refer to another parameter,
// hence the separate overload.
template <typename T>
void reorder_array2d(T* out, T const* in, int rows, int cols,
                     int in_row_stride, int in_col_stride = 1,
                     int out_row_stride = 1) noexcept
{
    reorder_array2d(out, in, rows, cols, in_row_stride, in_col_stride,
                    out_row_stride, rows);
}

// Plain transpose: reads a dense row-major rows x cols input (row stride
// `cols`, column stride 1) and writes it with row stride 1 and column stride
// `rows`, i.e. as a dense row-major cols x rows array.
template <typename T>
void reorder_array2d(T* out, T const* in, int rows, int cols) noexcept
{
    reorder_array2d(out, in, rows, cols, cols, 1, 1, rows);
}
27 | 
// Visits two strided rows x cols 2D arrays in lock-step and calls
// fn(a1_element, a2_element) for every (row, column) position, rows in the
// outer loop and columns in the inner loop.
//
// Element (r, c) of a1 is a1[r * a1_row_stride + c * a1_col_stride], and
// likewise for a2 with its own strides (all strides in elements).
template <typename T, class Fn>
void for_all_elements_of_two_array2d(T const* a1, T const* a2, Fn&& fn,
                                     int rows, int cols, int a1_row_stride,
                                     int a1_col_stride, int a2_row_stride,
                                     int a2_col_stride)
{
    for (int r = 0; r < rows; ++r)
    {
        for (int c = 0; c < cols; ++c)
        {
            fn(a1[r * a1_row_stride + c * a1_col_stride],
               a2[r * a2_row_stride + c * a2_col_stride]);
        }
    }
}
44 | 
45 | } // namespace dabun::tests::baseline
46 | 


--------------------------------------------------------------------------------
/tests/catch2_main.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
2 | //
3 | // This source code is licensed under the MIT license found in the
4 | // LICENSE file in the root directory of this source tree.
5 | 
6 | #define CATCH_CONFIG_MAIN
7 | 
8 | #include <catch2/catch.hpp>
9 | 


--------------------------------------------------------------------------------
/tests/sentinel.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
  2 | //
  3 | // This source code is licensed under the MIT license found in the
  4 | // LICENSE file in the root directory of this source tree.
  5 | 
  6 | #include <sysml/numeric.hpp>
  7 | #include <sysml/numerical_error.hpp>
  8 | #include <sysml/type_traits.hpp>
  9 | 
 10 | #include <sysml/thread/cpu_pool.hpp>
 11 | #include <sysml/thread/parallel_for.hpp>
 12 | 
 13 | #include <catch2/catch.hpp>
 14 | #include <iostream>
 15 | 
// Smoke test for a few sysml helpers. It only prints the results; nothing is
// asserted, so it cannot fail other than by crashing.
TEST_CASE("ZZZZ", "[single-file]")
{
    // Two arrays that differ only in the last element (4.f vs 4.1f).
    float a[] = {1.f, 2.f, 3.f, 4.f};
    float b[] = {1.f, 2.f, 3.f, 4.1f};

    // Presumably checks whether the first type occurs among the others.
    std::cout << sysml::is_any_of_v<int, float, double, short, int> << "\n";

    // Same comparison through the iterator-range and the counted interface.
    std::cout << sysml::max_abs_difference(a, a + 4, b) << "\n";
    std::cout << sysml::max_abs_difference_n(a, 4, b) << "\n";
}
 26 | 
// Smoke test for the sysml small-vector and fp16 types. Values are streamed to
// std::cout only; nothing is asserted.
TEST_CASE("WTF", "[single-file]")
{
    // One signed and one unsigned 4-lane 16-bit vector with the same values.
    sysml::int16x4_t  a(1, 2, 3, 4);
    sysml::uint16x4_t b(1, 2, 3, 4);

    std::cout << a << "\n";
    std::cout << b << "\n";

    // Mixed signed/unsigned compound addition ...
    a += b;

    std::cout << a << "\n";

    // ... and subtraction, which undoes the addition above.
    a -= b;

    std::cout << a << "\n";
    std::cout << a * b << "\n";
    std::cout << -a << "\n";

    // std::cout << sysml::int16x4_t::sign_bitmask << "\n";

    // Construct from a double, then overwrite through assignment from float.
    sysml::fp16_t z = static_cast<sysml::fp16_t>(1.3);
    z               = 3.f;

    // z *= 4;

    std::cout << z << "\n";
    std::cout << (z < 12.f) << "\n";

    // unsign
}
 57 | 
 58 | int Factorial(int number)
 59 | {
 60 |     // return number <= 1 ? number : Factorial(number - 1) * number; // fail
 61 |     return number <= 1 ? 1 : Factorial(number - 1) * number; // pass
 62 | }
 63 | 
// Checks the 0! == 1 edge case. Despite the "(fail)" in the test name, this
// passes with the current Factorial, which returns 1 for every input <= 1.
TEST_CASE("Factorial of 0 is 1 (fail)", "[single-file]")
{
    REQUIRE(Factorial(0) == 1);
}
 68 | 
// Checks Factorial against known values for a few inputs >= 1.
TEST_CASE("Factorials of 1 and higher are computed (pass)", "[single-file]")
{
    REQUIRE(Factorial(1) == 1);
    REQUIRE(Factorial(2) == 2);
    REQUIRE(Factorial(3) == 6);
    REQUIRE(Factorial(10) == 3628800);
}
 76 | 
// Exercises the sysml thread-pool helpers (naive_parallel_for,
// single_queue_parallel_for and execute_on_all_cpus) on a pool created with
// the argument 10.
//
// NOTE: the test is currently disabled - the unconditional `return;` on the
// first line of the body makes everything below it unreachable.
TEST_CASE("Random threaded test", "[single-file]")
{
    return;
    // sysml::thread::cpu_pool oset({0, 1, 5, 12, 18});
    sysml::thread::cpu_pool oset(10);
    //  int                              i;
    //  std::cin >> i;
    //  std::cout << "Was sleeping? "
    //            << (oset.set_sleeping_mode(true) ? " Yes" : "No") <<
    //            std::endl;

    // std::cin >> i;

    // for (int i = 0; i < 10000000; ++i)
    // {
    //     int x = rand() % 2;
    //     // std::cout << "Requesting: "
    //     //           << (oset.set_sleeping_mode(x) ? " Yes" : "No") << ' ';
    //     // std::cout << "Was sleeping? "
    //     //           << (oset.set_sleeping_mode(x) ? " Yes" : "No") <<
    //     //           std::endl;
    //     oset.set_sleeping_mode(x);
    // }

    int const len = 10000;

    // Every index in [0, len) must be visited exactly once: each visit writes
    // a 1, so the sum of the vector has to equal len.
    {
        std::vector<int> all_zeros(len);

        sysml::thread::naive_parallel_for(oset, 0, len, 1,
                                          [&](int idx) { all_zeros[idx] = 1; });

        REQUIRE(std::accumulate(all_zeros.begin(), all_zeros.end(), 0) == len);
    }

    // Same check for the single-queue variant, whose callback also receives a
    // context object; its cpu_index is printed for every visited index.
    {
        std::vector<int> all_zeros(len);

        sysml::thread::single_queue_parallel_for(oset, 0, len, 1,
                                                 [&](auto const& c, int idx)
                                                 {
                                                     std::cout << c.cpu_index
                                                               << "\n";
                                                     all_zeros[idx] = 1;
                                                 });

        REQUIRE(std::accumulate(all_zeros.begin(), all_zeros.end(), 0) == len);
    }

    std::atomic<int> zi{0};

    // Each worker keeps incrementing the shared counter until the value it
    // observed reaches len; the final, failing fetch_add of every worker still
    // increments, hence the expected total of len + oset.size().
    oset.execute_on_all_cpus(
        [&](auto)
        {
            while (zi.fetch_add(1) < len)
            {
            }
        });

    REQUIRE(zi.load() == len + oset.size());

    // std::cout
    //     << std::alignment_of_v<
    //            sysml::detail::primitive_aligned_wrapper<int, 1024>> <<
    //            "\n\n";

    // std::cout << sizeof(dabun::detail::primitive_aligned_wrapper<int, 1024>)
    //           << "\n\n";

    // int i;
    // std::cin >> i;
    {
        // using dabun::vek;

        // vek<int, 1> b1 = std::array<int, 1>{1};
        // vek<int, 2> b2{{1, 1}};

        // auto begi = concat(b1, b2);

        // auto begin = begi + 1;

        // // dabun::vek<int, 3> begin{1, 1, 1};
        // dabun::vek<int, 3> end{{3, 4, 5}};
        // end += 1;

        // std::cout << (b2 == b2) << " " << (b2 != b2) << "\n";

        // dabun::coord_for_loop(begin, end,
        //                       [](auto const& v)
        //                       {
        //                           std::cout << v << "\n";
        //                       });

        // std::cout << "\n";
        // std::cout << "\n";
        // std::cout << "\n";
        // std::cout << "\n";

        // // dabun::coord_for_loop(end,
        // //                       [](auto const& v)
        // //                       {
        // //                           std::cout << v[0];
        // //                           for (int i = 1; i < v.size(); ++i)
        // //                           {
        // //                               std::cout << ", " << v[i];
        // //                           }
        // //                           std::cout << "\n";
        // //                       });

        // std::cout << -(end + 3) << "\n";

        // std::cout << dabun::to_string(-end + 3, ',') << "\n";

        // {
        //     auto r = -end + 3;
        //     // auto z = dabun::head<2>(r);
        //     std::cout << dabun::head<2>(r) << "\n";
        //     // std::cout << z << "\n";

        // }

        // Reports the hardware thread count seen by the standard library.
        std::cout << "HWC: " << std::thread::hardware_concurrency() << "\n";
    }
}
201 | 


--------------------------------------------------------------------------------
/utils.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <chrono>
  4 | #include <cstdlib>
  5 | #include <iostream>
  6 | #include <random>
  7 | #include <vector>
  8 | 
  9 | #include "AlignedVec.h"
 10 | 
 11 | #define LOOP_NEST_STRINGIFY_0(s) #s
 12 | #define LOOP_NEST_STRINGIFY(s) LOOP_NEST_STRINGIFY_0(s)
 13 | 
 14 | #define strong_assert(condition)                                               \
 15 |     if (!(condition))                                                          \
 16 |     {                                                                          \
 17 |         throw std::runtime_error(LOOP_NEST_STRINGIFY(                          \
 18 |             condition) " failed file: " __FILE__                               \
 19 |                        " line: " LOOP_NEST_STRINGIFY((__LINE__)));             \
 20 |     }                                                                          \
 21 |     static_cast<void>(0)
 22 | 
 23 | // FROM: https://en.cppreference.com/w/cpp/utility/variant/visit
 24 | 
 25 | // helper type for the visitor #4
 26 | template <class... Ts>
 27 | struct overloaded : Ts...
 28 | {
 29 |     using Ts::operator()...;
 30 | };
 31 | // explicit deduction guide (not needed as of C++20)
 32 | template <class... Ts>
 33 | overloaded(Ts...) -> overloaded<Ts...>;
 34 | 
 35 | template <class Float>
 36 | void apply_relu(Float* Begin, Float* End)
 37 | {
 38 |     for (; Begin != End; ++Begin)
 39 |     {
 40 |         if constexpr (std::is_same_v<Float, fp16>)
 41 |         {
 42 |             *Begin = static_cast<fp16>(
 43 |                 std::max(static_cast<float>(0), static_cast<float>(*Begin)));
 44 |         }
 45 |         else
 46 |         {
 47 |             *Begin = std::max(static_cast<Float>(0), *Begin);
 48 |         }
 49 |     }
 50 | }
 51 | 
 52 | template <class Float>
 53 | Float max_abs_difference(Float const* LBegin, Float const* LEnd,
 54 |                          Float const* RBegin)
 55 | {
 56 |     Float res = 0;
 57 |     for (; LBegin != LEnd; ++LBegin, ++RBegin)
 58 |     {
 59 |         res = std::max(res, std::abs(*LBegin - *RBegin));
 60 |     }
 61 |     return res;
 62 | }
 63 | 
 64 | template <class Float>
 65 | Float max_abs_differenceVerbose(Float const* LBegin, Float const* LEnd,
 66 |                                 Float const* RBegin)
 67 | {
 68 |     int   off = 0;
 69 |     Float res = 0;
 70 |     for (; LBegin != LEnd; ++LBegin, ++RBegin)
 71 |     {
 72 |         std::cout << off++ << " : " << (*LBegin) << " " << (*RBegin) << " "
 73 |                   << std::abs(*LBegin - *RBegin) << "\n";
 74 |         res = std::max(res, std::abs(*LBegin - *RBegin));
 75 |     }
 76 |     return res;
 77 | }
 78 | 
 79 | template <class Float>
 80 | Float max_abs_differenceVerbose(Float const* LBegin, Float const* LEnd,
 81 |                                 Float const* RBegin, float delta)
 82 | {
 83 |     int   off = 0;
 84 |     Float res = 0;
 85 |     for (; LBegin != LEnd; ++LBegin, ++RBegin)
 86 |     {
 87 |         if (std::abs(*LBegin - *RBegin) > delta)
 88 |         {
 89 |             std::cout << off << " : " << (*LBegin) << " " << (*RBegin) << " "
 90 |                       << std::abs(*LBegin - *RBegin) << "\n";
 91 |         }
 92 |         res = std::max(res, std::abs(*LBegin - *RBegin));
 93 |         off++;
 94 |     }
 95 |     return res;
 96 | }
 97 | 
 98 | template <class Float>
 99 | aligned_vector<Float> get_random_vector(unsigned size,
100 |                                         unsigned extra_elements = 16)
101 | {
102 |     aligned_vector<Float> res(size + extra_elements);
103 | 
104 |     std::random_device rd;
105 |     std::mt19937       gen(0); // rd());
106 | 
107 |     std::uniform_real_distribution<double> dis(-1.0, 1.0);
108 | 
109 |     for (auto& f : res)
110 |     {
111 |         f = dis(gen);
112 |     }
113 | 
114 |     return res;
115 | }
116 | 
117 | template <class Float>
118 | aligned_vector<Float> getZeroVector(unsigned size, unsigned extra_elements = 16)
119 | {
120 |     aligned_vector<Float> res(size + extra_elements);
121 |     return res;
122 | }
123 | 
// Runs fn() `warmupIterations` times untimed, then times individual calls of
// fn() and returns the fastest one in seconds.
//
// At least one timed call is always made, even if measuredIterations <= 0.
// Timing uses std::chrono::steady_clock, which is monotonic, so a measured
// duration can never be negative or distorted by wall-clock adjustments.
template <class Fn>
double measureFastestWithWarmup(Fn&& fn, int warmupIterations,
                                int measuredIterations = 1)
{
    using clock = std::chrono::steady_clock;

    for (int i = 0; i < warmupIterations; ++i)
    {
        fn();
    }

    // Times a single call of fn() and returns the duration in nanoseconds.
    auto const time_once = [&fn]()
    {
        auto const start = clock::now();
        fn();
        auto const end = clock::now();
        return std::chrono::duration_cast<std::chrono::nanoseconds>(end - start)
            .count();
    };

    auto nsecs = time_once();

    for (int i = 1; i < measuredIterations; ++i)
    {
        nsecs = std::min(nsecs, time_once());
    }

    return static_cast<double>(nsecs) / 1e9;
}
156 | 
// Reads the x86 time-stamp counter with the `rdtsc` instruction and returns
// the 64-bit value (EDX:EAX).
//
// Returns 0 when LOOP_NEST_ARM is defined or when the target is not x86, so
// the inline assembly is only ever emitted where the instruction exists.
inline std::uint64_t rdtsc()
{
#if !defined(LOOP_NEST_ARM) && (defined(__x86_64__) || defined(__i386__))
    unsigned hi;
    unsigned lo;
    __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
    return static_cast<std::uint64_t>(lo) |
           (static_cast<std::uint64_t>(hi) << 32);
#else
    return 0;
#endif
}
167 | 
168 | template <class Fn>
169 | double measureMinCyclesWithWarmup(Fn&& fn, int warmupIterations,
170 |                                   int measuredIterations = 1)
171 | {
172 |     for (int i = 0; i < warmupIterations; ++i)
173 |     {
174 |         fn();
175 |     }
176 | 
177 |     auto start = rdtsc();
178 |     fn();
179 |     auto end = rdtsc();
180 |     auto cyc = end - start;
181 | 
182 |     for (int i = 1; i < measuredIterations; ++i)
183 |     {
184 |         start = rdtsc();
185 |         fn();
186 |         end = rdtsc();
187 | 
188 |         auto new_time = end - start;
189 |         // LN_LOG(INFO) << "T: " << new_time << "\n";
190 |         cyc = std::min(cyc, new_time);
191 |     }
192 | 
193 |     return static_cast<double>(cyc);
194 | }
195 | 
196 | template <class BaseLineImpl, class JITImpl>
197 | void check_correctness(BaseLineImpl&& baseline_fn, JITImpl&& jit_fn, int A_size,
198 |                        int B_size, int C_size, int alpha = 0)
199 | {
200 |     auto A = get_random_vector<float>(A_size);
201 |     auto B = get_random_vector<float>(B_size);
202 | 
203 |     auto CN = aligned_vector<float>(C_size);
204 |     auto CJ = std::vector<float>(C_size);
205 | 
206 |     baseline_fn(CN.data(), A.data(), B.data());
207 |     jit_fn(CJ.data(), A.data(), B.data(), alpha);
208 | 
209 |     std::cout << "MAXABSDIFF: "
210 |               << max_abs_difference(CJ.data(), CJ.data() + C_size, CN.data())
211 |               << "\n";
212 | }
213 | 
214 | template <class Fn>
215 | void bench_implementation(Fn&& fn, int A_size, int B_size, int C_size,
216 |                           double gflops, int warmup = 5, int iters = 10)
217 | {
218 |     auto A = get_random_vector<float>(A_size);
219 |     auto B = get_random_vector<float>(B_size);
220 |     auto C = std::vector<float>(C_size);
221 | 
222 |     auto secs = measureFastestWithWarmup(
223 |         [&]() { fn(C.data(), A.data(), B.data(), 0); }, warmup, iters);
224 | 
225 |     std::cout << "GFLOPS: " << (gflops / secs) << "\n";
226 | }
227 | 
228 | template <class Fn>
229 | void bench_implementation_fmas_per_cycle(Fn&& fn, int A_size, int B_size,
230 |                                          int C_size, double flops,
231 |                                          int warmup = 5, int iters = 10)
232 | {
233 |     auto A = get_random_vector<float>(A_size);
234 |     auto B = get_random_vector<float>(B_size);
235 |     auto C = std::vector<float>(C_size);
236 | 
237 |     auto secs = measureMinCyclesWithWarmup(
238 |         [&]() { fn(C.data(), A.data(), B.data(), 0); }, warmup, iters);
239 | 
240 |     std::cout << "FLOPS per CYCLE: " << (flops / secs) << "\n";
241 | }
242 | 


--------------------------------------------------------------------------------