├── doc
    ├── vamo-format.adoc
    ├── Makefile
    ├── vtype-format.adoc
    ├── valu-format.adoc
    └── vmem-format.adoc
├── README.md
├── vstdlib
    ├── CMakeLists.txt
    ├── vstdlib.hpp
    ├── memset.cpp
    ├── memmove.cpp
    ├── memcpy.cpp
    └── memcpy_backward.cpp
├── tests
    ├── CMakeLists.txt
    ├── stdlib.cpp
    ├── unit_tests.cpp
    └── vector_examples.cpp
├── CMakeLists.txt
├── include
    └── riscv
    │   └── ext
    │       └── v.hpp
└── src
    └── riscv32
        └── v.cpp


/doc/vamo-format.adoc:
--------------------------------------------------------------------------------
1 | ----
2 | Format for Vector AMO Instructions under AMO major opcode
3 | 31    27 26  25  24      20 19       15 14   12 11      7 6     0
4 |  amoop  |wd| vm |   vs2    |    rs1    | width | vs3/vd  |0101111| VAMO*
5 |    5      1   1      5           5         3        5        7
6 | ----
7 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # RISC-V V extension simulator
 2 | 
 3 | RISC-V vector extension v0.7 (base) simulator implemented in C++.
 4 | 
 5 | # Requirements
 6 | 
 7 | * CMake >= 3.6
 8 | * Boost >= 1.66
 9 | 
10 | # Building
11 | 
12 | ```
13 | mkdir build
14 | cd build
15 | cmake ..
16 | cmake --build .
17 | 
18 | tests/vector_examples
19 | tests/stdlib
20 | ```
21 | 


--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
 1 | # Renders every AsciiDoc file in this directory to PDF and to DocBook XML.
 2 | .PHONY: all docbook clean
 3 | 
 4 | adoc_src := $(wildcard ./*.adoc)
 5 | pdfs := $(patsubst %.adoc,%.pdf,$(adoc_src))
 6 | xmls := $(patsubst %.adoc,%.xml,$(adoc_src))
 7 | 
 8 | # Keep `all` as the first rule so that a bare `make` builds everything;
 9 | # otherwise the first PDF of the pattern rule below becomes the default goal.
10 | all: $(pdfs) docbook
11 | 
12 | # DocBook XML for every .adoc source.
13 | docbook: $(xmls)
14 | 
15 | # One PDF per .adoc source.
16 | $(pdfs): %.pdf: %.adoc
17 | 	asciidoctor-pdf -o $@ $<
18 | 
19 | $(xmls): %.xml: %.adoc
20 | 	asciidoctor -v -b docbook -o $@ $<
21 | 
22 | # Remove all generated documents.
23 | clean:
24 | 	$(RM) $(pdfs) $(xmls)
25 | 

--------------------------------------------------------------------------------
/doc/vtype-format.adoc:
--------------------------------------------------------------------------------
 1 | 
 2 | .`vtype` register layout
 3 | [cols="2,4,10"]
 4 | |===
 5 | |     Bits | Name       | Description
 6 | 
 7 | |   XLEN-1 | vill       | Illegal value if set
 8 | | XLEN-2:7 |            | Reserved (write 0)
 9 | |      6:5 | vediv[1:0] | Used by EDIV extension
10 | |      4:2 | vsew[2:0]  | Standard element width (SEW) setting
11 | |      1:0 | vlmul[1:0] | Vector register group multiplier (LMUL) setting
12 | |===
13 | 


--------------------------------------------------------------------------------
/vstdlib/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Static library with the vector memset/memcpy/memmove routines built on riscv32.
 2 | # The minimum matches the top-level project; CMake 4 rejects minimums older than 3.5.
 3 | cmake_minimum_required(VERSION 3.6)
 4 | 
 5 | project(vstdlib)
 6 | 
 7 | add_library(vstdlib STATIC
 8 |     vstdlib.hpp
 9 |     memcpy.cpp
10 |     memset.cpp
11 |     memmove.cpp
12 |     memcpy_backward.cpp
13 | )
14 | set_target_properties(vstdlib PROPERTIES FOLDER "Libs")
15 | # Sources include "vstdlib/vstdlib.hpp", so the parent directory is the include root.
16 | target_include_directories(vstdlib PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/..")
17 | target_link_libraries(vstdlib PUBLIC riscv32)
18 | 
19 | # CXX_FEATURES is defined by the top-level CMakeLists.txt.
20 | target_compile_features(vstdlib PUBLIC ${CXX_FEATURES})
21 | 


--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Builds one Boost.Test executable per entry in `tests` and registers it with CTest.
 2 | # The minimum matches the top-level project; CMake 4 rejects minimums older than 3.5.
 3 | cmake_minimum_required(VERSION 3.6)
 4 | 
 5 | project(Vendetta_tests)
 6 | 
 7 | set(tests
 8 |     unit_tests
 9 |     stdlib
10 |     vector_examples
11 | )
12 | 
13 | foreach(name ${tests})
14 |     add_executable(${name} "${CMAKE_CURRENT_SOURCE_DIR}/${name}.cpp")
15 |     set_target_properties(${name} PROPERTIES FOLDER "tests")
16 |     # The NAME/COMMAND form resolves the target to its built executable for every
17 |     # generator and configuration; the legacy form does not support target names.
18 |     add_test(NAME test_${name} COMMAND ${name})
19 | endforeach()
20 | 
21 | target_link_libraries(unit_tests PRIVATE riscv32)
22 | target_link_libraries(stdlib PRIVATE riscv32 vstdlib)
23 | target_link_libraries(vector_examples PRIVATE riscv32)
24 | 


--------------------------------------------------------------------------------
/vstdlib/vstdlib.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |     @file vstdlib.hpp
 3 |     @copyright ©2019 Syntacore.
 4 |     @authors
 5 |         Grigory Okhotnikov <go@syntacore.com>
 6 |     @brief Vector extension simulator (v0.7) example
 7 | */
 8 | 
 9 | #ifndef RISCV_VEXAMPLES_HPP_
10 | #define RISCV_VEXAMPLES_HPP_
11 | 
12 | #include "riscv/ext/v.hpp"
13 | 
14 | #include <type_traits>
15 | #include <iterator>
16 | 
17 | namespace rvv {
18 | // Vector-unit counterparts of the <cstring> routines; every function returns dest.
19 | void *memset(void *const dest, int c, size_t count);
20 | void *memcpy(void *const dest, void const *src, size_t count);
21 | void *memmove(void *const dest, void const *src, size_t count);
22 | void *memcpy_backward(void *const dest, void const *src, size_t count);  // copies the last byte first
23 | 
24 | }  // namespace rvv
25 | 
26 | #endif  // RISCV_VEXAMPLES_HPP_
27 | 


--------------------------------------------------------------------------------
/doc/valu-format.adoc:
--------------------------------------------------------------------------------
 1 | ----
 2 | Formats for Vector Arithmetic Instructions under OP-V major opcode
 3 | 
 4 | 31       26  25   24      20 19      15 14   12 11      7 6     0
 5 |   funct6   | vm  |   vs2    |    vs1   | 0 0 0 |    vd   |1010111| OP-V (OPIVV)
 6 |   funct6   | vm  |   vs2    |    vs1   | 0 0 1 |    vd   |1010111| OP-V (OPFVV)
 7 |   funct6   | vm  |   vs2    |    vs1   | 0 1 0 |  vd/rd  |1010111| OP-V (OPMVV)
 8 |   funct6   | vm  |   vs2    |   simm5  | 0 1 1 |    vd   |1010111| OP-V (OPIVI)
 9 |   funct6   | vm  |   vs2    |    rs1   | 1 0 0 |    vd   |1010111| OP-V (OPIVX)
10 |   funct6   | vm  |   vs2    |    rs1   | 1 0 1 |    vd   |1010111| OP-V (OPFVF)
11 |   funct6   | vm  |   vs2    |    rs1   | 1 1 0 |  vd/rd  |1010111| OP-V (OPMVX)
12 |      6        1        5          5        3        5        7
13 | ----
14 | 


--------------------------------------------------------------------------------
/vstdlib/memset.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |     @file memset.cpp
 3 |     @copyright ©2019 Syntacore.
 4 |     @authors
 5 |         Grigory Okhotnikov <go@syntacore.com>
 6 |     @brief Vector extension simulator (v0.7) example
 7 | */
 8 | 
 9 | #include "vstdlib/vstdlib.hpp"
10 | 
11 | #include <cassert>
12 | 
13 | namespace rvv {
14 | 
15 | using namespace ::riscv::v;
16 | 
17 | /**
18 |     Fill @a count bytes starting at @a dest with the value @a c.
19 | 
20 |     The value is splatted into v0 once; the loop then stores as many bytes per
21 |     pass as vsetvli grants for the (e8, m8) configuration.
22 | 
23 |     @return @a dest
24 | */
25 | void*
26 | memset(void *const dest, int c, size_t count)
27 | {
28 |     if (0 == count) {
29 |         return dest;
30 |     }
31 | 
32 |     // vl is configured before the splat; presumably so that v0 covers every later store.
33 |     vsetvli(count, vtypei(e8, m8));
34 |     vmv_v_x(v0, c);
35 | 
36 |     int8_t *out = static_cast<int8_t *>(dest);
37 |     do {
38 |         size_t const chunk = vsetvli(count, vtypei(e8, m8));
39 |         vsb_v(v0, out);
40 |         out += chunk;
41 |         count -= chunk;
42 |     } while (count);
43 | 
44 |     return dest;
45 | }
46 | 
47 | }  // namespace rvv
48 | 


--------------------------------------------------------------------------------
/vstdlib/memmove.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |     @file memmove.cpp
 3 |     @copyright ©2019 Syntacore.
 4 |     @authors
 5 |         Grigory Okhotnikov <go@syntacore.com>
 6 |     @brief Vector extension simulator (v0.7) example
 7 | */
 8 | 
 9 | #include "vstdlib/vstdlib.hpp"
10 | 
11 | #include <cassert>
12 | 
13 | namespace rvv {
14 | 
15 | using namespace ::riscv::v;
16 | 
17 | /**
18 |     Copy @a count bytes from @a src to @a dest; the ranges may overlap.
19 | 
20 |     Dispatches to memcpy() when the destination starts below the source or past
21 |     its end, and to memcpy_backward() when it starts inside the source range.
22 | 
23 |     @return @a dest
24 | */
25 | void*
26 | memmove(void *const dest, void const *src, size_t count)
27 | {
28 |     uint8_t const *ps = static_cast<uint8_t const*>(src);
29 |     uint8_t *pd = static_cast<uint8_t *>(dest);
30 | 
31 |     // Nothing to do for an empty range or a copy onto itself.
32 |     if (0 == count || pd == ps) {
33 |         return dest;
34 |     }
35 | 
36 |     if (pd < ps || ps + count <= pd) {
37 |         return memcpy(dest, src, count);
38 |     }
39 | 
40 |     memcpy_backward(pd, ps, count);
41 |     return dest;
42 | }
43 | }  // namespace rvv
44 | 


--------------------------------------------------------------------------------
/vstdlib/memcpy.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |     @file memcpy.cpp
 3 |     @copyright ©2019 Syntacore.
 4 |     @authors
 5 |         Grigory Okhotnikov <go@syntacore.com>
 6 |     @brief Vector extension simulator (v0.7) example
 7 | */
 8 | 
 9 | #include "vstdlib/vstdlib.hpp"
10 | 
11 | #include <cassert>
12 | 
13 | namespace rvv {
14 | 
15 | using namespace ::riscv::v;
16 | 
17 | /**
18 |     Copy @a count bytes from @a src to @a dest in ascending address order.
19 | 
20 |     Each pass moves as many bytes as vsetvli grants for the (e8, m8)
21 |     configuration, staging them in v0.
22 | 
23 |     @return @a dest
24 | */
25 | void*
26 | memcpy(void * const dest, void const *src, size_t count)
27 | {
28 |     if (0 == count) {
29 |         return dest;
30 |     }
31 | 
32 |     int8_t const *from = static_cast<int8_t const*>(src);
33 |     int8_t *to = static_cast<int8_t *>(dest);
34 |     do {
35 |         size_t const chunk = vsetvli(count, vtypei(e8, m8));
36 |         vlb_v(v0, from);
37 |         vsb_v(v0, to);
38 |         from += chunk;
39 |         to += chunk;
40 |         count -= chunk;
41 |     } while (count);
42 | 
43 |     return dest;
44 | }
45 | }  // namespace rvv
46 | 


--------------------------------------------------------------------------------
/doc/vmem-format.adoc:
--------------------------------------------------------------------------------
 1 | ----
 2 | Format for Vector Load Instructions under LOAD-FP major opcode
 3 | 31 29 28 26  25  24      20 19       15 14   12 11      7 6     0
 4 |  nf  | mop | vm |  lumop   |    rs1    | width |    vd   |0000111| VL*  unit-stride
 5 |  nf  | mop | vm |   rs2    |    rs1    | width |    vd   |0000111| VLS* strided
 6 |  nf  | mop | vm |   vs2    |    rs1    | width |    vd   |0000111| VLX* indexed
 7 |   3     3     1      5           5         3         5       7
 8 | 
 9 | Format for Vector Store Instructions under STORE-FP major opcode
10 | 31 29 28 26  25  24      20 19       15 14   12 11      7 6     0
11 |  nf  | mop | vm |  sumop   |    rs1    | width |   vs3   |0100111| VS*  unit-stride
12 |  nf  | mop | vm |   rs2    |    rs1    | width |   vs3   |0100111| VSS* strided
13 |  nf  | mop | vm |   vs2    |    rs1    | width |   vs3   |0100111| VSX* indexed
14 |   3     3     1      5           5         3         5        7
15 | ----
16 | 


--------------------------------------------------------------------------------
/vstdlib/memcpy_backward.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |     @file memcpy_backward.cpp
 3 |     @copyright ©2019 Syntacore.
 4 |     @authors
 5 |         Grigory Okhotnikov <go@syntacore.com>
 6 |     @brief Vector extension simulator (v0.7) example
 7 | */
 8 | 
 9 | #include "vstdlib/vstdlib.hpp"
10 | 
11 | #include <cassert>
12 | 
13 | namespace rvv {
14 | 
15 | using namespace ::riscv::v;
16 | 
17 | /**
18 |     Copy @a count bytes from @a src to @a dest, starting at the last byte and
19 |     walking towards the first one.
20 | 
21 |     Both the strided load and the strided store are issued with a stride of -1.
22 | 
23 |     @return @a dest
24 | */
25 | void*
26 | memcpy_backward(void *const dest, void const *src, size_t count)
27 | {
28 |     if (0 == count) {
29 |         return dest;
30 |     }
31 | 
32 |     static ptrdiff_t const stride = -1;
33 |     size_t const last = count - 1;
34 |     int8_t const *from = static_cast<int8_t const*>(src) + last;
35 |     int8_t *to = static_cast<int8_t *>(dest) + last;
36 |     do {
37 |         size_t const chunk = vsetvli(count, vtypei(e8, m8));
38 |         vlsb_v(v0, from, stride);
39 |         vssb_v(v0, to, stride);
40 |         from -= chunk;
41 |         to -= chunk;
42 |         count -= chunk;
43 |     } while (count);
44 | 
45 |     return dest;
46 | }
47 | }  // namespace rvv
48 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.6)
 2 | 
 3 | project(rvv-simulator)
 4 | 
 5 | set(Boost_USE_MULTITHREADED ON)
 6 | find_package(Boost 1.66 REQUIRED)  # riscv32 links Boost::boost unconditionally
 7 | 
 8 | enable_testing()
 9 | 
10 | set_property(GLOBAL PROPERTY USE_FOLDERS ON)
11 | 
12 | if (MSVC)
13 |     add_definitions(
14 |         -D_SCL_SECURE_NO_WARNINGS
15 |         -D_USE_MATH_DEFINES
16 |         -DNOMINMAX
17 |         )
18 |     add_compile_options(/bigobj)
19 | endif()
20 | 
21 | # User-tunable vector unit parameters; passed to the sources as RVV_* macros below.
22 | set(RVV_ELEN 64 CACHE STRING "The maximum size of a single vector element in bits")
23 | set(RVV_VLEN 256 CACHE STRING "The number of bits in a vector register") # VLEN ≥ ELEN
24 | set(RVV_SLEN 64 CACHE STRING "The striping distance in bits") # VLEN ≥ SLEN ≥ 32
25 | 
26 | # Individual language features are listed because cxx_std_14 needs CMake 3.8.
27 | set(CXX_FEATURES
28 |     #cxx_aggregate_default_initializers
29 |     cxx_alias_templates
30 |     cxx_alignas
31 |     cxx_alignof
32 |     cxx_attributes
33 |     cxx_attribute_deprecated
34 |     cxx_auto_type
35 |     cxx_binary_literals
36 |     cxx_constexpr
37 |     cxx_contextual_conversions
38 |     #cxx_decltype_incomplete_return_types
39 |     cxx_decltype
40 |     cxx_decltype_auto
41 |     cxx_default_function_template_args
42 |     cxx_defaulted_functions
43 |     cxx_defaulted_move_initializers
44 |     cxx_delegating_constructors
45 |     cxx_deleted_functions
46 |     cxx_digit_separators
47 |     cxx_enum_forward_declarations
48 |     cxx_explicit_conversions
49 |     cxx_extended_friend_declarations
50 |     cxx_extern_templates
51 |     cxx_final
52 |     cxx_func_identifier
53 |     cxx_generalized_initializers
54 |     cxx_generic_lambdas
55 |     cxx_inheriting_constructors
56 |     cxx_inline_namespaces
57 |     cxx_lambdas
58 |     cxx_lambda_init_captures
59 |     cxx_local_type_template_args
60 |     cxx_long_long_type
61 |     cxx_noexcept
62 |     cxx_nonstatic_member_init
63 |     cxx_nullptr
64 |     cxx_override
65 |     cxx_range_for
66 |     cxx_raw_string_literals
67 |     cxx_reference_qualified_functions
68 |     #cxx_relaxed_constexpr
69 |     cxx_return_type_deduction
70 |     cxx_right_angle_brackets
71 |     cxx_rvalue_references
72 |     cxx_sizeof_member
73 |     cxx_static_assert
74 |     cxx_strong_enums
75 |     cxx_thread_local
76 |     cxx_trailing_return_types
77 |     cxx_unicode_literals
78 |     cxx_uniform_initialization
79 |     cxx_unrestricted_unions
80 |     cxx_user_literals
81 |     cxx_variable_templates
82 |     cxx_variadic_macros
83 |     cxx_variadic_templates
84 |     cxx_template_template_parameters
85 | )
86 | 
87 | add_library(riscv32 STATIC
88 |     src/riscv32/v.cpp
89 |     include/riscv/ext/v.hpp
90 | )
91 | target_compile_definitions(riscv32 PUBLIC RVV_ELEN=${RVV_ELEN} RVV_VLEN=${RVV_VLEN} RVV_SLEN=${RVV_SLEN})
92 | set_target_properties(riscv32 PROPERTIES FOLDER "Libs")
93 | target_include_directories(riscv32 PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
94 | target_link_libraries(riscv32 PUBLIC Boost::boost)
95 | target_compile_features(riscv32 PUBLIC ${CXX_FEATURES})
96 | 
97 | add_subdirectory(vstdlib)
98 | add_subdirectory(tests)
99 | 


--------------------------------------------------------------------------------
/tests/stdlib.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |     @file stdlib.cpp
  3 |     @copyright ©2019 Syntacore.
  4 |     @authors
  5 |         Grigory Okhotnikov <go@syntacore.com>
  6 |     @brief Vector extension simulator (v0.7) standard library tests
  7 | */
  8 | 
  9 | #define BOOST_TEST_MODULE stdlib
 10 | 
 11 | #include <boost/test/included/unit_test.hpp>
 12 | #include <boost/test/data/test_case.hpp>
 13 | #include <boost/test/data/monomorphic.hpp>
 14 | 
 15 | #include "vstdlib/vstdlib.hpp"
 16 | 
 17 | #include <algorithm>
 18 | #include <cstring>
 19 | #include <functional>
 20 | #include <random>
 21 | #include <vector>
 22 | namespace bdata = boost::unit_test::data;
 23 | 
 24 | namespace {
 25 |     std::default_random_engine generator;  // default-constructed engine shared by the random-buffer test cases below
 26 | }  // namespace
 27 | 
 28 | // Copies a 1024-byte random buffer with rvv::memcpy and expects an identical result.
 29 | BOOST_AUTO_TEST_CASE(test_memcpy)
 30 | {
 31 |     typedef std::vector<char> buf_type;
 32 |     static std::uniform_int_distribution<int> distribution(0, 255);
 33 |     static auto const next_value = []() {return distribution(generator); };
 34 | 
 35 |     buf_type in_buf;
 36 |     std::generate_n(std::back_inserter(in_buf), 1024, next_value);
 37 |     buf_type out_buf(in_buf.size());
 38 |     rvv::memcpy(out_buf.data(), in_buf.data(), in_buf.size() * sizeof(buf_type::value_type));
 39 |     BOOST_TEST(in_buf == out_buf);
 40 | }
 41 | 
 42 | // Copies a 1024-byte random buffer with rvv::memcpy_backward and expects an identical result.
 43 | BOOST_AUTO_TEST_CASE(test_memcpy_backward)
 44 | {
 45 |     typedef std::vector<char> buf_type;
 46 |     static std::uniform_int_distribution<int> distribution(0, 255);
 47 |     static auto const next_value = []() {return distribution(generator); };
 48 | 
 49 |     buf_type in_buf;
 50 |     std::generate_n(std::back_inserter(in_buf), 1024, next_value);
 51 |     buf_type out_buf(in_buf.size());
 52 | 
 53 |     // Source and destination are separate buffers here; only the copy routine differs from test_memcpy.
 54 |     rvv::memcpy_backward(out_buf.data(), in_buf.data(), in_buf.size() * sizeof(buf_type::value_type));
 55 | 
 56 |     BOOST_TEST(in_buf == out_buf);
 57 | }
 58 | 
 59 | // Overlapping move towards higher addresses: the destination starts inside the
 60 | // source range. The result must match std::memmove on an identical buffer.
 61 | BOOST_AUTO_TEST_CASE(test_memmove_forward)
 62 | {
 63 |     typedef std::vector<char> buf_type;
 64 |     static size_t const buf_size = 1024;
 65 |     static std::uniform_int_distribution<int> distribution(0, 255);
 66 |     static auto const next_value = []() {return distribution(generator); };
 67 | 
 68 |     buf_type ref_buf;
 69 |     std::generate_n(std::back_inserter(ref_buf), buf_size, next_value);
 70 |     buf_type tst_buf(ref_buf);
 71 | 
 72 |     size_t const shift = buf_size / 4;
 73 |     std::memmove(&ref_buf[shift], &ref_buf[0], buf_size / 2);
 74 |     rvv::memmove(&tst_buf[shift], &tst_buf[0], buf_size / 2);
 75 |     BOOST_TEST(ref_buf == tst_buf);
 76 | }
 77 | 
 78 | // Overlapping move towards lower addresses: the source starts inside the
 79 | // destination range. The result must match std::memmove on an identical buffer.
 80 | BOOST_AUTO_TEST_CASE(test_memmove_backward)
 81 | {
 82 |     typedef std::vector<char> buf_type;
 83 |     static size_t const buf_size = 1024;
 84 |     static std::uniform_int_distribution<int> distribution(0, 255);
 85 |     static auto const next_value = []() {return distribution(generator); };
 86 | 
 87 |     buf_type ref_buf;
 88 |     std::generate_n(std::back_inserter(ref_buf), buf_size, next_value);
 89 |     buf_type tst_buf(ref_buf);
 90 | 
 91 |     size_t const shift = buf_size / 4;
 92 |     std::memmove(&ref_buf[0], &ref_buf[shift], buf_size / 2);
 93 |     rvv::memmove(&tst_buf[0], &tst_buf[shift], buf_size / 2);
 94 |     BOOST_TEST(ref_buf == tst_buf);
 95 | }
 96 | 
 97 | // Fills a 1024-byte buffer with each sample of xrange(256) and checks every byte.
 98 | BOOST_DATA_TEST_CASE(test_memset, bdata::xrange(256), val)
 99 | {
100 |     using std::begin;
101 |     using std::end;
102 |     namespace ph = std::placeholders;
103 |     typedef std::vector<char> buf_type;
104 | 
105 |     buf_type buf(1024);
106 |     rvv::memset(buf.data(), val, buf.size() * sizeof(buf_type::value_type));
107 |     BOOST_TEST(std::all_of(begin(buf), end(buf), std::bind(std::equal_to<char>(), val, ph::_1)));
108 | }
109 | 


--------------------------------------------------------------------------------
/tests/unit_tests.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |     @file unit_tests.cpp
  3 |     @copyright ©2019 Syntacore.
  4 |     @authors
  5 |         Grigory Okhotnikov <go@syntacore.com>
  6 |     @brief Vector extension simulator (v0.7) per-instruction tests
  7 | */
  8 | 
  9 | #define BOOST_TEST_MODULE basic_ops
 10 | 
 11 | #include <boost/test/included/unit_test.hpp>
 12 | #include <boost/test/data/test_case.hpp>
 13 | #include <boost/test/data/monomorphic.hpp>
 14 | 
 15 | #include "riscv/ext/v.hpp"
 16 | 
 17 | using namespace ::riscv::v;
 18 | 
 19 | namespace {
 20 | std::default_random_engine generator(0);  // fixed seed, so every run sees the same data
 21 | 
 22 | // Adds `count` elements of src_a and src_b into dest, one vsetvl-sized chunk per pass
 23 | // (v0 <- src_a, v8 <- src_b, vadd_vv(v16, v8, v0), v16 -> dest). Returns dest.
 24 | template<typename Ty>
 25 | void*
 26 | add(void * const dest, void const *src_a, void const *src_b, size_t count)
 27 | {
 28 |     if (0 == count) return dest;
 29 | 
 30 |     Ty const *lhs = static_cast<Ty const*>(src_a);
 31 |     Ty const *rhs = static_cast<Ty const*>(src_b);
 32 |     Ty *out = static_cast<Ty*>(dest);
 33 |     do {
 34 |         size_t const chunk = vsetvl(count, vtype(e32, m8));
 35 |         vlw_v(v0, lhs);
 36 |         vlw_v(v8, rhs);
 37 |         vadd_vv(v16, v8, v0);
 38 |         vsw_v(v16, out);
 39 |         lhs += chunk;
 40 |         rhs += chunk;
 41 |         out += chunk;
 42 |         count -= chunk;
 43 |     } while (count);
 44 |     return dest;
 45 | }
 46 | 
 47 | // Adds the scalar src_b to `count` elements of src_a via vadd_vx and stores them to dest. Returns dest.
 48 | template<typename Ty>
 49 | void*
 50 | addx(void * const dest, void const *src_a, int32_t const src_b, size_t count)
 51 | {
 52 |     if (0 == count) return dest;
 53 | 
 54 |     Ty const *in = static_cast<Ty const*>(src_a);
 55 |     Ty *out = static_cast<Ty*>(dest);
 56 |     do {
 57 |         size_t const chunk = vsetvl(count, vtype(e32, m8));
 58 |         vlw_v(v0, in);
 59 |         vadd_vx(v16, v0, src_b);
 60 |         vsw_v(v16, out);
 61 |         in += chunk;
 62 |         out += chunk;
 63 |         count -= chunk;
 64 |     } while (count);
 65 | 
 66 |     return dest;
 67 | }
 68 | 
 69 | // Adds the immediate imm to `count` elements of src_a via vadd_vi and stores them to dest. Returns dest.
 70 | template<typename Ty>
 71 | void*
 72 | addi(void * const dest, void const *src_a, int16_t const imm, size_t count)
 73 | {
 74 |     if (0 == count) return dest;
 75 | 
 76 |     Ty const *in = static_cast<Ty const*>(src_a);
 77 |     Ty *out = static_cast<Ty*>(dest);
 78 |     do {
 79 |         size_t const chunk = vsetvl(count, vtype(e32, m8));
 80 |         vlw_v(v0, in);
 81 |         vadd_vi(v16, v0, imm);
 82 |         vsw_v(v16, out);
 83 |         in += chunk;
 84 |         out += chunk;
 85 |         count -= chunk;
 86 |     } while (count);
 87 | 
 88 |     return dest;
 89 | }
 90 | 
 91 | // Subtraction counterpart of add(): issues vsub_vv(v16, v8, v0) and stores v16 to dest. Returns dest.
 92 | // NOTE(review): v8 holds src_b and v0 holds src_a - confirm that vsub_vv's operand order
 93 | // yields src_a - src_b, which is what the `subtraction` test case compares against.
 94 | template<typename Ty>
 95 | void*
 96 | sub(void * const dest, void const *src_a, void const *src_b, size_t count)
 97 | {
 98 |     if (0 == count) return dest;
 99 |     Ty const *lhs = static_cast<Ty const*>(src_a);
100 |     Ty const *rhs = static_cast<Ty const*>(src_b);
101 |     Ty *out = static_cast<Ty*>(dest);
102 |     do {
103 |         size_t const chunk = vsetvl(count, vtype(e32, m8));
104 |         vlw_v(v0, lhs);
105 |         vlw_v(v8, rhs);
106 |         vsub_vv(v16, v8, v0);
107 |         vsw_v(v16, out);
108 |         lhs += chunk;
109 |         rhs += chunk;
110 |         out += chunk;
111 |         count -= chunk;
112 |     } while (count);
113 |     return dest;
114 | }
115 | }  // namespace
116 | 
117 | // Runs add() over two buffers of 16 random int32 values and compares with std::plus.
118 | BOOST_AUTO_TEST_CASE(addition)
119 | {
120 |     typedef std::vector<int32_t> buf_type;
121 |     static std::uniform_int_distribution<buf_type::value_type> distribution(0, 255);
122 |     static auto const next_value = []() {return distribution(generator); };
123 |     size_t const buf_size = 16;
124 | 
125 |     buf_type in_buf_a;
126 |     buf_type in_buf_b;
127 |     std::generate_n(std::back_inserter(in_buf_a), buf_size, next_value);
128 |     std::generate_n(std::back_inserter(in_buf_b), buf_size, next_value);
129 | 
130 |     buf_type out_buf(in_buf_a.size());
131 |     add<buf_type::value_type>(out_buf.data(), in_buf_a.data(), in_buf_b.data(), in_buf_a.size());
132 | 
133 |     buf_type ref_buf;
134 |     std::transform(in_buf_a.begin(), in_buf_a.end(), in_buf_b.begin(), std::back_inserter(ref_buf), std::plus<int32_t>());
135 | 
136 |     BOOST_TEST(ref_buf == out_buf);
137 | }
138 | 
139 | // Runs addx() with the scalar 127 over 16 random int32 values and compares with std::plus.
140 | BOOST_AUTO_TEST_CASE(addition_scalar)
141 | {
142 |     namespace ph = std::placeholders;
143 |     typedef std::vector<int32_t> buf_type;
144 |     static std::uniform_int_distribution<buf_type::value_type> distribution(0, 255);
145 |     static auto const next_value = []() {return distribution(generator); };
146 |     size_t const buf_size = 16;
147 |     int32_t const x = 127;
148 | 
149 |     buf_type in_buf_a;
150 |     std::generate_n(std::back_inserter(in_buf_a), buf_size, next_value);
151 | 
152 |     buf_type out_buf(in_buf_a.size());
153 |     addx<buf_type::value_type>(out_buf.data(), in_buf_a.data(), x, in_buf_a.size());
154 | 
155 |     buf_type ref_buf;
156 |     std::transform(in_buf_a.begin(), in_buf_a.end(), std::back_inserter(ref_buf), std::bind(std::plus<int32_t>(), ph::_1, x));
157 | 
158 |     BOOST_TEST(ref_buf == out_buf);
159 | }
160 | 
161 | // Runs addi() with the immediate 127 over 16 random int32 values and compares with std::plus.
162 | // NOTE(review): 127 does not fit the 5-bit simm5 field that doc/valu-format.adoc shows for
163 | // OPIVI - confirm that the simulator is meant to accept immediates of this size.
164 | BOOST_AUTO_TEST_CASE(addition_immediate)
165 | {
166 |     namespace ph = std::placeholders;
167 |     typedef std::vector<int32_t> buf_type;
168 |     static std::uniform_int_distribution<buf_type::value_type> distribution(0, 255);
169 |     static auto const next_value = []() {return distribution(generator); };
170 |     size_t const buf_size = 16;
171 |     int16_t const imm = 127;
172 | 
173 |     buf_type in_buf_a;
174 |     std::generate_n(std::back_inserter(in_buf_a), buf_size, next_value);
175 |     buf_type out_buf(in_buf_a.size());
176 |     addi<buf_type::value_type>(out_buf.data(), in_buf_a.data(), imm, in_buf_a.size());
177 | 
178 |     buf_type ref_buf;
179 |     std::transform(in_buf_a.begin(), in_buf_a.end(), std::back_inserter(ref_buf), std::bind(std::plus<int32_t>(), ph::_1, imm));
180 |     BOOST_TEST(ref_buf == out_buf);
181 | }
182 | 
183 | // Runs sub() over two buffers of 32 random int32 values and compares with std::minus
184 | // applied as (in_buf_a element) - (in_buf_b element).
185 | BOOST_AUTO_TEST_CASE(subtraction)
186 | {
187 |     typedef std::vector<int32_t> buf_type;
188 |     static std::uniform_int_distribution<buf_type::value_type> distribution(0, 255);
189 |     static auto const next_value = []() {return distribution(generator); };
190 |     size_t const buf_size = 32;
191 | 
192 |     buf_type in_buf_a;
193 |     buf_type in_buf_b;
194 |     std::generate_n(std::back_inserter(in_buf_a), buf_size, next_value);
195 |     std::generate_n(std::back_inserter(in_buf_b), buf_size, next_value);
196 | 
197 |     buf_type out_buf(in_buf_a.size());
198 |     sub<buf_type::value_type>(out_buf.data(), in_buf_a.data(), in_buf_b.data(), in_buf_a.size());
199 | 
200 |     buf_type ref_buf;
201 |     std::transform(in_buf_a.begin(), in_buf_a.end(), in_buf_b.begin(), std::back_inserter(ref_buf), std::minus<int32_t>());
202 | 
203 |     BOOST_TEST(ref_buf == out_buf);
204 | }
205 | 


--------------------------------------------------------------------------------
/tests/vector_examples.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |     @file vector_examples.cpp
  3 |     @copyright ©2019 Syntacore.
  4 |     @authors
  5 |         Grigory Okhotnikov <go@syntacore.com>
  6 |     @brief RISCV-V Vector extension (v0.7) simulator usage examples and tests
  7 | */
  8 | 
  9 | #define BOOST_TEST_MODULE vector_examples
 10 | 
 11 | #include <boost/test/included/unit_test.hpp>
 12 | #include <boost/test/data/test_case.hpp>
 13 | #include <boost/test/data/monomorphic.hpp>
 14 | 
 15 | #include "riscv/ext/v.hpp"
 16 | 
 17 | using namespace ::riscv::v;
 18 | 
 19 | namespace {
 20 | std::default_random_engine generator(0);  // random engine constructed with the fixed seed 0
 21 | 
 22 | // Adds n 32-bit integers of src_a and src_b with vadd_vv and stores the sums to dest. Returns dest.
 23 | void*
 24 | vvaddint32(size_t n, void * const dest, void const *src_a, void const *src_b)
 25 | {
 26 |     if (0 == n) return dest;
 27 | 
 28 |     int32_t const *lhs = static_cast<int32_t const*>(src_a);
 29 |     int32_t const *rhs = static_cast<int32_t const*>(src_b);
 30 |     int32_t *out = static_cast<int32_t*>(dest);
 31 |     do {
 32 |         size_t const chunk = vsetvli(n, vtypei(e32, m1)); // Set vector length based on 32-bit vectors
 33 |         vlw_v(v0, lhs);        // Get first vector
 34 |         vlw_v(v1, rhs);        // Get second vector
 35 |         vadd_vv(v2, v0, v1);   // Sum vectors
 36 |         vsw_v(v2, out);        // Store result
 37 |         lhs += chunk;          // Bump all three pointers
 38 |         rhs += chunk;
 39 |         out += chunk;
 40 |         n -= chunk;            // Decrement number done
 41 |     } while (n);               // Loop back
 42 | 
 43 |     return dest;
 44 | }
 45 | 
 46 | // Mixed-width example: a mask computed from 8-bit a[i] selects which 32-bit c[i] overwrite a splat of 1 in b.
 47 | void
 48 | mixed_width(size_t n, void const *a, void * const b, void const *c)
 49 | {
 50 |     if (0 == n) return;
 51 | 
 52 |     int8_t const *cond = static_cast<int8_t const*>(a);
 53 |     int32_t *out = static_cast<int32_t*>(b);
 54 |     int32_t const *in = static_cast<int32_t const*>(c);
 55 |     do {
 56 |         size_t const byte_vl = vsetvli(n, vtypei(e8, m1));   // Byte vector for predicate calc
 57 |         vlb_v(v1, cond);                                      // Load a[i]
 58 |         vmsle_vi(v0, v1, 5 - 1);                              // a[i] < 5?
 59 |         cond += byte_vl;                                      // Bump pointer by the byte-vector length
 60 | 
 61 |         size_t const word_vl = vsetvli(n, vtypei(e32, m4));  // Vector of 32-bit values
 62 |         vmv_v_i(v4, 1);                                       // Splat immediate to destination
 63 |         vlw_v(v4, in, vop_type::masked_in);                   // Load requested elements of C
 64 |         vsw_v(v4, out);                                       // Store b[i]
 65 |         in += word_vl;                                        // Bump both 32-bit pointers
 66 |         out += word_vl;
 67 |         n -= word_vl;                                         // Decrement count
 68 |     } while (n);                                              // Any more?
 69 | }
 70 | 
 71 | // Copies n bytes from src to dest in vsetvli-sized (e8, m8) chunks. Returns dest.
 72 | void*
 73 | vmemcpy(void * const dest, void const *src, size_t n)
 74 | {
 75 |     if (0 == n) return dest;
 76 | 
 77 |     int8_t const *from = static_cast<int8_t const*>(src);
 78 |     int8_t *to = static_cast<int8_t *>(dest);
 79 |     do {
 80 |         size_t const chunk = vsetvli(n, vtypei(e8, m8)); // Vectors of 8b
 81 |         vlb_v(v0, from);   // Load bytes
 82 |         vsb_v(v0, to);     // Store bytes
 83 |         from += chunk;     // Bump both pointers
 84 |         to += chunk;
 85 |         n -= chunk;        // Decrement count
 86 |     } while (n);           // Any more?
 87 |     return dest;
 88 | }
 89 | 
 90 | // Conditional example: a mask computed from x[i] picks a[i], the inverted mask picks b[i]; the result goes to z.
 91 | void
 92 | conditional(size_t n, void const *x, void const *a, void const *b, void * const z)
 93 | {
 94 |     if (0 == n) return;
 95 | 
 96 |     int8_t const *cond = static_cast<int8_t const*>(x);
 97 |     int16_t const *if_set = static_cast<int16_t const*>(a);
 98 |     int16_t const *if_clear = static_cast<int16_t const*>(b);
 99 |     int16_t *out = static_cast<int16_t*>(z);
100 |     do {
101 |         size_t const chunk = vsetvli(n, vtypei(e16));   // Use 16b elements.
102 |         vlb_v(v0, cond);                                 // Get x[i], sign-extended to 16b
103 |         vmsle_vi(v0, v0, 5 - 1);                         // Set mask in v0
104 |         vlh_v(v1, if_set, vop_type::masked_in);          // z[i] = a[i] case
105 |         vmnot_m(v0, v0);                                 // Invert v0
106 |         vlh_v(v1, if_clear, vop_type::masked_in);        // z[i] = b[i] case
107 |         vsh_v(v1, out);                                  // Store z
108 |         cond += chunk;                                   // Bump the x, a, b and z pointers
109 |         if_set += chunk;
110 |         if_clear += chunk;
111 |         out += chunk;
112 |         n -= chunk;                                      // Decrement element count
113 |     } while (n);
114 | }
115 | 
116 | // SAXPY example: per (e32, m8) chunk loads x into v0 and y into v8, applies vfmacc_vf(v8, a, v0), stores v8 to y.
117 | void
118 | saxpy(size_t n, float const a, float const *x, float *y)
119 | {
120 |     if (0 == n) return;
121 |     do {
122 |         size_t const chunk = vsetvli(n, vtypei(e32, m8));
123 |         vlw_v(v0, reinterpret_cast<int32_t const*>(x));
124 |         vlw_v(v8, reinterpret_cast<int32_t*>(y));
125 |         vfmacc_vf(v8, a, v0);
126 |         vsw_v(v8, reinterpret_cast<int32_t*>(y));
127 |         x += chunk;
128 |         y += chunk;
129 |         n -= chunk;
130 |     } while (n);
131 | }
132 | 
133 | void
134 | sgemm(size_t n, size_t m, size_t k, float const *a, size_t lda, float const *b, size_t ldb, float *c, size_t ldc)
135 | {
136 |     // C += A * B for row-major float matrices (C: m x n, A: m x k, B: k x n); lda/ldb/ldc are row strides in elements.
137 |     if ((n == 0) || (m == 0) || (k == 0)) {
138 |         return;
139 |     }
140 | 
141 |     size_t const astride = lda;
142 |     size_t const bstride = ldb;
143 |     size_t const cstride = ldc;
144 | 
145 |     while (m >= 16) { // Loop across rows of C blocks
146 |         size_t nt = n; // Initialize n counter for next row of C blocks
147 |         float const *bnp = b; // Initialize B n-loop pointer to start
148 |         float *cnp = c; // Initialize C n-loop pointer
149 | 
150 |         while (nt) { // Loop across one row of C blocks
151 |             size_t const nvl = vsetvli(nt, vtypei(e32)); // 32-bit vectors, LMUL=1
152 |             float const *akp = a; // reset pointer into A to beginning
153 |             float const *bkp = bnp; // step to next column in B matrix
154 | 
155 |             // Initialize current C submatrix block from memory
156 |             vlw_v(v0, reinterpret_cast<int32_t*>(cnp));
157 |             float *ccp = cnp + cstride;
158 |             vlw_v(v1, reinterpret_cast<int32_t*>(ccp));
159 |             ccp += cstride;
160 |             vlw_v(v2, reinterpret_cast<int32_t*>(ccp));
161 |             ccp += cstride;
162 |             vlw_v(v3, reinterpret_cast<int32_t*>(ccp));
163 |             ccp += cstride;
164 |             vlw_v(v4, reinterpret_cast<int32_t*>(ccp));
165 |             ccp += cstride;
166 |             vlw_v(v5, reinterpret_cast<int32_t*>(ccp));
167 |             ccp += cstride;
168 |             vlw_v(v6, reinterpret_cast<int32_t*>(ccp));
169 |             ccp += cstride;
170 |             vlw_v(v7, reinterpret_cast<int32_t*>(ccp));
171 |             ccp += cstride;
172 |             vlw_v(v8, reinterpret_cast<int32_t*>(ccp));
173 |             ccp += cstride;
174 |             vlw_v(v9, reinterpret_cast<int32_t*>(ccp));
175 |             ccp += cstride;
176 |             vlw_v(v10, reinterpret_cast<int32_t*>(ccp));
177 |             ccp += cstride;
178 |             vlw_v(v11, reinterpret_cast<int32_t*>(ccp));
179 |             ccp += cstride;
180 |             vlw_v(v12, reinterpret_cast<int32_t*>(ccp));
181 |             ccp += cstride;
182 |             vlw_v(v13, reinterpret_cast<int32_t*>(ccp));
183 |             ccp += cstride;
184 |             vlw_v(v14, reinterpret_cast<int32_t*>(ccp));
185 |             ccp += cstride;
186 |             vlw_v(v15, reinterpret_cast<int32_t*>(ccp));
187 | 
188 |             size_t kt = k; // Initialize inner loop counter
189 | 
190 |             // Inner loop scheduled assuming 4-clock occupancy of vfmacc instruction and single-issue pipeline
191 |             // Software pipeline loads
192 |             float ft0 = akp[0];
193 |             float const *amp = akp + astride;
194 |             float ft1 = amp[0];
195 |             amp += astride;
196 |             float ft2 = amp[0];
197 |             amp += astride;
198 |             float ft3 = amp[0];
199 |             amp += astride;
200 | 
201 |             // Get vector from B matrix
202 |             vlw_v(v16, reinterpret_cast<int32_t const*>(bkp));
203 | 
204 |             while (kt) { // Loop on inner dimension for current C block
205 |                 vfmacc_vf(v0, ft0, v16);
206 |                 bkp += bstride;
207 |                 float ft4 = amp[0];
208 |                 amp += astride;
209 |                 vfmacc_vf(v1, ft1, v16);
210 |                 kt -= 1; // Decrement k counter
211 |                 float ft5 = amp[0];
212 |                 amp += astride;
213 |                 vfmacc_vf(v2, ft2, v16);
214 |                 float ft6 = amp[0];
215 |                 amp += astride;
216 |                 float ft7 = amp[0];
217 |                 vfmacc_vf(v3, ft3, v16);
218 |                 amp += astride;
219 |                 float ft8 = amp[0];
220 |                 amp += astride;
221 |                 vfmacc_vf(v4, ft4, v16);
222 |                 float ft9 = amp[0];
223 |                 amp += astride;
224 |                 vfmacc_vf(v5, ft5, v16);
225 |                 float ft10 = amp[0];
226 |                 amp += astride;
227 |                 vfmacc_vf(v6, ft6, v16);
228 |                 float ft11 = amp[0];
229 |                 amp += astride;
230 |                 vfmacc_vf(v7, ft7, v16);
231 |                 float ft12 = amp[0];
232 |                 amp += astride;
233 |                 vfmacc_vf(v8, ft8, v16);
234 |                 float ft13 = amp[0];
235 |                 amp += astride;
236 |                 vfmacc_vf(v9, ft9, v16);
237 |                 float ft14 = amp[0];
238 |                 amp += astride;
239 |                 vfmacc_vf(v10, ft10, v16);
240 |                 float ft15 = amp[0];
241 |                 amp += astride;
242 |                 akp += 1; // Move to next column of a
243 |                 vfmacc_vf(v11, ft11, v16);
244 |                 // Don't load past end of matrix
245 |                 if (0 != kt) {
246 |                     ft0 = akp[0];
247 |                     amp = akp + astride;
248 |                 }
249 |                 vfmacc_vf(v12, ft12, v16);
250 |                 if (0 != kt) {
251 |                     ft1 = amp[0];
252 |                     amp += astride;
253 |                 }
254 |                 vfmacc_vf(v13, ft13, v16);
255 |                 if (0 != kt) {
256 |                     ft2 = amp[0];
257 |                     amp += astride;
258 |                 }
259 |                 vfmacc_vf(v14, ft14, v16);
260 |                 if (0 != kt) {
261 |                     ft3 = amp[0];
262 |                     amp += astride;
263 |                 }
264 |                 vfmacc_vf(v15, ft15, v16);
265 |                 if (0 != kt) { // Fetch the next B row only when another iteration follows; bkp is past the last row otherwise
266 |                     vlw_v(v16, reinterpret_cast<int32_t const *>(bkp));
267 |                 }
268 |             } // k_loop
269 | 
270 |             // Save C matrix block back to memory
271 |             vsw_v(v0, reinterpret_cast<int32_t*>(cnp));
272 |             ccp = cnp + cstride;
273 |             vsw_v(v1, reinterpret_cast<int32_t*>(ccp));
274 |             ccp += cstride;
275 |             vsw_v(v2, reinterpret_cast<int32_t*>(ccp));
276 |             ccp += cstride;
277 |             vsw_v(v3, reinterpret_cast<int32_t*>(ccp));
278 |             ccp += cstride;
279 |             vsw_v(v4, reinterpret_cast<int32_t*>(ccp));
280 |             ccp += cstride;
281 |             vsw_v(v5, reinterpret_cast<int32_t*>(ccp));
282 |             ccp += cstride;
283 |             vsw_v(v6, reinterpret_cast<int32_t*>(ccp));
284 |             ccp += cstride;
285 |             vsw_v(v7, reinterpret_cast<int32_t*>(ccp));
286 |             ccp += cstride;
287 |             vsw_v(v8, reinterpret_cast<int32_t*>(ccp));
288 |             ccp += cstride;
289 |             vsw_v(v9, reinterpret_cast<int32_t*>(ccp));
290 |             ccp += cstride;
291 |             vsw_v(v10, reinterpret_cast<int32_t*>(ccp));
292 |             ccp += cstride;
293 |             vsw_v(v11, reinterpret_cast<int32_t*>(ccp));
294 |             ccp += cstride;
295 |             vsw_v(v12, reinterpret_cast<int32_t*>(ccp));
296 |             ccp += cstride;
297 |             vsw_v(v13, reinterpret_cast<int32_t*>(ccp));
298 |             ccp += cstride;
299 |             vsw_v(v14, reinterpret_cast<int32_t*>(ccp));
300 |             ccp += cstride;
301 |             vsw_v(v15, reinterpret_cast<int32_t*>(ccp));
302 | 
303 |             // Following tail instructions should be scheduled earlier in free slots during C block save
304 | 
305 |             //  Bump pointers for loop across blocks in one row
306 |             cnp += nvl; // Move C block pointer over
307 |             bnp += nvl; // Move B block pointer over
308 |             nt -= nvl; // Decrement element count in n dimension
309 |         } // c_col_loop
310 | 
311 |         m -= 16;
312 |         a += astride * 16;
313 |         c += cstride * 16;
314 |     } // c_row_loop
315 |     // TODO: Handle end of matrix with fewer than 16 rows.
316 | }
317 | }  // namespace
318 | 
319 | BOOST_AUTO_TEST_CASE(vector_vector_add_example)
320 | {
321 |     // Checks vvaddint32 against an element-wise std::plus reference on two random 128-element inputs.
322 |     typedef std::vector<int32_t> vec_type;
323 |     size_t const count = 128;
324 |     static std::uniform_int_distribution<vec_type::value_type> value_dist(0, 255);
325 |     static auto const next_value = []() { return value_dist(generator); };
326 | 
327 |     // Draw order from the file-level generator: first operand, then second operand.
328 |     vec_type lhs;
329 |     std::generate_n(std::back_inserter(lhs), count, next_value);
330 |     vec_type rhs;
331 |     std::generate_n(std::back_inserter(rhs), count, next_value);
332 | 
333 |     vec_type out_buf(lhs.size());
334 |     vvaddint32(count, out_buf.data(), lhs.data(), rhs.data());
335 | 
336 |     vec_type ref_buf;
337 |     std::transform(lhs.begin(), lhs.end(), rhs.begin(), std::back_inserter(ref_buf), std::plus<int32_t>());
338 |     BOOST_TEST(ref_buf == out_buf);
339 | }
340 | 
341 | BOOST_AUTO_TEST_CASE(mixed_width_example)
342 | {
343 |     // Runs mixed_width on an 8-bit array `a` and 32-bit arrays `b`/`c`, then compares `b` as left by
344 |     // the call with the scalar model (a[i] < 5) ? c[i] : 1 computed below.
345 |     typedef std::vector<int32_t> buf_type;
346 |     const size_t buf_size = 128;
347 |     std::vector<int8_t> a;
348 |     buf_type b;
349 |     buf_type c;
350 | 
351 |     static std::uniform_int_distribution<buf_type::value_type> distribution(0, 255);
352 |     static std::uniform_int_distribution<int16_t> distribution_a(0, 10);
353 |     static auto const gen = []() { return distribution(generator); };
354 |     static auto const gen_a = []() { return distribution_a(generator); };
355 |     std::generate_n(std::back_inserter(a), buf_size, gen_a);
356 |     std::generate_n(std::back_inserter(b), buf_size, gen);
357 |     std::generate_n(std::back_inserter(c), buf_size, gen);
358 | 
359 |     // Expected values are built before the call to mixed_width.
360 |     buf_type ref_buf;
361 |     ref_buf.reserve(buf_size);
362 |     for (size_t i = 0; i < buf_size; ++i) {
363 |         ref_buf.push_back(a[i] < 5 ? c[i] : 1);
364 |     }
365 | 
366 |     mixed_width(buf_size, &a[0], &b[0], &c[0]);
367 | 
368 |     BOOST_TEST(ref_buf == b);
369 | }
370 | 
371 | BOOST_AUTO_TEST_CASE(memcpy_example)
372 | {
373 |     // Fills a 1024-element char buffer with random values, copies it with vmemcpy and
374 |     // expects the destination to compare equal to the source.
375 |     typedef std::vector<char> bytes_type;
376 |     size_t const byte_count = 1024;
377 |     static std::uniform_int_distribution<int> byte_dist(0, 255);
378 |     static auto const next_byte = []() { return byte_dist(generator); };
379 | 
380 |     bytes_type in_buf;
381 |     std::generate_n(std::back_inserter(in_buf), byte_count, next_byte);
382 |     bytes_type out_buf(in_buf.size());
383 |     size_t const total_bytes = in_buf.size() * sizeof(bytes_type::value_type);
384 |     vmemcpy(out_buf.data(), in_buf.data(), total_bytes);
385 |     BOOST_TEST(in_buf == out_buf);
386 | }
387 | 
388 | BOOST_AUTO_TEST_CASE(conditional_example)
389 | {
390 |     // Compares conditional() with the scalar selection (x[i] < 5) ? a[i] : b[i] on random
391 |     // 16-bit operands and an 8-bit selector drawn from [0, 10].
392 |     typedef std::vector<int16_t> vec16_type;
393 |     size_t const count = 128;
394 |     static std::uniform_int_distribution<vec16_type::value_type> operand_dist(0, 255);
395 |     static std::uniform_int_distribution<int16_t> selector_dist(0, 10);
396 |     static auto const next_operand = []() { return operand_dist(generator); };
397 |     static auto const next_selector = []() { return selector_dist(generator); };
398 | 
399 |     // Draw order from the file-level generator: x, then a, then b.
400 |     std::vector<int8_t> x;
401 |     std::generate_n(std::back_inserter(x), count, next_selector);
402 |     vec16_type a;
403 |     std::generate_n(std::back_inserter(a), count, next_operand);
404 |     vec16_type b;
405 |     std::generate_n(std::back_inserter(b), count, next_operand);
406 | 
407 |     vec16_type out_buf(a.size());
408 |     conditional(count, x.data(), a.data(), b.data(), out_buf.data());
409 | 
410 |     vec16_type ref_buf;
411 |     for (size_t i = 0; i != count; ++i) {
412 |         ref_buf.push_back(x[i] < 5 ? a[i] : b[i]);
413 |     }
414 |     BOOST_TEST(ref_buf == out_buf);
415 | }
416 | 
417 | BOOST_AUTO_TEST_CASE(saxpy_example)
418 | {
419 |     // Runs saxpy in place on y and compares the result, element for element, with a scalar
420 |     // a * x[i] + y[i] reference computed from the inputs beforehand.
421 |     typedef std::vector<float> vecf_type;
422 |     size_t const count = 128;
423 |     float const a = 2.7f;
424 |     static std::uniform_real_distribution<float> sample_dist(0, 255);
425 |     static auto const next_sample = []() { return sample_dist(generator); };
426 | 
427 |     vecf_type x;
428 |     std::generate_n(std::back_inserter(x), count, next_sample);
429 |     vecf_type y;
430 |     std::generate_n(std::back_inserter(y), count, next_sample);
431 | 
432 |     // NOTE(review): the comparison is exact, so it presumably relies on vfmacc_vf rounding
433 |     // exactly like the scalar expression below - confirm before changing either side.
434 |     vecf_type ref_buf;
435 |     for (size_t i = 0; i != count; ++i) {
436 |         ref_buf.push_back(a * x[i] + y[i]);
437 |     }
438 | 
439 |     saxpy(count, a, x.data(), y.data());
440 |     BOOST_TEST(ref_buf == y);
441 | }
442 | 
443 | BOOST_AUTO_TEST_CASE(sgemm_example)
444 | {
445 |     // Checks sgemm (C += A * B) against a scalar triple loop. A is M x K, B is K x N and
446 |     // C is M x N; the reference below indexes all three as row-major buffers without padding.
447 | 
448 |     const size_t M = 16;
449 |     const size_t K = 16;
450 |     const size_t N = 16;
451 | 
452 |     typedef std::vector<float> buf_type;
453 | 
454 |     buf_type a;
455 |     buf_type b;
456 |     buf_type c;
457 | 
458 |     static std::uniform_real_distribution<float> distribution(0, 10);
459 |     static auto const gen = []() { return distribution(generator); };
460 | 
461 |     std::generate_n(std::back_inserter(a), M * K, gen);
462 |     std::generate_n(std::back_inserter(b), K * N, gen);
463 |     std::generate_n(std::back_inserter(c), M * N, gen);
464 | 
465 |     buf_type ref_buf(M * N);
466 | 
467 |     for (size_t i = 0; i < M; ++i) {
468 |         for (size_t j = 0; j < N; ++j) {
469 |             float sum = c[i * N + j];
470 |             for (size_t k = 0; k < K; k++)
471 |                 sum += a[i * K + k] * b[k * N + j];
472 |             ref_buf[i * N + j] = sum;
473 |         }
474 |     }
475 |     // Leading dimensions are the row lengths used by the reference indexing: K for A, N for B and C.
476 |     sgemm(N, M, K, &a[0], K, &b[0], N, &c[0], N);
477 | 
478 |     BOOST_TEST(ref_buf == c);
479 | }
480 | 


--------------------------------------------------------------------------------
/include/riscv/ext/v.hpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |     @file v.hpp
  3 |     @copyright ©2019 Syntacore.
  4 |     @authors
  5 |         Grigory Okhotnikov <go@syntacore.com>
  6 |     @brief Vector extension simulator (v0.7)
  7 | */
  8 | 
  9 | #ifndef RISCV_EXT_V_HPP_
 10 | #define RISCV_EXT_V_HPP_
 11 | 
 12 | #include <cstddef>
 13 | #include <cstdint>
 14 | #include <stdexcept>
 15 | #include <string>
 16 | #include <type_traits>
 17 | #define PASTE_(OP1, OP2) OP1 ## OP2 // Token-pastes the two arguments exactly as written.
 18 | #define CONCAT_(OP1, OP2) PASTE_(OP1, OP2) // Pastes after macro-expanding both arguments (extra indirection level).
 19 | 
 20 | namespace riscv {
 21 | 
 22 | typedef int64_t xreg_type; // Type of scalar register operands, e.g. rs1 of the *_vx operations declared below.
 23 | 
 24 | class Invalid_instruction // Common base class of the invalid-instruction errors declared in this header.
 25 |     : public std::invalid_argument
 26 | {
 27 | protected:
 28 |     // Constructible by derived classes only; what() yields the fixed prefix followed by msg.
 29 |     Invalid_instruction(std::string const& msg)
 30 |         : std::invalid_argument("exception: invalid instruction: " + msg)
 31 |     {}
 32 | };
 33 | 
 34 | namespace v {
 35 | inline namespace spec_0_7 {
 36 | 
 37 | enum class vop_type : uint8_t // Execution mode passed as the `mode` argument of the vector operations.
 38 | {
 39 |     thread_all = 0b0, // the default `mode` in the declarations below
 40 |     masked_in = 0b1, // NOTE(review): presumably limits the operation to elements enabled by the mask in v0 - confirm
 41 | };
 42 | 
 43 | enum vreg_no : uint8_t // Vector register names v0..v31; each enumerator's value equals its register number.
 44 | {
 45 |     v0 = 0,
 46 |     v1 = 1,
 47 |     v2 = 2,
 48 |     v3 = 3,
 49 |     v4 = 4,
 50 |     v5 = 5,
 51 |     v6 = 6,
 52 |     v7 = 7,
 53 |     v8 = 8,
 54 |     v9 = 9,
 55 |     v10 = 10,
 56 |     v11 = 11,
 57 |     v12 = 12,
 58 |     v13 = 13,
 59 |     v14 = 14,
 60 |     v15 = 15,
 61 |     v16 = 16,
 62 |     v17 = 17,
 63 |     v18 = 18,
 64 |     v19 = 19,
 65 |     v20 = 20,
 66 |     v21 = 21,
 67 |     v22 = 22,
 68 |     v23 = 23,
 69 |     v24 = 24,
 70 |     v25 = 25,
 71 |     v26 = 26,
 72 |     v27 = 27,
 73 |     v28 = 28,
 74 |     v29 = 29,
 75 |     v30 = 30,
 76 |     v31 = 31
 77 | };
 78 | 
 79 | enum vreg_ew : uint8_t // Element-width codes; the width named by each enumerator is (8 << value) bits.
 80 | {
 81 |     e8 = 0b00,
 82 |     e16 = 0b01,
 83 |     e32 = 0b10,
 84 |     e64 = 0b11,
 85 |     e128 = 0b100 // needs a third bit; vtype()/vtypei() place this code starting at bit 2
 86 | };
 87 | 
 88 | enum vreg_mul : uint8_t // Register-grouping codes; the multiplier named by each enumerator is (1 << value).
 89 | {
 90 |     m1 = 0b00, // default `mul` argument of vtype() and vtypei()
 91 |     m2 = 0b01,
 92 |     m4 = 0b10,
 93 |     m8 = 0b11
 94 | };
 95 | 
 96 | inline size_t // Builds a vtype value: `mul` in bits [1:0], `ew` starting at bit 2, `vill` in the top bit of an xreg_type.
 97 | vtype(vreg_ew ew, vreg_mul mul = m1, size_t vill = 0)
 98 | {
 99 |     return (vill << (8 * sizeof(xreg_type) - 1)) | (ew << 2) | mul; // sizeof() counts bytes, hence the factor 8; assumes size_t is as wide as xreg_type
100 | }
101 | 
102 | inline int16_t // Immediate vtype encoding as passed to vsetvli: `mul` in bits [1:0], `ew` starting at bit 2.
103 | vtypei(vreg_ew ew, vreg_mul mul = m1)
104 | {
105 |     return static_cast<int16_t>(mul | (ew << 2));
106 | }
107 | 
108 | namespace implementation {
109 | 
110 | typedef float float32_t; // NOTE(review): assumes the host `float` is 32 bits wide - confirm
111 | typedef double float64_t; // NOTE(review): assumes the host `double` is 64 bits wide - confirm
112 | 
113 | #if 0 // Compiled out: Bad_element_size is currently not part of the build.
114 | class Bad_element_size
115 |     : public Invalid_instruction
116 | {
117 | public:
118 |     Bad_element_size(size_t el_size)
119 |         : Invalid_instruction(std::string("RVV config bad element size: ") + std::to_string(el_size))
120 |     {}
121 | };
122 | #endif
123 | 
124 | class State_not_configured
125 |     : public Invalid_instruction
126 | {
127 | public:
128 |     State_not_configured()
129 |         : Invalid_instruction(std::string("Illegal configuration (vill is set)"))
130 |     {}
131 | };
132 | 
133 | class Register_out_of_config_range
134 |     : public Invalid_instruction
135 | {
136 | public:
137 |     Register_out_of_config_range(vreg_no reg)
138 |         : Invalid_instruction(std::string("RVV register is out of config range : ") + std::to_string(static_cast<unsigned>(reg)))
139 |     {}
140 | };
141 | 
142 | class Load_wider_value_to_narrowed_element
143 |     : public Invalid_instruction
144 | {
145 | public:
146 |     Load_wider_value_to_narrowed_element(size_t value_size, size_t el_size)
147 |         : Invalid_instruction(std::string("RVV load wider value (size=") + std::to_string(value_size) + ") to narrowed element (size" + std::to_string(el_size) + ")")
148 |     {}
149 | };
150 | 
151 | class Instruction_undefined_for_element_size
152 |     : public Invalid_instruction
153 | {
154 | public:
155 |     Instruction_undefined_for_element_size(size_t el_size)
156 |         : Invalid_instruction(std::string("Instruction undefined for element size ") + std::to_string(el_size))
157 |     {}
158 | };
159 | 
160 | class V_unit; // Defined later in this header; Loader and Saver take it by reference.
161 | class State; // Forward declaration only in this part of the header.
162 | 
163 | template<size_t N> // Declared only here; Loader::to_element uses Size_traits<sizeof(T)>::uint_type.
164 | struct Size_traits;
165 | 
166 | template<typename Memory_type> // Abstract, non-copyable loader interface for elements of type Memory_type.
167 | class Loader
168 | {
169 | private:
170 |     Loader(Loader const&) = delete;
171 |     Loader& operator = (Loader const&) = delete;
172 | 
173 | protected:
174 |     Loader() = default;
175 |     virtual ~Loader() = default;
176 | 
177 | protected: // to_element overload set: converts a Value_type into a Memory_type, selected by relative size and kind.
178 |     static Memory_type
179 |     to_element(bool const &_val) // bool: yields Memory_type(true) or Memory_type(false)
180 |     {
181 |         return Memory_type(!!_val);
182 |     }
183 | 
184 |     template<typename Value_type>
185 |     static typename std::enable_if<
186 |         (sizeof(Value_type) > sizeof(Memory_type)),
187 |         Memory_type
188 |     >::type
189 |     to_element(Value_type const &) // wider value: always rejected with an exception
190 |     {
191 |         throw Load_wider_value_to_narrowed_element(sizeof(Value_type), sizeof(Memory_type));
192 |     }
193 | 
194 |     template<typename Value_type>
195 |     static typename std::enable_if<
196 |         sizeof(Memory_type) == sizeof(Value_type),
197 |         Memory_type
198 |     >::type
199 |     to_element(Value_type const &_val) // same size: the object representation is reinterpreted, not converted
200 |     {
201 |         return reinterpret_cast<Memory_type const &>(_val);
202 |     }
203 | 
204 |     template<typename Value_type>
205 |     static typename std::enable_if<
206 |         (sizeof(Value_type) < sizeof(Memory_type)) &&
207 |         std::is_integral<Value_type>::value &&
208 |         std::is_unsigned<Value_type>::value,
209 |         Memory_type
210 |     >::type
211 |     to_element(Value_type const &_val) // narrower unsigned integer: zero-extended via the unsigned counterpart of Memory_type
212 |     {
213 |         return static_cast<Memory_type>(static_cast<typename std::make_unsigned<Memory_type>::type>(_val));
214 |     }
215 | 
216 |     template<typename Value_type>
217 |     static typename std::enable_if<
218 |         (sizeof(Value_type) < sizeof(Memory_type)) &&
219 |         std::is_integral<Value_type>::value &&
220 |         std::is_signed<Value_type>::value,
221 |         Memory_type
222 |     >::type
223 |     to_element(Value_type const &_val) // narrower signed integer: plain value conversion (sign-extends when Memory_type is signed)
224 |     {
225 |         return static_cast<Memory_type>(_val);
226 |     }
227 | 
228 |     template<typename Value_type>
229 |     static typename std::enable_if<
230 |         (sizeof(Value_type) < sizeof(Memory_type)) &&
231 |         std::is_floating_point<Value_type>::value,
232 |         Memory_type
233 |     >::type
234 |     to_element(Value_type const &_val) // narrower floating-point value: its bits are widened as an unsigned integer
235 |     {
236 |         typedef typename std::make_unsigned<Memory_type>::type uel_type;
237 |         //        static uel_type const NaN_box = ~uel_type(0) << (CHAR_BIT * sizeof(Value_type));
238 | 
239 |         typedef typename Size_traits<sizeof(Value_type)>::uint_type uval_type; // presumably an unsigned integer of the same size as Value_type - verify against Size_traits
240 |         //        return static_cast<Memory_type>(NaN_box | uel_type(reinterpret_cast<uval_type const&>(_val)));
241 |         return static_cast<Memory_type>(uel_type(reinterpret_cast<uval_type const &>(_val))); // upper bits stay zero; the NaN-boxing variant above is disabled
242 |     }
243 | 
244 | public:
245 |     virtual void operator()(V_unit& vu, vreg_no vd, Memory_type const* rs1, ptrdiff_t rs2, vop_type mode) = 0; // strided form; the unit-stride wrappers pass sizeof(TYPE) as rs2
246 |     virtual void operator()(V_unit& vu, vreg_no vd, Memory_type const* rs1, vreg_no idx, vop_type mode) = 0; // indexed form; idx names a vector register
247 | };
248 | 
249 | template<typename Memory_type> // Abstract, non-copyable store interface for elements of type Memory_type.
250 | class Saver
251 | {
252 | private:
253 |     Saver(Saver const&) = delete;
254 |     Saver& operator = (Saver const&) = delete;
255 | 
256 | protected:
257 |     Saver() = default;
258 |     virtual ~Saver() = default;
259 | 
260 | public:
261 |     virtual void operator()(V_unit& vu, vreg_no vs1, Memory_type* rs1, ptrdiff_t rs2, vop_type mode) const = 0; // strided form; save() defaults rs2 to sizeof(Ty)
262 |     virtual void operator()(V_unit& vu, vreg_no vs1, Memory_type* rs1, vreg_no idx, vop_type mode) const = 0; // indexed form; idx names a vector register
263 | };
264 | 
265 | class Operations // Abstract, non-copyable interface of the integer/mask operations plus access to the loaders and savers.
266 | {
267 | private:
268 |     Operations(Operations const&) = delete;
269 |     Operations& operator = (Operations const&) = delete;
270 | 
271 | protected:
272 |     Operations() = default;
273 |     virtual ~Operations() = default;
274 | 
275 | public:
276 |     virtual operator Loader<int8_t>& () = 0; // conversion operators: load()/save() pick the Loader/Saver for their element type through these
277 |     virtual operator Loader<int16_t>& () = 0;
278 |     virtual operator Loader<int32_t>& () = 0;
279 |     virtual operator Loader<int64_t>& () = 0;
280 | 
281 |     virtual operator Loader<uint8_t>& () = 0; // unsigned loaders exist for 8/16/32 bits only
282 |     virtual operator Loader<uint16_t>& () = 0;
283 |     virtual operator Loader<uint32_t>& () = 0;
284 | 
285 |     virtual operator Saver<int8_t>& () = 0; // savers are declared for the signed types only
286 |     virtual operator Saver<int16_t>& () = 0;
287 |     virtual operator Saver<int32_t>& () = 0;
288 |     virtual operator Saver<int64_t>& () = 0;
289 | 
290 |     virtual void vadd_vv(vreg_no vd, vreg_no vs2, vreg_no vs1, vop_type mode = vop_type::thread_all) = 0;
291 |     virtual void vadd_vx(vreg_no vd, vreg_no vs2, xreg_type rs1, vop_type mode = vop_type::thread_all) = 0;
292 |     virtual void vadd_vi(vreg_no vd, vreg_no vs2, int16_t imm, vop_type mode = vop_type::thread_all) = 0;
293 | 
294 |     virtual void vsub_vv(vreg_no vd, vreg_no vs1, vreg_no vs2, vop_type mode = vop_type::thread_all) = 0; // NOTE(review): parameter names are (vs1, vs2) here but (vs2, vs1) in the other *_vv declarations - confirm the intended operand order
295 |     virtual void vsub_vx(vreg_no vd, vreg_no vs2, xreg_type rs1, vop_type mode = vop_type::thread_all) = 0;
296 | 
297 |     virtual void vmsle_vv(vreg_no vd, vreg_no vs2, vreg_no vs1, vop_type mode = vop_type::thread_all) = 0;
298 |     virtual void vmsle_vx(vreg_no vd, vreg_no vs2, xreg_type rs1, vop_type mode = vop_type::thread_all) = 0;
299 |     virtual void vmsle_vi(vreg_no vd, vreg_no vs2, int16_t imm, vop_type mode = vop_type::thread_all) = 0;
300 | 
301 |     virtual void vmand_mm(vreg_no vd, vreg_no vs2, vreg_no vs1) = 0; // the mask and move operations below take no mode argument
302 |     virtual void vmnand_mm(vreg_no vd, vreg_no vs2, vreg_no vs1) = 0;
303 | 
304 |     virtual void vmnot_m(vreg_no vd, vreg_no vs1) = 0;
305 | 
306 |     virtual void vmv_v_v(vreg_no vd, vreg_no vs1) = 0;
307 |     virtual void vmv_v_x(vreg_no vd, xreg_type rs1) = 0;
308 |     virtual void vmv_v_i(vreg_no vd, int16_t imm) = 0;
309 | #if 0 // Compiled out: the declarations up to the matching #endif are not part of the interface.
310 |     virtual void vaddw(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
311 |     virtual void vsubw(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
312 | 
313 |     virtual void vsll(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
314 |     virtual void vsra(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
315 |     virtual void vsrl(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
316 | 
317 |     virtual void vand(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
318 |     virtual void vor(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
319 |     virtual void vxor(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
320 | 
321 |     virtual void vseq(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
322 |     virtual void vslt(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
323 |     virtual void vsge(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
324 |     virtual void vsltu(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
325 |     virtual void vsgeu(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
326 | 
327 |     virtual void vaddi(vreg_no vd, vreg_no vs1, int16_t imm) = 0;
328 |     virtual void vslli(vreg_no vd, vreg_no vs1, int16_t imm) = 0;
329 |     virtual void vsrli(vreg_no vd, vreg_no vs1, int16_t imm) = 0;
330 |     virtual void vsrai(vreg_no vd, vreg_no vs1, int16_t imm) = 0;
331 |     virtual void vandi(vreg_no vd, vreg_no vs1, int16_t imm) = 0;
332 |     virtual void vori(vreg_no vd, vreg_no vs1, int16_t imm) = 0;
333 |     virtual void vxori(vreg_no vd, vreg_no vs1, int16_t imm) = 0;
334 |     virtual void vaddwi(vreg_no vd, vreg_no vs1, int16_t imm) = 0;
335 | 
336 |     virtual void vmul(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
337 |     virtual void vmulh(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
338 |     virtual void vmulhsu(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
339 |     virtual void vmulhu(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
340 | 
341 |     virtual void vdiv(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
342 |     virtual void vdivu(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
343 |     virtual void vrem(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
344 |     virtual void vremu(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
345 | 
346 |     virtual void vfadd_w(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
347 |     virtual void vfadd_d(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
348 |     virtual void vfsub_w(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
349 |     virtual void vfsub_d(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
350 |     virtual void vfmul_w(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
351 |     virtual void vfmul_d(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
352 |     virtual void vfdiv_w(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
353 |     virtual void vfdiv_d(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
354 | 
355 |     virtual void vfsgnj_w(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
356 |     virtual void vfsgnj_d(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
357 |     virtual void vfsgnjn_w(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
358 |     virtual void vfsgnjn_d(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
359 |     virtual void vfsgnjx_w(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
360 |     virtual void vfsgnjx_d(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
361 | 
362 |     virtual void vfmadd_w(vreg_no vd, vreg_no vs1, vreg_no vs2, vreg_no vs3) = 0;
363 |     virtual void vfmadd_d(vreg_no vd, vreg_no vs1, vreg_no vs2, vreg_no vs3) = 0;
364 |     virtual void vfmsub_w(vreg_no vd, vreg_no vs1, vreg_no vs2, vreg_no vs3) = 0;
365 |     virtual void vfmsub_d(vreg_no vd, vreg_no vs1, vreg_no vs2, vreg_no vs3) = 0;
366 |     virtual void vfmaddwdn_w(vreg_no vd, vreg_no vs1, vreg_no vs2, vreg_no vs3) = 0;
367 |     virtual void vfmsubwdn_w(vreg_no vd, vreg_no vs1, vreg_no vs2, vreg_no vs3) = 0;
368 | 
369 |     virtual void vfmin_w(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
370 |     virtual void vfmin_d(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
371 |     virtual void vfmax_w(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
372 |     virtual void vfmax_d(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
373 |     virtual void vfeq_w(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
374 |     virtual void vfeq_d(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
375 |     virtual void vflt_w(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
376 |     virtual void vflt_d(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
377 |     virtual void vfle_w(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
378 |     virtual void vfle_d(vreg_no vd, vreg_no vs1, vreg_no vs2) = 0;
379 |     virtual void vfsqrt_w(vreg_no vd, vreg_no vs1) = 0;
380 |     virtual void vfsqrt_d(vreg_no vd, vreg_no vs1) = 0;
381 | 
382 |     virtual void vinsx(vreg_no vd, int32_t value, size_t idx) = 0;
383 |     virtual void vmiota(vreg_no vd) = 0;
384 | #endif
385 | };
386 | 
387 | class Float_operations // Abstract, non-copyable interface of the floating-point vector operations.
388 | {
389 | private:
390 |     Float_operations(Float_operations const&) = delete;
391 |     Float_operations& operator = (Float_operations const&) = delete;
392 | 
393 | protected:
394 |     Float_operations() = default;
395 |     virtual ~Float_operations() = default;
396 | 
397 | public:
398 |     virtual void vfmacc_vf(vreg_no vd, float rs1, vreg_no vs2, vop_type mode = vop_type::thread_all) = 0; // NOTE(review): presumably vd[i] += rs1 * vs2[i], as the saxpy example uses it - confirm
399 | };
400 | 
401 | #ifndef RVV_ELEN // Overridable at compile time; becomes V_unit::ELEN.
402 | #define RVV_ELEN 64
403 | #endif
404 | 
405 | #ifndef RVV_VLEN // Overridable at compile time; becomes V_unit::VLEN.
406 | #define RVV_VLEN 256
407 | #endif
408 | 
409 | #ifndef RVV_SLEN // Overridable at compile time; becomes V_unit::SLEN.
410 | #define RVV_SLEN 64
411 | #endif
412 | 
413 | class V_unit // Abstract, non-copyable vector unit; the load()/save() helpers reach it through instance().
414 | {
415 | private:
416 |     V_unit(V_unit const&) = delete;
417 |     V_unit& operator=(V_unit const&) = delete;
418 | 
419 | protected:
420 |     V_unit() = default;
421 |     virtual ~V_unit() = default;
422 | 
423 | public:
424 |     static size_t const ELEN = RVV_ELEN; // taken from the RVV_* macros, which default to 64 / 256 / 64
425 |     static size_t const VLEN = RVV_VLEN;
426 |     static size_t const SLEN = RVV_SLEN;
427 |     static size_t const NREGS = 32; // matches the v0..v31 range of vreg_no
428 | 
429 |     static V_unit& instance(); // declared only; the definition is outside this header section
430 | 
431 |     virtual Operations& get_op_performer()const = 0; // object providing the integer/mask operations and the loaders/savers
432 |     virtual Float_operations& get_fop_performer()const = 0; // object providing the floating-point operations
433 | };
434 | 
435 | template<typename Ty> // Strided load into vd through the Loader<Ty> of the current operations object; rs2 defaults to sizeof(Ty).
436 | inline void
437 | load(vreg_no vd, Ty const* rs1, ptrdiff_t rs2 = sizeof(Ty), vop_type mode = vop_type::thread_all)
438 | {
439 |     static_cast<Loader<Ty>&>(V_unit::instance().get_op_performer())(V_unit::instance(), vd, rs1, rs2, mode);
440 | }
441 | 
442 | template<typename Ty> // Indexed load into vd: passes the index register vs1 to the Loader<Ty> instead of a stride.
443 | inline void
444 | load(vreg_no vd, Ty const* rs1, vreg_no vs1, vop_type mode = vop_type::thread_all)
445 | {
446 |     static_cast<Loader<Ty>&>(V_unit::instance().get_op_performer())(V_unit::instance(), vd, rs1, vs1, mode);
447 | }
448 | 
449 | template<typename Ty> // Strided store of vs1 through the Saver<Ty> of the current operations object; rs2 defaults to sizeof(Ty).
450 | inline void
451 | save(vreg_no vs1, Ty* rs1, ptrdiff_t rs2 = sizeof(Ty), vop_type mode = vop_type::thread_all)
452 | {
453 |     static_cast<Saver<Ty>&>(V_unit::instance().get_op_performer())(V_unit::instance(), vs1, rs1, rs2, mode);
454 | }
455 | 
456 | template<typename Ty> // Indexed store of vs1: passes the index register vs2 to the Saver<Ty> instead of a stride.
457 | inline void
458 | save(vreg_no vs1, Ty* rs1, vreg_no vs2, vop_type mode = vop_type::thread_all)
459 | {
460 |     static_cast<Saver<Ty>&>(V_unit::instance().get_op_performer())(V_unit::instance(), vs1, rs1, vs2, mode);
461 | }
462 | 
463 | }  // namespace implementation
464 | 
465 | size_t // NOTE(review): presumably takes (requested element count, vtype value such as vtype() returns) and yields the granted count - confirm
466 | vsetvl(size_t, size_t);
467 | 
468 | size_t // Immediate form: the examples call it as vsetvli(n, vtypei(...)) and use the result as the per-pass element count.
469 | vsetvli(size_t, int16_t);
470 | 
471 | #define DEF_B_D(INNER_DEF_) INNER_DEF_(b,8) INNER_DEF_(h,16) INNER_DEF_(w,32) // Applies INNER_DEF_ to each (suffix, bit width) pair: b/8, h/16, w/32.
472 | 
473 | /// Load constant-stride instructions: generates vlsb_v, vlsh_v, vlsw_v and vlsbu_v, vlshu_v, vlswu_v(vd, rs1, rs2, mode)
474 | ///@{
475 | 
476 | #define DEF_LOAD_CONSTANT_STRIDE_(NAME,TYPE) \
477 |     inline void CONCAT_(NAME,_v)(vreg_no vd, TYPE const* rs1, ptrdiff_t rs2, vop_type mode = vop_type::thread_all) \
478 |     { \
479 |         using namespace implementation; \
480 |         load(vd, rs1, rs2, mode); \
481 |     }
482 | 
483 | /// Load as signed integer (sign extended); the memory type is intN_t
484 | ///@{
485 | 
486 | #define DEF_LOAD_INT_CONSTANT_STRIDE_(CHR,NUM) DEF_LOAD_CONSTANT_STRIDE_(CONCAT_(vls,CHR), CONCAT_(int,CONCAT_(NUM,_t)))
487 | DEF_B_D(DEF_LOAD_INT_CONSTANT_STRIDE_)
488 | #undef DEF_LOAD_INT_CONSTANT_STRIDE_
489 | ///@}
490 | 
491 | /// Load as unsigned integer (zero extended); the memory type is uintN_t
492 | ///@{
493 | #define DEF_LOAD_UNSIGNED_CONSTANT_STRIDE_(CHR,NUM) DEF_LOAD_CONSTANT_STRIDE_(CONCAT_(CONCAT_(vls,CHR),u), CONCAT_(CONCAT_(uint,NUM),_t))
494 | DEF_B_D(DEF_LOAD_UNSIGNED_CONSTANT_STRIDE_)
495 | #undef DEF_LOAD_UNSIGNED_CONSTANT_STRIDE_
496 | ///@}
497 | 
498 | #undef DEF_LOAD_CONSTANT_STRIDE_
499 | 
500 | ///@}
501 | 
502 | /// Load unit-stride instructions
503 | ///@{
504 | 
505 | #define DEF_LOAD_UNIT_STRIDE_(NAME,TYPE) \
506 |     inline void CONCAT_(NAME,_v)(vreg_no vd, TYPE const rs1[], vop_type mode = vop_type::thread_all) \
507 |     { \
508 |         using namespace implementation; \
509 |         load(vd, rs1, sizeof(TYPE), mode); \
510 |     }
511 | 
512 | /// Load as signed integer (sign extended)
513 | ///@{
514 | #define DEF_LOAD_INT_UNIT_STRIDE_(CHR,NUM) DEF_LOAD_UNIT_STRIDE_(CONCAT_(vl,CHR), CONCAT_(CONCAT_(int,NUM),_t))
515 | DEF_B_D(DEF_LOAD_INT_UNIT_STRIDE_)
516 | #undef DEF_LOAD_INT_UNIT_STRIDE_
517 | ///@}
518 | 
519 | /// Load as unsigned integer (zero extended)
520 | ///@{
521 | #define DEF_LOAD_UNSIGNED_UNIT_STRIDE_(CHR,NUM) DEF_LOAD_UNIT_STRIDE_(CONCAT_(CONCAT_(vl,CHR),u), CONCAT_(CONCAT_(uint,NUM),_t))
522 | DEF_B_D(DEF_LOAD_UNSIGNED_UNIT_STRIDE_)
523 | #undef DEF_LOAD_UNSIGNED_UNIT_STRIDE_
524 | ///@}
525 | 
526 | #undef DEF_LOAD_UNIT_STRIDE_
527 | ///@}
528 | 
529 | /// Load indexed (scatter-gather)
530 | ///@{
531 | #define DEF_LOAD_INDEXED_(NAME,TYPE) \
532 |     inline void CONCAT_(NAME,_v)(vreg_no vd, TYPE const* rs1, vreg_no vs1, vop_type mode = vop_type::thread_all) \
533 |     { \
534 |         using namespace implementation; \
535 |         load(vd, rs1, vs1, mode); \
536 |     }
537 | 
538 | /// Load as signed integer (sign extended)
539 | ///@{
540 | #define DEF_LOAD_INT_INDEXED_(CHR,NUM) DEF_LOAD_INDEXED_(CONCAT_(vlx,CHR), CONCAT_(CONCAT_(int,NUM),_t))
541 | DEF_B_D(DEF_LOAD_INT_INDEXED_)
542 | #undef DEF_LOAD_INT_INDEXED_
543 | ///@}
544 | 
545 | /// Load as unsigned integer (zero extended)
546 | ///@{
547 | #define DEF_LOAD_UNSIGNED_INDEXED_(CHR,NUM) DEF_LOAD_INDEXED_(CONCAT_(vlx,CONCAT_(CHR,u)), CONCAT_(CONCAT_(uint,NUM),_t))
548 | DEF_B_D(DEF_LOAD_UNSIGNED_INDEXED_)
549 | #undef DEF_LOAD_UNSIGNED_INDEXED_
550 | ///@}
551 | 
552 | ///@}
553 | 
554 | /// Constant-stride store instructions
555 | ///@{
556 | #define DEF_SAVE_CONSTANT_STRIDE_(NAME,TYPE) \
557 |     inline void CONCAT_(NAME,_v)(vreg_no vs1, TYPE* rs1, ptrdiff_t rs2, vop_type mode = vop_type::thread_all) \
558 |     { \
559 |         using namespace implementation; \
560 |         save(vs1, rs1, rs2, mode); \
561 |     }
562 | 
563 | #define DEF_SAVE_INT_CONSTANT_STRIDE_(CHR,NUM) DEF_SAVE_CONSTANT_STRIDE_(CONCAT_(vss,CHR), CONCAT_(CONCAT_(int,NUM),_t))
564 | DEF_B_D(DEF_SAVE_INT_CONSTANT_STRIDE_)
565 | #undef DEF_SAVE_INT_CONSTANT_STRIDE_
566 | #undef DEF_SAVE_CONSTANT_STRIDE_
567 | ///@}
568 | 
569 | /// Store unit-stride instructions
570 | ///@{
571 | #define DEF_SAVE_UNIT_STRIDE_(NAME,TYPE) \
572 |     inline void CONCAT_(NAME,_v)(vreg_no vs1, TYPE rs1[], vop_type mode = vop_type::thread_all) \
573 |     { \
574 |         using namespace implementation; \
575 |         save(vs1, rs1, sizeof(TYPE), mode); \
576 |     }
577 | 
578 | #define DEF_SAVE_INT_UNIT_STRIDE_(CHR,NUM) DEF_SAVE_UNIT_STRIDE_(CONCAT_(vs,CHR), CONCAT_(CONCAT_(int,NUM),_t))
579 | DEF_B_D(DEF_SAVE_INT_UNIT_STRIDE_)
580 | #undef DEF_SAVE_INT_UNIT_STRIDE_
581 | #undef DEF_SAVE_UNIT_STRIDE_
582 | ///@}
583 | 
584 | /// indexed-ordered store (scatter) instructions
585 | ///@{
586 | #define DEF_SAVE_INDEXED_(NAME,TYPE) \
587 |     inline void CONCAT_(NAME,_v)(vreg_no vs1, TYPE* rs1, vreg_no vs2, vop_type mode = vop_type::thread_all) \
588 |     { \
589 |         using namespace implementation; \
590 |         save(vs1, rs1, vs2, mode); \
591 |     }
592 | 
593 | #define DEF_SAVE_INT_INDEXED_(CHR,NUM) DEF_SAVE_INDEXED_(CONCAT_(vsx,CHR), CONCAT_(CONCAT_(int,NUM),_t))
594 | DEF_B_D(DEF_SAVE_INT_INDEXED_)
595 | #undef DEF_SAVE_INT_INDEXED_
596 | ///@}
597 | 
598 | #undef DEF_SAVE_INDEXED_
599 | 
600 | #undef DEF_B_D
601 | #undef DEF_B_W
602 | 
603 | #define DEF_BIN_OP_VV(NAM) \
604 |     inline void CONCAT_(NAM,_vv)(vreg_no vd, vreg_no vs2, vreg_no vs1, vop_type mode = vop_type::thread_all) \
605 |     { \
606 |         using namespace implementation; \
607 |         static_cast<Operations&>(V_unit::instance().get_op_performer()).CONCAT_(NAM,_vv)(vd, vs2, vs1, mode); \
608 |     }
609 | 
610 | #define DEF_BIN_OP_VX(NAM) \
611 |     inline void CONCAT_(NAM,_vx)(vreg_no vd, vreg_no vs2, xreg_type rs1, vop_type mode = vop_type::thread_all) \
612 |     { \
613 |         using namespace implementation; \
614 |         static_cast<Operations&>(V_unit::instance().get_op_performer()).CONCAT_(NAM,_vx)(vd, vs2, rs1, mode); \
615 |     }
616 | 
617 | #define DEF_BIN_OP_VI(NAM) \
618 |     inline void CONCAT_(NAM,_vi)(vreg_no vd, vreg_no vs2, int16_t imm, vop_type mode = vop_type::thread_all) \
619 |     { \
620 |         using namespace implementation; \
621 |         static_cast<Operations&>(V_unit::instance().get_op_performer()).CONCAT_(NAM,_vi)(vd, vs2, imm, mode); \
622 |     }
623 | 
624 | #define DEF_BIN_OP_VXI(nam) \
625 |     DEF_BIN_OP_VV(nam) \
626 |     DEF_BIN_OP_VX(nam) \
627 |     DEF_BIN_OP_VI(nam)
628 | 
629 | DEF_BIN_OP_VXI(vadd)
630 | 
631 | DEF_BIN_OP_VV(vsub)
632 | DEF_BIN_OP_VX(vsub)
633 | 
634 | // DEF_BIN_OP_VXI(vmsle)
635 | DEF_BIN_OP_VI(vmsle)
636 | 
637 | #define DEF_BIN_OP_MM(NAM) \
638 |     inline void CONCAT_(NAM,_mm)(vreg_no vd, vreg_no vs2, vreg_no vs1) \
639 |     { \
640 |         using namespace implementation; \
641 |         static_cast<Operations&>(V_unit::instance().get_op_performer()).CONCAT_(NAM,_mm)(vd, vs2, vs1); \
642 |     }
643 | 
644 | #define DEF_BIN_OP_M(NAM) \
645 |     inline void CONCAT_(NAM,_m)(vreg_no vd, vreg_no vs1, vop_type mode = vop_type::thread_all) \
646 |     { \
647 |         using namespace implementation; \
648 |         static_cast<Operations&>(V_unit::instance().get_op_performer()).CONCAT_(NAM,_m)(vd, vs1, mode); \
649 |     }
650 | 
651 | #define DEF_BIN_OP_M_NO_MODE(NAM) \
652 |     inline void CONCAT_(NAM,_m)(vreg_no vd, vreg_no vs1) \
653 |     { \
654 |         using namespace implementation; \
655 |         static_cast<Operations&>(V_unit::instance().get_op_performer()).CONCAT_(NAM,_m)(vd, vs1); \
656 |     }
657 | 
658 | DEF_BIN_OP_MM(vmand)
659 | DEF_BIN_OP_MM(vmnand)
660 | DEF_BIN_OP_M_NO_MODE(vmnot)
661 | 
662 | #undef DEF_BIN_OP_M_NO_MODE
663 | #undef DEF_BIN_OP_MM
664 | #undef DEF_BIN_OP_M
665 | 
666 | #if 0
667 | DEF_BIN_OP(vsll)
668 | DEF_BIN_OP(vsra)
669 | DEF_BIN_OP(vsrl)
670 | DEF_BIN_OP(vand)
671 | DEF_BIN_OP(vor)
672 | DEF_BIN_OP(vxor)
673 | 
674 | DEF_BIN_OP(vseq)
675 | DEF_BIN_OP(vslt)
676 | DEF_BIN_OP(vsge)
677 | DEF_BIN_OP(vsltu)
678 | DEF_BIN_OP(vsgeu)
679 | #endif
680 | 
/// Generates `<OP>(vd, vs1, imm, mode)`: register-immediate operation forwarded to the
/// Operations performer under the same name (no suffix is appended).
#define DEF_BIN_IMM_OP(OP) \
    inline void OP(vreg_no vd, vreg_no vs1, int16_t imm, vop_type mode = vop_type::thread_all) \
    { \
        using namespace implementation; \
        Operations& performer = static_cast<Operations&>(V_unit::instance().get_op_performer()); \
        performer.OP(vd, vs1, imm, mode); \
    }
687 | 
688 | #if 0
689 | DEF_BIN_IMM_OP(vaddi)
690 | DEF_BIN_IMM_OP(vslli)
691 | DEF_BIN_IMM_OP(vsrli)
692 | DEF_BIN_IMM_OP(vsrai)
693 | DEF_BIN_IMM_OP(vandi)
694 | DEF_BIN_IMM_OP(vori)
695 | DEF_BIN_IMM_OP(vxori)
696 | 
697 | /// 32-bit operations
698 | ///@{
699 | DEF_BIN_OP(vaddw)
700 | DEF_BIN_OP(vsubw)
701 | DEF_BIN_IMM_OP(vaddwi)
702 | ///@}
703 | 
704 | DEF_BIN_OP(vmul)
705 | DEF_BIN_OP(vmulh)
706 | DEF_BIN_OP(vmulhsu)
707 | DEF_BIN_OP(vmulhu)
708 | 
709 | DEF_BIN_OP(vdiv)
710 | DEF_BIN_OP(vdivu)
711 | DEF_BIN_OP(vrem)
712 | DEF_BIN_OP(vremu)
713 | #endif
714 | 
715 | inline void vmv_v_v(vreg_no vd, vreg_no vs1)
716 | {
717 |     using namespace implementation;
718 |     static_cast<Operations&>(V_unit::instance().get_op_performer()).vmv_v_v(vd, vs1);
719 | }
720 | 
721 | inline void vmv_v_x(vreg_no vd, xreg_type rs1)
722 | {
723 |     using namespace implementation;
724 |     static_cast<Operations&>(V_unit::instance().get_op_performer()).vmv_v_x(vd, rs1);
725 | }
726 | 
727 | inline void vmv_v_i(vreg_no vd, int16_t imm)
728 | {
729 |     using namespace implementation;
730 |     static_cast<Operations&>(V_unit::instance().get_op_performer()).vmv_v_i(vd, imm);
731 | }
732 | 
733 | #if 0
734 | template<vreg_no vd, vreg_no vs1, vreg_no vs2, vop_type mode = vop_type::thread_all> inline void vmulwdn();
735 | 
736 | /// Integer reduction operations
737 | ///@{
738 | template<vreg_no vd, vreg_no vs1, vop_type mode> inline void vredsum();
739 | 
740 | template<vreg_no vd, vreg_no vs1, vop_type mode> inline void vredmax();
741 | template<vreg_no vd, vreg_no vs1, vop_type mode> inline void vredmaxu();
742 | 
743 | template<vreg_no vd, vreg_no vs1, vop_type mode> inline void vredmin();
744 | template<vreg_no vd, vreg_no vs1, vop_type mode> inline void vredminu();
745 | ///@}
746 | #endif
747 | 
748 | // void vfmacc_vf(vreg_no vd, float rs1, vreg_no vs2, vop_type mode = vop_type::thread_all) = 0;
749 | 
750 | #define DEF_BIN_OP_VF(NAM) \
751 |     inline void CONCAT_(NAM,_vf)(vreg_no vd, float rs1, vreg_no vs2, vop_type mode = vop_type::thread_all) \
752 |     { \
753 |         using namespace implementation; \
754 |         static_cast<Float_operations&>(V_unit::instance().get_fop_performer()).CONCAT_(NAM,_vf)(vd, rs1, vs2, mode); \
755 |     }
756 | 
757 | DEF_BIN_OP_VF(vfmacc)
758 | 
759 | #if 0
760 | DEF_BIN_OP(vfadd_w)
761 | DEF_BIN_OP(vfadd_d)
762 | 
763 | DEF_BIN_OP(vfsub_w)
764 | DEF_BIN_OP(vfsub_d)
765 | 
766 | DEF_BIN_OP(vfmul_w)
767 | DEF_BIN_OP(vfmul_d)
768 | 
769 | DEF_BIN_OP(vfdiv_w)
770 | DEF_BIN_OP(vfdiv_d)
771 | 
772 | DEF_BIN_OP(vfsgnj_w)
773 | DEF_BIN_OP(vfsgnj_d)
774 | 
775 | DEF_BIN_OP(vfsgnjn_w)
776 | DEF_BIN_OP(vfsgnjn_d)
777 | 
778 | DEF_BIN_OP(vfsgnjx_w)
779 | DEF_BIN_OP(vfsgnjx_d)
780 | 
781 | DEF_BIN_OP(vfmin_w)
782 | DEF_BIN_OP(vfmin_d)
783 | 
784 | DEF_BIN_OP(vfmax_w)
785 | DEF_BIN_OP(vfmax_d)
786 | 
787 | DEF_BIN_OP(vfeq_w)
788 | DEF_BIN_OP(vfeq_d)
789 | 
790 | DEF_BIN_OP(vflt_w)
791 | DEF_BIN_OP(vflt_d)
792 | 
793 | DEF_BIN_OP(vfle_w)
794 | DEF_BIN_OP(vfle_d)
795 | #endif
796 | 
/// Generates the template `<OP>_v<vd, vs1, mode>()`, which forwards to the Operations
/// member named OP (without the `_v` suffix). The `mode` template argument is accepted
/// but not passed on.
#define DEF_UNARY_OP_V(OP) \
    template<vreg_no vd, vreg_no vs1, vop_type mode = vop_type::thread_all> inline void CONCAT_(OP, _v)() \
    { \
        using namespace implementation; \
        Operations& performer = static_cast<Operations&>(V_unit::instance().get_op_performer()); \
        performer.OP(vd, vs1); \
    }
803 | 
804 | #if 0
805 | DEF_UNARY_OP_V(vfsqrt)
806 | #endif
807 | 
808 | #if 0
809 | template<vreg_no vd, vreg_no vs1, vop_type mode = vop_type::thread_all> inline void vfclass_v();
810 | #endif
811 | 
812 | #if 0
813 | /// Floating-point reduction operations
814 | ///@{
815 | template<vreg_no vd, vreg_no vs1, vop_type mode> inline void vfredosum_v();
816 | template<vreg_no vd, vreg_no vs1, vop_type mode> inline void vfredsum_v();
817 | template<vreg_no vd, vreg_no vs1, vop_type mode> inline void vfredmax_v();
818 | template<vreg_no vd, vreg_no vs1, vop_type mode> inline void vfredmin_v();
819 | ///@}
820 | #endif
821 | 
/// Vector floating-point fused multiply-add
///@{

/// Generates `<OP>(vd, vs1, vs2, vs3, mode)`: three-source operation forwarded to the
/// Operations performer under the same name.
#define DEF_3_OP(OP) \
    inline void OP(vreg_no vd, vreg_no vs1, vreg_no vs2, vreg_no vs3, vop_type mode = vop_type::thread_all) \
    { \
        using namespace implementation; \
        Operations& performer = static_cast<Operations&>(V_unit::instance().get_op_performer()); \
        performer.OP(vd, vs1, vs2, vs3, mode); \
    }

#if 0
DEF_3_OP(vfmadd_vv)
DEF_3_OP(vfmadd_vf)

DEF_3_OP(vfmsub_vv)
DEF_3_OP(vfmsub_vf)
#endif

#undef DEF_3_OP
///@}
841 | 
842 | #if 0
843 | /// Convert integer to narrower integer
844 | ///@{
845 | //template<typename to, typename from, vop_type mode = vop_type::thread_all> void vcvt(vreg_no vd, vreg_no vs1);
846 | ///@}
847 | 
848 | /// Convert integer to float
849 | ///@{
850 | template<typename to, typename from, vop_type mode = vop_type::thread_all> void vfcvt(vreg_no vd, vreg_no vs1);
851 | ///@}
852 | 
853 | /// Move to/from floating-point (f) registers.
854 | ///@{
855 | template<typename to, typename from> void vfmv(vreg_no vd, from fs1);
856 | template<typename to> to vfmv(vreg_no vs1);
857 | ///@}
858 | #endif
859 | 
860 | #if 0
861 | template<vreg_no vd, vop_type mode = vop_type::thread_all> inline void vmiota()
862 | {
863 |     using namespace implementation;
864 |     static_cast<Operations<mode>&>(V_unit::instance()).vmiota(vd);
865 | }
866 | #endif
867 | 
868 | #if 0
869 | template<vreg_no vd, vop_type mode = vop_type::thread_all> inline void vinsx(int32_t value, size_t idx = 0)
870 | {
871 |     using namespace implementation;
872 |     static_cast<Operations<mode>&>(V_unit::instance()).vinsx(vd, value, idx);
873 | }
874 | #endif
875 | 
// Drop the operation-generator macros so they do not leak out of this header.
// The unary generator is named DEF_UNARY_OP_V; undefining the non-existent
// DEF_UNARY_OP left the real macro visible to every including file.
#undef DEF_UNARY_OP_V
#undef DEF_BIN_IMM_OP
#undef DEF_BIN_OP_VV
#undef DEF_BIN_OP_VX
#undef DEF_BIN_OP_VI
#undef DEF_BIN_OP_VXI
#undef DEF_BIN_OP_VF
883 | 
884 | }  // namespace spec_0_7
885 | }  // namespace v
886 | }  // namespace riscv
887 | 
888 | #undef PASTE_
889 | #undef CONCAT_
890 | 
891 | #endif  // RISCV_EXT_V_HPP_
892 | 


--------------------------------------------------------------------------------
/src/riscv32/v.cpp:
--------------------------------------------------------------------------------
   1 | /**
   2 |     @file v.cpp
   3 |     @copyright ©2019 Syntacore.
   4 |     @authors
   5 |         Grigory Okhotnikov <go@syntacore.com>
   6 |     @brief Vector extension simulator (v0.7)
   7 | */
   8 | 
   9 | #include "riscv/ext/v.hpp"
  10 | 
  11 | #include <memory>
  12 | #include <algorithm>
  13 | #include <cassert>
  14 | #include <array>
  15 | #include <vector>
  16 | #include <type_traits>
  17 | #include <functional>
  18 | #include <cmath>
  19 | #include <limits>
  20 | #include <climits>
  21 | 
  22 | #ifdef _MSC_VER
  23 | #pragma warning( disable : 4250)
  24 | #endif
  25 | 
  26 | namespace riscv {
  27 | namespace v {
  28 | namespace spec_0_7 {
  29 | namespace implementation {
  30 | namespace {
  31 | 
  32 | template<size_t N>
  33 | struct Size_traits;
  34 | 
  35 | template<>
  36 | struct Size_traits<1u>
  37 | {
  38 |     typedef int8_t int_type;
  39 |     typedef uint8_t uint_type;
  40 | };
  41 | 
  42 | template<>
  43 | struct Size_traits<2u>
  44 | {
  45 |     typedef int16_t int_type;
  46 |     typedef uint16_t uint_type;
  47 | };
  48 | 
  49 | template<>
  50 | struct Size_traits<4u>
  51 | {
  52 |     typedef int32_t int_type;
  53 |     typedef uint32_t uint_type;
  54 |     typedef float32_t float_type;
  55 | };
  56 | 
  57 | template<>
  58 | struct Size_traits<8u>
  59 | {
  60 |     typedef int64_t int_type;
  61 |     typedef uint64_t uint_type;
  62 |     typedef float64_t float_type;
  63 | };
  64 | 
  65 | template<typename Ty>
  66 | inline
  67 | typename std::enable_if<std::is_floating_point<Ty>::value, Ty>::type
  68 | fsgnj(Ty const &s1, Ty const &s2)
  69 | {
  70 |     typedef Size_traits<sizeof(Ty)> size_traits;
  71 |     typedef typename size_traits::uint_type uint_type;
  72 |     typedef typename size_traits::int_type int_type;
  73 |     static uint_type const mask1 = uint_type((std::numeric_limits<int_type>::max)());
  74 |     auto const p1 = reinterpret_cast<uint_type const &>(s1);
  75 |     auto const p2 = reinterpret_cast<uint_type const &>(s2);
  76 |     auto const res = (mask1 & p1) | (~mask1 & p2);
  77 |     auto const fres = reinterpret_cast<Ty const &>(res);
  78 |     return fres;
  79 | }
  80 | 
  81 | template<typename Ty>
  82 | inline
  83 | typename std::enable_if<std::is_floating_point<Ty>::value, Ty>::type
  84 | fsgnjn(Ty const &s1, Ty const &s2)
  85 | {
  86 |     typedef Size_traits<sizeof(Ty)> size_traits;
  87 |     typedef typename size_traits::uint_type uint_type;
  88 |     typedef typename size_traits::int_type int_type;
  89 |     static uint_type const mask1 = uint_type((std::numeric_limits<int_type>::max)());
  90 |     auto const p1 = reinterpret_cast<uint_type const &>(s1);
  91 |     auto const p2 = reinterpret_cast<uint_type const &>(s2);
  92 |     auto const res = (mask1 & p1) | (~mask1 & ~p2);
  93 |     auto const fres = reinterpret_cast<Ty const &>(res);
  94 |     return fres;
  95 | }
  96 | 
  97 | template<typename Ty>
  98 | inline
  99 | typename std::enable_if<std::is_floating_point<Ty>::value, Ty>::type
 100 | fsgnjx(Ty const &s1, Ty const &s2)
 101 | {
 102 |     typedef Size_traits<sizeof(Ty)> size_traits;
 103 |     typedef typename size_traits::uint_type uint_type;
 104 |     typedef typename size_traits::int_type int_type;
 105 |     static uint_type const mask1 = uint_type((std::numeric_limits<int_type>::max)());
 106 |     auto const p1 = reinterpret_cast<uint_type const &>(s1);
 107 |     auto const p2 = reinterpret_cast<uint_type const &>(s2);
 108 |     auto const res = (mask1 & p1) ^(~mask1 & p2);
 109 |     auto const fres = reinterpret_cast<Ty const &>(res);
 110 |     return fres;
 111 | }
 112 | 
 113 | template<typename Ty>
 114 | inline
 115 | typename std::enable_if<
 116 |     std::is_integral<Ty>::value &&
 117 |     std::is_signed<Ty>::value &&
 118 |     std::is_integral<typename Size_traits<2 * sizeof(Ty)>::int_type>::value,
 119 |     Ty>::type
 120 | mulh(Ty const &x, Ty const &y)
 121 | {
 122 |     typedef typename Size_traits<2 * sizeof(Ty)>::int_type dbl_type;
 123 |     return static_cast<Ty>((dbl_type(x) * dbl_type(y)) >> (CHAR_BIT * sizeof(Ty)));
 124 | }
 125 | 
 126 | template<typename Ty1, typename Ty2>
 127 | inline
 128 | typename std::enable_if<
 129 |     sizeof(Ty1) == sizeof(Ty2) &&
 130 |     std::is_integral<Ty1>::value &&
 131 |     std::is_signed<Ty1>::value &&
 132 |     std::is_integral<Ty2>::value &&
 133 |     std::is_unsigned<Ty2>::value &&
 134 |     std::is_integral<typename Size_traits<2 * sizeof(Ty1)>::int_type>::value,
 135 |     Ty1>::type
 136 | mulhsu(Ty1 const &x, Ty2 const &y)
 137 | {
 138 |     typedef typename Size_traits<2 * sizeof(Ty1)>::int_type idbl_type;
 139 |     typedef typename Size_traits<2 * sizeof(Ty2)>::uint_type udbl_type;
 140 |     return static_cast<Ty1>((idbl_type(x) * idbl_type(udbl_type(y))) >> (CHAR_BIT * sizeof(Ty1)));
 141 | }
 142 | 
 143 | template<typename Ty>
 144 | inline
 145 | typename std::enable_if<
 146 |     std::is_integral<Ty>::value &&
 147 |     std::is_unsigned<Ty>::value &&
 148 |     std::is_integral<typename Size_traits<2 * sizeof(Ty)>::uint_type>::value,
 149 |     Ty>::type
 150 | mulhu(Ty const &x, Ty const &y)
 151 | {
 152 |     typedef typename Size_traits<2 * sizeof(Ty)>::uint_type udbl_type;
 153 |     return static_cast<Ty>((udbl_type(x) * udbl_type(y)) >> (CHAR_BIT * sizeof(Ty)));
 154 | }
 155 | 
 156 | inline int64_t
 157 | mulh(int64_t const &, int64_t const &)
 158 | {
 159 |     throw Instruction_undefined_for_element_size(sizeof(int64_t));
 160 | }
 161 | 
 162 | inline int64_t
 163 | mulhsu(int64_t const &, int64_t const &)
 164 | {
 165 |     throw Instruction_undefined_for_element_size(sizeof(int64_t));
 166 | }
 167 | 
 168 | inline int64_t
 169 | mulhu(int64_t const &, int64_t const &)
 170 | {
 171 |     throw Instruction_undefined_for_element_size(sizeof(int64_t));
 172 | }
 173 | 
 174 | class Impl_base
 175 |     : virtual public V_unit
 176 | {
 177 | public:
 178 |     virtual size_t setvl(size_t vl) = 0;
 179 |     virtual size_t setvstart(size_t vstart) = 0;
 180 |     virtual void setill(bool ill) = 0;
 181 |     virtual void setew(size_t ew) = 0;
 182 |     virtual void setmul(size_t mul) = 0;
 183 | #if 0
 184 |     virtual void set_mask_reg(vreg_no) = 0;
 185 | #endif
 186 | };
 187 | }  // namespace
 188 | 
 189 | class State
 190 |     : virtual public Impl_base
 191 | {
 192 | public:
 193 |     virtual size_t vl()const = 0;
 194 |     virtual size_t vstart()const = 0;
 195 |     virtual size_t vlmax()const = 0;
 196 |     virtual char* elt_ptr(vreg_no _reg, size_t _ind) = 0;
 197 |     virtual char const* elt_ptr(vreg_no _reg, size_t _ind)const = 0;
 198 |     virtual size_t sew()const = 0;
 199 |     virtual size_t lmul()const = 0;
 200 |     virtual bool is_ill()const = 0;
 201 |     virtual Operations& get_op_performer()const = 0;
 202 |     virtual Float_operations& get_fop_performer()const = 0;
 203 |     virtual bool is_enabled(size_t i)const = 0;
 204 |     virtual bool get_mask(vreg_no _reg, size_t _ind)const = 0;
 205 |     virtual void set_mask(vreg_no _reg, size_t _ind, bool value) = 0;
 206 | protected:
 207 |     virtual bool is_valid_reg(vreg_no _reg)const = 0;
 208 |     virtual bool mask_bit(size_t i)const = 0;
 209 | };
 210 | 
 211 | namespace {
 212 | template<typename Element_type, typename Memory_type>
 213 | class Bad_load
 214 |     : public Loader<Memory_type>
 215 | {
 216 |     static_assert(sizeof(Element_type) < sizeof(Memory_type), "Bad type");
 217 | 
 218 |     void operator()(V_unit &st, vreg_no, Memory_type const *, ptrdiff_t, vop_type mode) final
 219 |     {
 220 |         throw Load_wider_value_to_narrowed_element(sizeof(Memory_type), sizeof(Element_type));
 221 |     }
 222 | 
 223 |     void operator()(V_unit &st, vreg_no, Memory_type const *, vreg_no, vop_type mode) final
 224 |     {
 225 |         throw Load_wider_value_to_narrowed_element(sizeof(Memory_type), sizeof(Element_type));
 226 |     }
 227 | };
 228 | 
 229 | template<typename Element_type, typename Memory_type>
 230 | class Good_load;
 231 | 
 232 | template<typename Element_type, typename Memory_type>
 233 | using Loader_impl =
 234 | typename std::conditional<
 235 |     (sizeof(Element_type) < sizeof(Memory_type)),
 236 |     Bad_load<Element_type, Memory_type>,
 237 |     Good_load<Element_type, Memory_type>
 238 | >::type;
 239 | 
 240 | template<typename Element_type>
 241 | class Operations_impl;
 242 | 
 243 | template<typename Element_type>
 244 | class Operations_essentials;
 245 | 
 246 | template<typename Element_type, typename Memory_type>
 247 | class Good_load
 248 |     : public Loader<Memory_type>
 249 | {
 250 |     static_assert(sizeof(Element_type) >= sizeof(Memory_type), "Bad type");
 251 | 
 252 |     using Loader<Memory_type>::to_element;
 253 | 
 254 |     void operator()(V_unit& vu, vreg_no vd, Memory_type const *rs1, ptrdiff_t rs2, vop_type mode) final
 255 |     {
 256 |         State& st = dynamic_cast<State&>(vu);
 257 |         auto p = reinterpret_cast<char const *>(rs1);
 258 |         auto const len = st.vl();
 259 | 
 260 |         for (size_t i = 0; i < len; ++i, p += rs2) {
 261 |             if (mode == vop_type::thread_all || mode == vop_type::masked_in && st.is_enabled(i)) {
 262 |                 Element_type *const addr = reinterpret_cast<Element_type *>(st.elt_ptr(vd, i));
 263 |                 *addr = static_cast<Element_type>(*reinterpret_cast<Memory_type const *>(p));
 264 |             }
 265 |         }
 266 |     }
 267 | 
 268 |     void operator()(V_unit& vu, vreg_no vd, Memory_type const *rs1, vreg_no idx, vop_type mode) final
 269 |     {
 270 |         State& st = dynamic_cast<State&>(vu);
 271 |         auto const p = reinterpret_cast<char const *>(rs1);
 272 |         auto const len = st.vl();
 273 | 
 274 |         for (size_t i = 0; i < len; ++i) {
 275 |             if (mode == vop_type::thread_all || mode == vop_type::masked_in && st.is_enabled(i)) {
 276 |                 Memory_type *const addr = reinterpret_cast<Memory_type *>(st.elt_ptr(vd, i));
 277 |                 size_t const stride = *reinterpret_cast<size_t *>(st.elt_ptr(idx, i));
 278 |                 *addr = to_element(*reinterpret_cast<Memory_type const *>(p + stride));
 279 |             }
 280 |         }
 281 |     }
 282 | };
 283 | 
 284 | template<typename Element_type, typename Memory_type>
 285 | class Saver_impl
 286 |     : protected Saver<Memory_type>
 287 | {
 288 |     void operator()(V_unit& vu, vreg_no vs1, Memory_type *rs1, ptrdiff_t rs2, vop_type mode) const final
 289 |     {
 290 |         State& st = dynamic_cast<State&>(vu);
 291 |         auto p = reinterpret_cast<char *>(rs1);
 292 |         auto const len = st.vl();
 293 | 
 294 |         for (size_t i = 0; i < len; ++i, p += rs2) {
 295 |             if (mode == vop_type::thread_all || mode == vop_type::masked_in && st.is_enabled(i)) {
 296 |                 Memory_type *const addr = reinterpret_cast<Memory_type *>(p);
 297 |                 *addr = *reinterpret_cast<Memory_type *>(st.elt_ptr(vs1, i));
 298 |             }
 299 |         }
 300 |     }
 301 | 
 302 |     void operator()(V_unit& vu, vreg_no vs1, Memory_type *rs1, vreg_no idx, vop_type mode) const final
 303 |     {
 304 |         State& st = dynamic_cast<State&>(vu);
 305 |         auto const p = reinterpret_cast<char *>(rs1);
 306 |         auto const len = st.vl();
 307 | 
 308 |         for (size_t i = 0; i < len; ++i) {
 309 |             if (mode == vop_type::thread_all || mode == vop_type::masked_in && st.is_enabled(i)) {
 310 |                 size_t const stride = *reinterpret_cast<size_t *>(st.elt_ptr(idx, i));
 311 |                 Memory_type *const addr = reinterpret_cast<Memory_type *>(p + stride);
 312 |                 *addr = *reinterpret_cast<Memory_type *>(st.elt_ptr(vs1, i));
 313 |             }
 314 |         }
 315 |     }
 316 | };
 317 | 
 318 | template<typename Element_type, typename Memory_type>
 319 | class Get_loader
 320 |     : virtual Operations
 321 |     , Loader_impl<Element_type, Memory_type>
 322 | {
 323 |     operator Loader<Memory_type> &() final
 324 |     {
 325 |         return static_cast<Loader_impl<Element_type, Memory_type> &>(*this);
 326 |     }
 327 | };
 328 | 
 329 | template<typename Element_type, typename Memory_type>
 330 | class Get_saver
 331 |     : virtual Operations
 332 |     , Saver_impl<Element_type, Memory_type>
 333 | {
 334 |     operator Saver<Memory_type> &() final
 335 |     {
 336 |         return static_cast<Saver_impl<Element_type, Memory_type> &>(*this);
 337 |     }
 338 | };
 339 | 
 340 | template<typename Element_type>
 341 | class Get_mem_IO
 342 |     : virtual Operations
 343 | 
 344 |     , Get_loader<Element_type, int8_t>
 345 |     , Get_loader<Element_type, int16_t>
 346 |     , Get_loader<Element_type, int32_t>
 347 |     , Get_loader<Element_type, int64_t>
 348 | 
 349 |     , Get_loader<Element_type, uint8_t>
 350 |     , Get_loader<Element_type, uint16_t>
 351 |     , Get_loader<Element_type, uint32_t>
 352 | 
 353 |     , Get_saver<Element_type, int8_t>
 354 |     , Get_saver<Element_type, int16_t>
 355 |     , Get_saver<Element_type, int32_t>
 356 |     , Get_saver<Element_type, int64_t>
 357 | {
 358 | };
 359 | 
 360 | template<typename Element_type>
 361 | class Non_scalar_operations_essentials
 362 | {
 363 | protected:
 364 |     template<typename Func>
 365 |     typename std::enable_if<std::is_assignable<std::function<Element_type()>, Func>::value, void>::type
 366 |     iterate(V_unit& vu, Func &&func, vreg_no vd)
 367 |     {
 368 |         State& st = dynamic_cast<State&>(vu);
 369 | 
 370 |         if (st.is_ill()) {
 371 |             throw State_not_configured();
 372 |         }
 373 | 
 374 |         auto const vstart = st.vstart();
 375 |         auto const vl = st.vl();
 376 | 
 377 |         for (size_t i = vstart; i < vl; ++i) {
 378 |             Element_type *const dest_i = reinterpret_cast<Element_type *>(st.elt_ptr(vd, i));
 379 |             *dest_i = func();
 380 |         }
 381 | 
 382 |         if (vl) {
 383 |             for (size_t i = vl; i < st.vlmax(); ++i) {
 384 |                 Element_type *const dest_i = reinterpret_cast<Element_type *>(st.elt_ptr(vd, i));
 385 |                 *dest_i = 0;
 386 |             }
 387 |         }
 388 |     }
 389 | 
 390 |     template<typename Func>
 391 |     typename std::enable_if<std::is_assignable<std::function<Element_type(
 392 |         Element_type const &)>, Func>::value, void>::type
 393 |     iterate(V_unit& vu, Func &&func, vreg_no vd, vreg_no vs1, vop_type mode)
 394 |     {
 395 |         State& st = dynamic_cast<State&>(vu);
 396 | 
 397 |         if (st.is_ill()) {
 398 |             throw State_not_configured();
 399 |         }
 400 | 
 401 |         auto const vstart = st.vstart();
 402 |         auto const vl = st.vl();
 403 | 
 404 |         for (size_t i = vstart; i < vl; ++i) {
 405 |             Element_type *const dest_i = reinterpret_cast<Element_type *>(st.elt_ptr(vd, i));
 406 |             if (mode == vop_type::thread_all || mode == vop_type::masked_in && st.is_enabled(i)) {
 407 |                 Element_type const src1_i = *reinterpret_cast<Element_type const *>(st.elt_ptr(vs1, i));
 408 |                 *dest_i = func(src1_i);
 409 |             }
 410 |         }
 411 | 
 412 |         if (vl) {
 413 |             for (size_t i = vl; i < st.vlmax(); ++i) {
 414 |                 Element_type *const dest_i = reinterpret_cast<Element_type *>(st.elt_ptr(vd, i));
 415 |                 *dest_i = 0;
 416 |             }
 417 |         }
 418 |     }
 419 | 
 420 |     template<typename Func>
 421 |     typename std::enable_if<std::is_assignable<std::function<Element_type(Element_type const &,
 422 |                                                                           Element_type const &)>, Func>::value, void>::type
 423 |     iterate(V_unit& vu, Func &&func, vreg_no vd, vreg_no vs1, vreg_no vs2, vop_type mode)
 424 |     {
 425 |         State& st = dynamic_cast<State&>(vu);
 426 | 
 427 |         if (st.is_ill()) {
 428 |             throw State_not_configured();
 429 |         }
 430 | 
 431 |         auto const vstart = st.vstart();
 432 |         auto vl = st.vl();
 433 | 
 434 |         for (size_t i = vstart; i < vl; ++i) {
 435 |             Element_type *const dest_i = reinterpret_cast<Element_type *>(st.elt_ptr(vd, i));
 436 |             if (mode == vop_type::thread_all || mode == vop_type::masked_in && st.is_enabled(i)) {
 437 |                 Element_type const src1_i = *reinterpret_cast<Element_type const *>(st.elt_ptr(vs1, i));
 438 |                 Element_type const src2_i = *reinterpret_cast<Element_type const *>(st.elt_ptr(vs2, i));
 439 |                 *dest_i = func(src1_i, src2_i);
 440 |             }
 441 |         }
 442 | 
 443 |         if (vl) {
 444 |             for (size_t i = vl; i < st.vlmax(); ++i) {
 445 |                 Element_type *const dest_i = reinterpret_cast<Element_type *>(st.elt_ptr(vd, i));
 446 |                 *dest_i = 0;
 447 |             }
 448 |         }
 449 |     }
 450 | 
 451 |     template<typename Func>
 452 |     typename std::enable_if<std::is_assignable<std::function<Element_type(Element_type const &, Element_type const &,
 453 |                                                                           Element_type const &)>, Func>::value, void>::type
 454 |     iterate(V_unit& vu, Func &&func, vreg_no vd, vreg_no vs1, vreg_no vs2, vreg_no vs3, vop_type mode)
 455 |     {
 456 |         // Three-source element-wise pass: vd[i] = func(vs1[i], vs2[i], vs3[i]) for i in [vstart, vl).
 457 |         State& state = dynamic_cast<State&>(vu);
 458 |         if (state.is_ill()) {
 459 |             throw State_not_configured();
 460 |         }
 461 | 
 462 |         auto const first = state.vstart();
 463 |         auto const count = state.vl();
 464 | 
 465 |         for (size_t idx = first; idx < count; ++idx) {
 466 |             Element_type *const out = reinterpret_cast<Element_type *>(state.elt_ptr(vd, idx));
 467 |             bool const all_lanes = (mode == vop_type::thread_all);
 468 |             if (!all_lanes && !(mode == vop_type::masked_in && state.is_enabled(idx))) {
 469 |                 continue;  // lane is switched off: the destination element keeps its value
 470 |             }
 471 |             Element_type const a = *reinterpret_cast<Element_type const *>(state.elt_ptr(vs1, idx));
 472 |             Element_type const b = *reinterpret_cast<Element_type const *>(state.elt_ptr(vs2, idx));
 473 |             Element_type const c = *reinterpret_cast<Element_type const *>(state.elt_ptr(vs3, idx));
 474 |             *out = func(a, b, c);
 475 |         }
 476 | 
 477 |         // Elements from vl up to vlmax() are cleared, but only when vl is nonzero.
 478 |         for (size_t idx = count; count && idx < state.vlmax(); ++idx) {
 479 |             *reinterpret_cast<Element_type *>(state.elt_ptr(vd, idx)) = 0;
 480 |         }
 481 |     }
 482 | 
 483 |     template<typename Func>  // Unary predicate pass: mask bit i of vd = func(vs1[i]) for i in [vstart, vl).
 484 |     typename std::enable_if<std::is_assignable<std::function<Element_type(Element_type const &)>, Func>::value, void>::type
 485 |     iterate_vm(V_unit& vu, Func &&func, vreg_no vd, vreg_no vs1, vop_type mode)
 486 |     {
 487 |         State& st = dynamic_cast<State&>(vu);
 488 |         // Throws State_not_configured when the state reports an illegal configuration.
 489 |         if (st.is_ill()) {
 490 |             throw State_not_configured();
 491 |         }
 492 | 
 493 |         auto const vstart = st.vstart();
 494 |         auto const vl = st.vl();
 495 |         // Elements that are not enabled under the requested mode keep their current mask bit.
 496 |         for (size_t i = vstart; i < vl; ++i) {
 497 |             if (mode == vop_type::thread_all || (mode == vop_type::masked_in && st.is_enabled(i))) {
 498 |                 Element_type const src1_i = *reinterpret_cast<Element_type const *>(st.elt_ptr(vs1, i));
 499 |                 bool result = func(src1_i);
 500 |                 st.set_mask(vd, i, result);  // store into the requested destination, not unconditionally into v0
 501 |             }
 502 |         }
 503 |         // Mask bits from vl up to vlmax() are cleared in vd, but only when vl is nonzero.
 504 |         if (vl) {
 505 |             for (size_t i = vl; i < st.vlmax(); ++i) {
 506 |                 st.set_mask(vd, i, false);
 507 |             }
 508 |         }
 509 |     }
 510 | 
 511 |     template<typename Func>  // Binary predicate pass: mask bit i of vd = func(vs1[i], vs2[i]) for i in [vstart, vl).
 512 |     typename std::enable_if<std::is_assignable<std::function<Element_type(Element_type const &,
 513 |                                                                           Element_type const &)>, Func>::value, void>::type
 514 |     iterate_vm(V_unit& vu, Func &&func, vreg_no vd, vreg_no vs1, vreg_no vs2, vop_type mode)
 515 |     {
 516 |         State& st = dynamic_cast<State&>(vu);
 517 |         // Throws State_not_configured when the state reports an illegal configuration.
 518 |         if (st.is_ill()) {
 519 |             throw State_not_configured();
 520 |         }
 521 | 
 522 |         auto const vstart = st.vstart();
 523 |         auto const vl = st.vl();
 524 |         // Elements that are not enabled under the requested mode keep their current mask bit.
 525 |         for (size_t i = vstart; i < vl; ++i) {
 526 |             if (mode == vop_type::thread_all || (mode == vop_type::masked_in && st.is_enabled(i))) {
 527 |                 Element_type const src1_i = *reinterpret_cast<Element_type const *>(st.elt_ptr(vs1, i));
 528 |                 Element_type const src2_i = *reinterpret_cast<Element_type const *>(st.elt_ptr(vs2, i));
 529 |                 bool result = func(src1_i, src2_i);
 530 |                 st.set_mask(vd, i, result);  // store into the requested destination, not unconditionally into v0
 531 |             }
 532 |         }
 533 |         // Mask bits from vl up to vlmax() are cleared in vd, but only when vl is nonzero.
 534 |         if (vl) {
 535 |             for (size_t i = vl; i < st.vlmax(); ++i) {
 536 |                 st.set_mask(vd, i, false);
 537 |             }
 538 |         }
 539 |     }
 540 | 
 541 |     template<typename Func>  // Mask-to-mask pass: mask bit i of vd = func(mask bit i of vs1, mask bit i of vs2).
 542 |     typename std::enable_if<std::is_assignable<std::function<Element_type(Element_type const &,
 543 |                                                                           Element_type const &)>, Func>::value, void>::type
 544 |     iterate_mm(V_unit& vu, Func &&func, vreg_no vd, vreg_no vs1, vreg_no vs2)
 545 |     {
 546 |         State& st = dynamic_cast<State&>(vu);
 547 |         // Throws State_not_configured when the state reports an illegal configuration.
 548 |         if (st.is_ill()) {
 549 |             throw State_not_configured();
 550 |         }
 551 | 
 552 |         auto const vstart = st.vstart();
 553 |         auto const vl = st.vl();
 554 |         // There is no mode parameter: every index in [vstart, vl) is processed.
 555 |         for (size_t i = vstart; i < vl; ++i) {
 556 |             bool src1_i = st.get_mask(vs1, i);
 557 |             bool src2_i = st.get_mask(vs2, i);
 558 |             bool result = func(src1_i, src2_i);
 559 |             st.set_mask(vd, i, result);  // store into the requested destination, not unconditionally into v0
 560 |         }
 561 |         // Mask bits from vl up to vlmax() are cleared in vd, but only when vl is nonzero.
 562 |         if (vl) {
 563 |             for (size_t i = vl; i < st.vlmax(); ++i) {
 564 |                 st.set_mask(vd, i, false);
 565 |             }
 566 |         }
 567 |     }
 568 | };
 569 | 
 570 | 
 571 | template<typename Element_type>  // Base bundle for integer operation classes of one element type.
 572 | class Operations_essentials
 573 |     : virtual protected Operations  // virtual base: the Operations interface
 574 |     , protected Non_scalar_operations_essentials<Element_type>  // supplies the iterate* helpers
 575 | {  // adds no members of its own
 576 | };
 577 | 
 578 | template<typename Element_type>  // Base bundle for floating-point operation classes of one element type.
 579 | class Float_operations_essentials
 580 |     : virtual protected Float_operations  // virtual base: the Float_operations interface
 581 |     , protected Non_scalar_operations_essentials<Element_type>  // supplies the iterate* helpers
 582 | {  // adds no members of its own
 583 | };
 584 | 
 585 | template<typename Element_type>
 586 | class Operations_impl
 587 |     : virtual public Operations
 588 |     , Get_mem_IO<Element_type>
 589 |     , Operations_essentials<Element_type>
 590 | {
 591 | private:
 592 |     virtual operator Operations &() final  // Conversion to the Operations interface; yields this same object.
 593 |     {
 594 |         return *this;
 595 |     }
 596 | 
 597 | #if 0
 598 |     template<typename RTy, typename Ty, typename Func>
 599 |     static
 600 |     typename std::enable_if<
 601 |         (sizeof(Element_type) < sizeof(Ty) || sizeof(Element_type) < sizeof(Ty))
 602 |         && std::is_assignable<std::function<RTy(Ty const &)>, Func>::value,
 603 |         std::function<Element_type(Element_type const &)>
 604 |     >::type
 605 |     adapter1(Func &&)
 606 |     {
 607 |         throw Load_wider_value_to_narrowed_element((std::max)(sizeof(Ty), sizeof(RTy)), sizeof(Element_type));
 608 |     }
 609 | 
 610 |     template<typename RTy, typename Ty, typename Func,
 611 |         typename = typename std::enable_if_t<
 612 |             (sizeof(Element_type) >= sizeof(Ty) && sizeof(Element_type) >= sizeof(RTy))
 613 |             && std::is_assignable<std::function<RTy(Ty const &)>, Func>::value>
 614 |     >
 615 |     static auto adapter1(Func &&func)
 616 |     {
 617 |         return
 618 |             [&func](Element_type const &x)->Element_type {
 619 |                 return
 620 |                     to_element(func(
 621 |                         reinterpret_cast<Ty const &>(x)
 622 |                     ));
 623 |             };
 624 |     }
 625 | 
 626 |     template<typename RTy, typename Ty, typename Func>
 627 |     static
 628 |     typename std::enable_if<
 629 |         (sizeof(Element_type) < sizeof(Ty) || sizeof(Element_type) < sizeof(RTy))
 630 |         && std::is_assignable<std::function<RTy(Ty const &, Ty const &)>, Func>::value,
 631 |         std::function<Element_type(Element_type const &, Element_type const &)>
 632 |     >::type
 633 |     adapter2(Func &&)
 634 |     {
 635 |         throw Load_wider_value_to_narrowed_element((std::max)(sizeof(Ty), sizeof(RTy)), sizeof(Element_type));
 636 |     }
 637 | 
 638 |     template<typename RTy, typename Ty, typename Func,
 639 |         typename = typename std::enable_if_t<
 640 |             (sizeof(Element_type) >= sizeof(Ty) && sizeof(Element_type) >= sizeof(RTy))
 641 |             && std::is_assignable<std::function<RTy(Ty const &, Ty const &)>, Func>::value>
 642 |     >
 643 |     static
 644 |     auto
 645 |     adapter2(Func &&func)
 646 |     {
 647 |         return
 648 |             [&func](Element_type const &x, Element_type const &y)->Element_type {
 649 |                 return
 650 |                     to_element(func(
 651 |                         reinterpret_cast<Ty const &>(x),
 652 |                         reinterpret_cast<Ty const &>(y));
 653 |             };
 654 |     }
 655 | 
 656 |     template<typename RTy, typename Ty, typename Func>
 657 |     static
 658 |     typename std::enable_if<
 659 |         (sizeof(Element_type) < sizeof(Ty) || sizeof(Element_type) < sizeof(RTy))
 660 |         && std::is_assignable<std::function<RTy(Ty const &, Ty const &, Ty const &)>, Func>::value,
 661 |         std::function<Element_type(Element_type const &, Element_type const &, Element_type const &)>
 662 |     >::type
 663 |     adapter3(Func &&)
 664 |     {
 665 |         throw Load_wider_value_to_narrowed_element((std::max)(sizeof(Ty), sizeof(RTy)), sizeof(Element_type));
 666 |     }
 667 | 
 668 |     template<typename RTy, typename Ty, typename Func,
 669 |         typename = typename std::enable_if_t<
 670 |             (sizeof(Element_type) >= sizeof(Ty) && sizeof(Element_type) >= sizeof(RTy))
 671 |             && std::is_assignable<std::function<RTy(Ty const &, Ty const &, Ty const &)>, Func>::value>
 672 |     >
 673 |     static
 674 |     auto
 675 |     adapter3(Func &&func)
 676 |     {
 677 |         return
 678 |             [&func](Element_type const &x, Element_type const &y, Element_type const &z)->Element_type {
 679 |                 return to_element(func(
 680 |                     reinterpret_cast<Ty const &>(x),
 681 |                     reinterpret_cast<Ty const &>(y),
 682 |                     reinterpret_cast<Ty const &>(z)));
 683 |             };
 684 |     }
 685 | #endif
 686 | 
 687 |     static Element_type sll(Element_type const &x, Element_type const &y)  // Logical left shift of x; only the low log2(element width) bits of y are used.
 688 |     {
 689 |         return static_cast<Element_type>(static_cast<typename std::make_unsigned<Element_type>::type>(x) << (y & Element_type(sizeof(Element_type) * 8 - 1)));  // masked count + unsigned operand: no undefined shift
 690 |     }
 691 | 
 692 |     static Element_type sra(Element_type const &x, Element_type const &y)  // Right shift of the signed x; only the low log2(element width) bits of y are used.
 693 |     {
 694 |         return x >> (y & Element_type(sizeof(Element_type) * 8 - 1));  // masking keeps the count inside [0, width)
 695 |     }
 696 | 
 697 |     static Element_type srl(Element_type const &x, Element_type const &y)  // Logical right shift: x is reinterpreted as unsigned; only the low log2(element width) bits of y are used.
 698 |     {
 699 |         return to_element(static_cast<typename std::make_unsigned<Element_type>::type>(x) >> (y & Element_type(sizeof(Element_type) * 8 - 1)));  // masking keeps the count inside [0, width)
 700 |     }
 701 | 
 702 |     void  // vadd.vv: element-wise sum of vs1 and vs2 through the binary iterate helper.
 703 |     vadd_vv(vreg_no vd, vreg_no vs2, vreg_no vs1, vop_type mode = vop_type::thread_all) final
 704 |     {
 705 |         this->iterate(V_unit::instance(), [](Element_type const &a, Element_type const &b) -> Element_type { return a + b; }, vd, vs1, vs2, mode);
 706 |     }
 707 | 
 708 |     void  // vadd.vx: adds the scalar rs1 (converted to Element_type) to each element of vs2.
 709 |     vadd_vx(vreg_no vd, vreg_no vs2, xreg_type rs1, vop_type mode = vop_type::thread_all) final
 710 |     {
 711 |         Element_type const addend = Element_type(rs1);
 712 |         this->iterate(V_unit::instance(), [addend](Element_type const &x) -> Element_type { return x + addend; }, vd, vs2, mode);
 713 |     }
 714 | 
 715 |     void  // vadd.vi: adds the immediate imm (converted to Element_type) to each element of vs2.
 716 |     vadd_vi(vreg_no vd, vreg_no vs2, int16_t imm, vop_type mode = vop_type::thread_all) final
 717 |     {
 718 |         Element_type const addend = Element_type(imm);
 719 |         this->iterate(V_unit::instance(), [addend](Element_type const &x) -> Element_type { return x + addend; }, vd, vs2, mode);
 720 |     }
 721 | 
 722 |     void  // vsub.vv: element-wise vs2[i] - vs1[i], the same operand order as vsub_vx (vs2 is the minuend).
 723 |     vsub_vv(vreg_no vd, vreg_no vs2, vreg_no vs1, vop_type mode = vop_type::thread_all) final
 724 |     {
 725 |         this->iterate(V_unit::instance(), std::minus<Element_type>(), vd, vs2, vs1, mode);  // first source is the left operand of minus
 726 |     }
 727 | 
 728 |     void  // vsub.vx: subtracts the scalar rs1 (converted to Element_type) from each element of vs2.
 729 |     vsub_vx(vreg_no vd, vreg_no vs2, xreg_type rs1, vop_type mode = vop_type::thread_all) final
 730 |     {
 731 |         Element_type const subtrahend = Element_type(rs1);
 732 |         this->iterate(V_unit::instance(), [subtrahend](Element_type const &x) -> Element_type { return x - subtrahend; }, vd, vs2, mode);
 733 |     }
 734 | 
 735 |     void  // vmsle.vv: mask bit i = (vs2[i] <= vs1[i]), the same operand order as vmsle_vx / vmsle_vi.
 736 |     vmsle_vv(vreg_no vd, vreg_no vs2, vreg_no vs1, vop_type mode = vop_type::thread_all) final
 737 |     {
 738 |         this->iterate_vm(V_unit::instance(),
 739 |                          std::less_equal<Element_type>(),
 740 |                          vd,
 741 |                          vs2,  // left-hand side of <=
 742 |                          vs1,  // right-hand side of <=
 743 |                          mode);
 744 |     }
 745 | 
 746 |     void  // vmsle.vx: tests vs2[i] <= rs1, with rs1 converted to Element_type.
 747 |     vmsle_vx(vreg_no vd, vreg_no vs2, xreg_type rs1, vop_type mode = vop_type::thread_all) final
 748 |     {
 749 |         Element_type const bound = Element_type(rs1);
 750 |         auto not_above = [bound](Element_type const &x) -> bool {
 751 |             return x <= bound;
 752 |         };
 753 |         // iterate_vm records each comparison result as a mask bit.
 754 |         this->iterate_vm(V_unit::instance(), not_above, vd, vs2, mode);
 755 |     }
 756 | 
 757 |     void  // vmsle.vi: tests vs2[i] <= imm, with imm converted to Element_type.
 758 |     vmsle_vi(vreg_no vd, vreg_no vs2, int16_t imm, vop_type mode = vop_type::thread_all) final
 759 |     {
 760 |         Element_type const bound = Element_type(imm);
 761 |         auto not_above = [bound](Element_type const &x) -> bool {
 762 |             return x <= bound;
 763 |         };
 764 |         // iterate_vm records each comparison result as a mask bit.
 765 |         this->iterate_vm(V_unit::instance(), not_above, vd, vs2, mode);
 766 |     }
 767 | 
 768 |     void  // vmand.mm: logical AND of the mask bits of vs1 and vs2 through iterate_mm.
 769 |     vmand_mm(vreg_no vd, vreg_no vs2, vreg_no vs1) final
 770 |     {
 771 |         this->iterate_mm(V_unit::instance(), [](bool const& a, bool const& b)->bool { return a && b; }, vd, vs1, vs2);
 772 |     }
 773 | 
 774 |     void  // vmnand.mm: NOT-AND of the mask bits of vs1 and vs2 through iterate_mm.
 775 |     vmnand_mm(vreg_no vd, vreg_no vs2, vreg_no vs1) final
 776 |     {
 777 |         auto nand = [](bool const& lhs, bool const& rhs)->bool {
 778 |             return !lhs || !rhs;  // De Morgan form of !(lhs && rhs)
 779 |         };
 780 |         this->iterate_mm(V_unit::instance(), nand, vd, vs1, vs2);
 781 |     }
 782 | 
 783 |     void  // vmnot.m: delegates to vmnand_mm with vs1 as both sources, since !(x && x) equals !x.
 784 |     vmnot_m(vreg_no vd, vreg_no vs1) final
 785 |     {
 786 |         vmnand_mm(vd, vs1, vs1);
 787 |     }
 788 | 
 789 |     void  // vmv.v.v: copies vs1 element by element through the unary iterate helper.
 790 |     vmv_v_v(vreg_no vd, vreg_no vs1) final
 791 |     {
 792 |         // The functor is the identity; the call always runs in thread_all mode.
 793 |         this->iterate(V_unit::instance(),
 794 |                       [](Element_type const& value)->Element_type { return value; },
 795 |                       vd, vs1, vop_type::thread_all);
 796 |     }
 797 | 
 798 |     void  // vmv.v.x: hands iterate a functor that yields rs1 converted to Element_type.
 799 |     vmv_v_x(vreg_no vd, xreg_type rs1) final
 800 |     {
 801 |         Element_type const fill = Element_type(rs1);
 802 |         auto splat = [fill]()->Element_type { return fill; };
 803 |         // NOTE(review): presumably the nullary iterate overload stores this value in every element of vd — overload not visible here.
 804 |         this->iterate(V_unit::instance(), splat, vd);
 805 |     }
 806 | 
 807 |     void  // vmv.v.i: hands iterate a functor that yields imm converted to Element_type.
 808 |     vmv_v_i(vreg_no vd, int16_t imm) final
 809 |     {
 810 |         Element_type const fill = Element_type(imm);
 811 |         auto splat = [fill]()->Element_type { return fill; };
 812 |         // NOTE(review): presumably the nullary iterate overload stores this value in every element of vd — overload not visible here.
 813 |         this->iterate(V_unit::instance(), splat, vd);
 814 |     }
 815 | 
 816 | #if 0
 817 |     void vsll(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 818 |     {
 819 |         this->iterate(sll, vd, vs1, vs2);
 820 |     }
 821 | 
 822 |     void vsra(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 823 |     {
 824 |         this->iterate(sra, vd, vs1, vs2);
 825 |     }
 826 | 
 827 |     void vsrl(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 828 |     {
 829 |         this->iterate(srl, vd, vs1, vs2);
 830 |     }
 831 | 
 832 |     void vand(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 833 |     {
 834 |         this->iterate(std::bit_and<Element_type>(), vd, vs1, vs2);
 835 |     }
 836 | 
 837 |     void vor(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 838 |     {
 839 |         this->iterate(std::bit_or<Element_type>(), vd, vs1, vs2);
 840 |     }
 841 | 
 842 |     void vxor(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 843 |     {
 844 |         this->iterate(std::bit_xor<Element_type>(), vd, vs1, vs2);
 845 |     }
 846 | 
 847 |     void vmul(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 848 |     {
 849 |         this->iterate(std::multiplies<Element_type>(), vd, vs1, vs2);
 850 |     }
 851 | 
 852 |     void vmulh(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 853 |     {
 854 |         auto op = [](Element_type const& x, Element_type const& y)->Element_type {
 855 |             return to_element(mulh(x, y));
 856 |         };
 857 |         this->iterate(op, vd, vs1, vs2);
 858 |     }
 859 | 
 860 |     void vmulhsu(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 861 |     {
 862 |         auto op = [](Element_type const& x, Element_type const& y)->Element_type {
 863 |             return to_element(mulhsu(x, y));
 864 |         };
 865 |         this->iterate(op, vd, vs1, vs2);
 866 |     }
 867 | 
 868 |     void vmulhu(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 869 |     {
 870 |         auto op = [](Element_type const& x, Element_type const& y)->Element_type {
 871 |             return to_element(mulhu(x, y));
 872 |         };
 873 |         this->iterate(op, vd, vs1, vs2);
 874 |     }
 875 | 
 876 |     void vdiv(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 877 |     {
 878 |         this->iterate(std::divides<Element_type>(), vd, vs1, vs2);
 879 |     }
 880 | 
 881 |     void vdivu(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 882 |     {
 883 |         static auto const op = [](Element_type x, Element_type y)->Element_type {
 884 |             typedef typename std::make_unsigned<Element_type>::type uns_type;
 885 |             return static_cast<uns_type>(x) / static_cast<uns_type>(y);
 886 |         };
 887 |         this->iterate(op, vd, vs1, vs2);
 888 |     }
 889 | 
 890 |     void vrem(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 891 |     {
 892 |         this->iterate(std::modulus<Element_type>(), vd, vs1, vs2);
 893 |     }
 894 | 
 895 |     void vremu(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 896 |     {
 897 |         static auto const op = [](Element_type x, Element_type y)->Element_type {
 898 |             typedef typename std::make_unsigned<Element_type>::type uns_type;
 899 |             return static_cast<uns_type>(x) % static_cast<uns_type>(y);
 900 |         };
 901 |         this->iterate(op, vd, vs1, vs2);
 902 |     }
 903 | 
 904 |     void vseq(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 905 |     {
 906 |         this->iterate(adapter2<bool, Element_type>(std::equal_to<Element_type>()), vd, vs1, vs2);
 907 |     }
 908 | 
 909 |     void vslt(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 910 |     {
 911 |         this->iterate(adapter2<bool, Element_type>(std::less<Element_type>()), vd, vs1, vs2);
 912 |     }
 913 | 
 914 |     void vsge(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 915 |     {
 916 |         this->iterate(adapter2<bool, Element_type>(std::greater_equal<Element_type>()), vd, vs1, vs2);
 917 |     }
 918 | 
 919 |     void vsltu(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 920 |     {
 921 |         static auto const op = [](Element_type x, Element_type y)->bool {
 922 |             typedef typename std::make_unsigned<Element_type>::type uns_type;
 923 |             return std::less<uns_type>()(static_cast<uns_type>(x), static_cast<uns_type>(y));
 924 |         };
 925 |         this->iterate(adapter2<bool, Element_type>(op), vd, vs1, vs2);
 926 |     }
 927 | 
 928 |     void vsgeu(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 929 |     {
 930 |         static auto const op = [](Element_type x, Element_type y)->bool {
 931 |             typedef typename std::make_unsigned<Element_type>::type uns_type;
 932 |             return std::greater<uns_type>()(static_cast<uns_type>(x), static_cast<uns_type>(y));
 933 |         };
 934 |         this->iterate(adapter2<bool, Element_type>(op), vd, vs1, vs2);
 935 |     }
 936 | 
 937 |     void vslli(vreg_no vd, vreg_no vs1, int16_t imm) final
 938 |     {
 939 |         using namespace std::placeholders;
 940 |         this->iterate(std::bind(sll, _1, Element_type(imm)), vd, vs1);
 941 |     }
 942 | 
 943 |     void vsrli(vreg_no vd, vreg_no vs1, int16_t imm) final
 944 |     {
 945 |         using namespace std::placeholders;
 946 |         this->iterate(std::bind(srl, _1, Element_type(imm)), vd, vs1);
 947 |     }
 948 | 
 949 |     void vsrai(vreg_no vd, vreg_no vs1, int16_t imm) final
 950 |     {
 951 |         using namespace std::placeholders;
 952 |         this->iterate(std::bind(sra, _1, Element_type(imm)), vd, vs1);
 953 |     }
 954 | 
 955 |     void vandi(vreg_no vd, vreg_no vs1, int16_t imm) final
 956 |     {
 957 |         using namespace std::placeholders;
 958 |         this->iterate(std::bind(std::bit_and<Element_type>(), _1, Element_type(imm)),
 959 |                       vd, vs1);
 960 |     }
 961 | 
 962 |     void vori(vreg_no vd, vreg_no vs1, int16_t imm) final
 963 |     {
 964 |         using namespace std::placeholders;
 965 |         this->iterate(std::bind(std::bit_or<Element_type >(), _1, Element_type(imm)),
 966 |                       vd, vs1);
 967 |     }
 968 | 
 969 |     void vxori(vreg_no vd, vreg_no vs1, int16_t imm) final
 970 |     {
 971 |         using namespace std::placeholders;
 972 |         this->iterate(std::bind(std::bit_xor<Element_type >(), _1, Element_type(imm)),
 973 |                       vd, vs1);
 974 |     }
 975 | 
 976 |     void vfadd_w(vreg_no vd, vreg_no vs1, vreg_no vs2)final
 977 |     {
 978 |         this->iterate(adapter2<float32_t, float32_t>(std::plus<float32_t>()), vd, vs1, vs2);
 979 |     }
 980 | 
 981 |     void vfadd_d(vreg_no vd, vreg_no vs1, vreg_no vs2)final
 982 |     {
 983 |         this->iterate(adapter2<float64_t, float64_t>(std::plus<float64_t>()), vd, vs1, vs2);
 984 |     }
 985 | 
 986 |     void vfsub_w(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 987 |     {
 988 |         this->iterate(adapter2<float32_t, float32_t>(std::minus<float32_t>()), vd, vs1, vs2);
 989 |     }
 990 | 
 991 |     void vfsub_d(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 992 |     {
 993 |         this->iterate(adapter2<float64_t, float64_t>(std::minus<float64_t>()), vd, vs1, vs2);
 994 |     }
 995 | 
 996 |     void vfmul_w(vreg_no vd, vreg_no vs1, vreg_no vs2) final
 997 |     {
 998 |         this->iterate(adapter2<float32_t, float32_t>(std::multiplies<float32_t>()), vd, vs1, vs2);
 999 |     }
1000 | 
1001 |     void vfmul_d(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1002 |     {
1003 |         this->iterate(adapter2<float64_t, float64_t>(std::multiplies<float64_t>()), vd, vs1, vs2);
1004 |     }
1005 | 
1006 |     void vfdiv_w(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1007 |     {
1008 |         this->iterate(adapter2<float32_t, float32_t>(std::divides<float32_t>()), vd, vs1, vs2);
1009 |     }
1010 | 
1011 |     void vfdiv_d(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1012 |     {
1013 |         this->iterate(adapter2<float64_t, float64_t>(std::divides<float64_t>()), vd, vs1, vs2);
1014 |     }
1015 | 
1016 |     void vfsgnj_w(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1017 |     {
1018 |         this->iterate(adapter2<float32_t, float32_t>(fsgnj<float32_t>), vd, vs1, vs2);
1019 |     }
1020 | 
1021 |     void vfsgnj_d(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1022 |     {
1023 |         this->iterate(adapter2<float64_t, float64_t>(fsgnj<float64_t>), vd, vs1, vs2);
1024 |     }
1025 | 
1026 |     void vfsgnjn_w(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1027 |     {
1028 |         this->iterate(adapter2<float32_t, float32_t>(fsgnjn<float32_t>), vd, vs1, vs2);
1029 |     }
1030 | 
1031 |     void vfsgnjn_d(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1032 |     {
1033 |         this->iterate(adapter2<float64_t, float64_t>(fsgnjn<float64_t>), vd, vs1, vs2);
1034 |     }
1035 | 
1036 |     void vfsgnjx_w(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1037 |     {
1038 |         this->iterate(adapter2<float32_t, float32_t>(fsgnjx<float32_t>), vd, vs1, vs2);
1039 |     }
1040 | 
1041 |     void vfsgnjx_d(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1042 |     {
1043 |         this->iterate(adapter2<float64_t, float64_t>(fsgnjx<float64_t>), vd, vs1, vs2);
1044 |     }
1045 | 
1046 |     void vfmadd_w(vreg_no vd, vreg_no vs1, vreg_no vs2, vreg_no vs3) final
1047 |     {
1048 |         typedef float32_t curr_type;
1049 |         static curr_type(*const fn)(curr_type, curr_type, curr_type) = std::fmaf;
1050 |         this->iterate(adapter3<curr_type, curr_type>(fn), vd, vs1, vs2, vs3);
1051 |     }
1052 | 
1053 |     void vfmadd_d(vreg_no vd, vreg_no vs1, vreg_no vs2, vreg_no vs3) final
1054 |     {
1055 |         typedef float64_t curr_type;
1056 |         static curr_type(*const fn)(curr_type, curr_type, curr_type) = std::fma;
1057 |         this->iterate(adapter3<curr_type, curr_type>(fn), vd, vs1, vs2, vs3);
1058 |     }
1059 | 
1060 |     void vfmsub_w(vreg_no vd, vreg_no vs1, vreg_no vs2, vreg_no vs3) final
1061 |     {
1062 |         typedef float32_t curr_type;
1063 |         static auto const op = [](curr_type const & x, curr_type const & y, curr_type const & z)->curr_type {
1064 |             return std::fmaf(x, y, -z);
1065 |         };
1066 |         this->iterate(adapter3<curr_type, curr_type>(op), vd, vs1, vs2, vs3);
1067 |     }
1068 | 
1069 |     void vfmsub_d(vreg_no vd, vreg_no vs1, vreg_no vs2, vreg_no vs3) final
1070 |     {
1071 |         typedef float64_t curr_type;
1072 |         static auto const op = [](curr_type const & x, curr_type const & y, curr_type const & z)->curr_type {
1073 |             return std::fma(x, y, -z);
1074 |         };
1075 |         this->iterate(adapter3<curr_type, curr_type>(op), vd, vs1, vs2, vs3);
1076 |     }
1077 | 
1078 |     void vfmaddwdn_w(vreg_no vd, vreg_no vs1, vreg_no vs2, vreg_no vs3) final
1079 |     {
1080 |         typedef float32_t small_type;
1081 |         typedef float32_t big_type;
1082 |         static auto const op = [](small_type const & x, small_type const & y, small_type const & z)->big_type {
1083 |             return std::fma(big_type(x), big_type(y), big_type(z));
1084 |         };
1085 |         this->iterate(adapter3<big_type, small_type>(op), vd, vs1, vs2, vs3);
1086 |     }
1087 | 
1088 |     void vfmsubwdn_w(vreg_no vd, vreg_no vs1, vreg_no vs2, vreg_no vs3) final
1089 |     {
1090 |         typedef float32_t small_type;
1091 |         typedef float32_t big_type;
1092 |         static auto const op = [](small_type const & x, small_type const & y, small_type const & z)->big_type {
1093 |             return std::fma(big_type(x), big_type(y), big_type(-z));
1094 |         };
1095 |         this->iterate(adapter3<big_type, small_type>(op), vd, vs1, vs2, vs3);
1096 |     }
1097 | 
1098 |     void vfmin_w(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1099 |     {
1100 |         typedef float32_t curr_type;
1101 |         static curr_type(*const fn)(curr_type, curr_type) = std::fminf;
1102 |         this->iterate(adapter2<curr_type, curr_type>(fn), vd, vs1, vs2);
1103 |     }
1104 | 
1105 |     void vfmin_d(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1106 |     {
1107 |         typedef float64_t curr_type;
1108 |         static curr_type(*const fn)(curr_type, curr_type) = std::fmin;
1109 |         this->iterate(adapter2<curr_type, curr_type>(fn), vd, vs1, vs2);
1110 |     }
1111 | 
1112 |     void vfmax_w(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1113 |     {
1114 |         typedef float32_t curr_type;
1115 |         static curr_type(*const fn)(curr_type, curr_type) = std::fmaxf;
1116 |         this->iterate(adapter2<curr_type, curr_type>(fn), vd, vs1, vs2);
1117 |     }
1118 | 
1119 |     void vfmax_d(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1120 |     {
1121 |         typedef float64_t curr_type;
1122 |         static curr_type(*const fn)(curr_type, curr_type) = std::fmax;
1123 |         this->iterate(adapter2<curr_type, curr_type>(fn), vd, vs1, vs2);
1124 |     }
1125 | 
1126 |     void vfsqrt_w(vreg_no vd, vreg_no vs1) final
1127 |     {
1128 |         typedef float32_t curr_type;
1129 |         static curr_type(*const fn)(curr_type) = std::sqrt;
1130 |         this->iterate(adapter1<curr_type, curr_type>(fn), vd, vs1);
1131 |     }
1132 | 
1133 |     void vfsqrt_d(vreg_no vd, vreg_no vs1) final
1134 |     {
1135 |         typedef float64_t curr_type;
1136 |         static curr_type(*const fn)(curr_type) = std::sqrt;
1137 |         this->iterate(adapter1<curr_type, curr_type>(fn), vd, vs1);
1138 |     }
1139 | 
1140 |     void vfeq_w(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1141 |     {
1142 |         typedef float32_t curr_type;
1143 |         this->iterate(adapter2<bool, curr_type>(std::equal_to<curr_type>()), vd, vs1, vs2);
1144 |     }
1145 | 
1146 |     void vfeq_d(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1147 |     {
1148 |         typedef float64_t curr_type;
1149 |         this->iterate(adapter2<bool, curr_type>(std::equal_to<curr_type>()), vd, vs1, vs2);
1150 |     }
1151 | 
1152 |     void vflt_w(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1153 |     {
1154 |         typedef float32_t curr_type;
1155 |         static bool(*const fn)(curr_type, curr_type) = std::isless;
1156 |         this->iterate(adapter2<bool, curr_type>(fn), vd, vs1, vs2);
1157 |     }
1158 | 
1159 |     void vflt_d(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1160 |     {
1161 |         typedef float64_t curr_type;
1162 |         static bool(*const fn)(curr_type, curr_type) = std::isless;
1163 |         this->iterate(adapter2<bool, curr_type>(fn), vd, vs1, vs2);
1164 |     }
1165 | 
1166 |     void vfle_w(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1167 |     {
1168 |         typedef float32_t curr_type;
1169 |         static bool(*const fn)(curr_type, curr_type) = std::islessequal;
1170 |         this->iterate(adapter2<bool, curr_type>(fn), vd, vs1, vs2);
1171 |     }
1172 | 
1173 |     void vfle_d(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1174 |     {
1175 |         typedef float64_t curr_type;
1176 |         static bool(*const fn)(curr_type, curr_type) = std::islessequal;
1177 |         this->iterate(adapter2<bool, curr_type>(fn), vd, vs1, vs2);
1178 |     }
1179 | 
1180 |     void vaddw(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1181 |     {
1182 |         typedef int32_t curr_type;
1183 |         this->iterate(adapter2<curr_type, curr_type>(std::plus<curr_type>()), vd, vs1, vs2);
1184 |     }
1185 | 
1186 |     void vsubw(vreg_no vd, vreg_no vs1, vreg_no vs2) final
1187 |     {
1188 |         typedef int32_t curr_type;
1189 |         this->iterate(adapter2<curr_type, curr_type>(std::minus<curr_type>()), vd, vs1, vs2);
1190 |     }
1191 | 
1192 |     void vaddwi(vreg_no vd, vreg_no vs1, int16_t imm) final
1193 |     {
1194 |         typedef int32_t curr_type;
1195 |         using namespace std::placeholders;
1196 |         this->iterate(adapter1<curr_type, curr_type>(std::bind(std::plus<curr_type>(), _1, curr_type(imm))), vd, vs1);
1197 |     }
1198 | #endif
1199 | };
1200 | 
1201 | template<typename Element_type>  // Floating-point operation set for one element type.
1202 | class Float_operations_impl
1203 |     : virtual public Float_operations
1204 |     , Float_operations_essentials<Element_type>
1205 | {
1206 | private:
1207 |     virtual operator Float_operations &() final  // exposes this object through the interface type
1208 |     {
1209 |         return *this;
1210 |     }
1211 |     // vfmacc.vf: combines rs1 * vs2[i] + vd[i] through the binary iterate helper.
1212 |     void
1213 |     vfmacc_vf(vreg_no vd, float rs1, vreg_no vs2, vop_type mode = vop_type::thread_all) final
1214 |     {
1215 |         float const scale = rs1;
1216 |         // vd is passed twice: as the destination and as the second (accumulator) source.
1217 |         auto multiply_accumulate = [scale](Element_type const& x, Element_type const& acc)->Element_type { return scale * x + acc; };
1218 |         this->iterate(V_unit::instance(), multiply_accumulate, vd, vs2, vd, mode);
1219 |     }
1220 | };
1221 | 
1222 | Operations_impl<int8_t> op8;    // integer operation set selected by setew for code 0b000
1223 | Operations_impl<int16_t> op16;  // integer operation set selected by setew for code 0b001
1224 | Operations_impl<int32_t> op32;  // integer operation set selected by setew for code 0b010
1225 | Operations_impl<int64_t> op64;  // integer operation set selected by setew for code 0b011
1226 | // Floating-point operation sets; setew installs them only for codes 0b010 and 0b011.
1227 | Float_operations_impl<float32_t> fop32;
1228 | Float_operations_impl<float64_t> fop64;
1229 | // One State holder per thread (thread_local), owned through unique_ptr.
1230 | static thread_local std::unique_ptr<State> p_state;
1231 | 
1232 | static inline constexpr size_t  // Number of right shifts needed to bring _value to zero, added to _offset.
1233 | bits(size_t _value, size_t _offset = 0)
1234 | {
1235 |     return _value != 0 ? bits(_value >> 1, _offset + 1) : _offset;
1236 | }
1237 | }  // namespace
1238 | 
1239 | class State_impl
1240 |     : virtual protected State
1241 | {
1242 |     typedef State_impl This_class;
1243 | protected:
1244 |     State_impl()  // Starts with vstart = 0, vl = 0, multiplier code m1 and v0 as the mask register.
1245 |         : m_vstart(0)
1246 |         , m_vl(0)
1247 |         , m_mul(m1)
1248 |         , m_mask_reg(v0)
1249 |     {  // NOTE(review): m_ill, m_ew and the performer pointers are not set here — confirm they are initialised elsewhere
1250 |     }
1251 | 
1252 | private:
1253 |     size_t setvstart(size_t vstart)final  // Stores the new start index and returns the stored value.
1254 |     {
1255 |         return this->m_vstart = vstart;
1256 |     }
1257 | 
1258 |     size_t setvl(size_t vl)final  // Stores the new vector length as given (no clamping here) and returns it.
1259 |     {
1260 |         return this->m_vl = vl;
1261 |     }
1262 | 
1263 |     void setill(bool ill)final  // Sets the flag reported by is_ill().
1264 |     {
1265 |         this->m_ill = ill;
1266 |     }
1267 | 
1268 |     void setew(size_t ew)final  // Stores the element-width code and selects the matching operation sets.
1269 |     {
1270 |         // Codes 0b000..0b011 pick op8/op16/op32/op64; only the two wider codes
1271 |         // get a floating-point performer, the two narrower ones clear it.
1272 |         if (ew == 0b000) {
1273 |             this->m_op_performer = &op8;
1274 |             this->m_fop_performer = nullptr;
1275 |         } else if (ew == 0b001) {
1276 |             this->m_op_performer = &op16;
1277 |             this->m_fop_performer = nullptr;
1278 |         } else if (ew == 0b010) {
1279 |             this->m_op_performer = &op32;
1280 |             this->m_fop_performer = &fop32;
1281 |         } else if (ew == 0b011) {
1282 |             this->m_op_performer = &op64;
1283 |             this->m_fop_performer = &fop64;
1284 |         }
1285 |         // Any other code leaves both performers exactly as they were.
1286 |         // The raw code is recorded unconditionally; sew() derives the
1287 |         // width in bits from it.
1288 |         this->m_ew = ew;
1289 |     }
1290 | 
1291 |     void setmul(size_t mul)final
1292 |     {
1293 |         this->m_mul = mul;
1294 |     }
1295 | 
1296 |     size_t vstart()const final
1297 |     {
1298 |         return m_vstart;
1299 |     }
1300 | 
1301 |     size_t vl()const final
1302 |     {
1303 |         return m_vl;
1304 |     }
1305 | 
1306 |     size_t sew()const final
1307 |     {
1308 |         return 8 << m_ew;
1309 |     }
1310 | 
1311 |     size_t lmul()const final
1312 |     {
1313 |         return 1 << m_mul;
1314 |     }
1315 | 
1316 |     bool is_ill()const final
1317 |     {
1318 |         return m_ill;
1319 |     }
1320 | 
1321 |     size_t vlmax()const final
1322 |     {
1323 |         return m_mul * implementation::V_unit::VLEN / sew();
1324 |     }
1325 | 
1326 |     inline bool is_enabled(size_t i)const
1327 |     {
1328 |         return 0 != this->mask_bit(i);
1329 |     }
1330 | 
1331 |     bool
1332 |     is_valid_reg(vreg_no _reg)const final
1333 |     {
1334 |         return (_reg % lmul()) == 0;
1335 |     }
1336 | 
1337 |     char *
1338 |     elt_ptr(vreg_no _reg, size_t _ind) final
1339 |     {
1340 |         if (!is_valid_reg(_reg)) {
1341 |             throw Register_out_of_config_range(_reg);
1342 |         }
1343 |         size_t skip_rows = _reg;
1344 | 
1345 |         size_t elements_in_stripe = V_unit::SLEN / sew();
1346 |         size_t elements_in_group = elements_in_stripe * lmul();
1347 | 
1348 |         size_t num_of_group = _ind / elements_in_group;
1349 | 
1350 |         size_t row = (_ind % elements_in_group) / elements_in_stripe;
1351 |         size_t col = _ind % elements_in_stripe;
1352 | 
1353 |         size_t bits = (skip_rows + row) * V_unit::VLEN + (num_of_group * elements_in_stripe + col) * sew();
1354 |         return &m_register_file[bits / 8];
1355 |     }
1356 | 
1357 |     char const *
1358 |     elt_ptr(vreg_no _reg, size_t _ind)const final
1359 |     {
1360 |         return const_cast<This_class*>(this)->elt_ptr(_reg, _ind);
1361 |     }
1362 | 
1363 |     void
1364 |     set_mask(vreg_no _reg, size_t _ind, bool value) final
1365 |     {
1366 |         size_t mlen = sew() / lmul();
1367 |         size_t byte_num = _reg * V_unit::VLEN / 8 + (_ind * mlen) / 8;
1368 |         char *byte_ptr = &m_register_file[byte_num];
1369 | 
1370 |         // zeroing mlen bits
1371 |         size_t bits_left = mlen;
1372 |         char *ptr = byte_ptr;
1373 |         while (bits_left >= 8) {
1374 |             *ptr = 0;
1375 |             bits_left -= 8;
1376 |             ++ptr;
1377 |         }
1378 |         if (bits_left) {
1379 |             *ptr = *ptr & ~((1 << bits_left) - 1);
1380 |         }
1381 | 
1382 |         // set LSB to the value
1383 |         *byte_ptr = *byte_ptr & ~(1u) | !!value;
1384 |     }
1385 | 
1386 |     bool
1387 |     get_mask(vreg_no _reg, size_t _ind)const final
1388 |     {
1389 |         size_t mlen = sew() / lmul();
1390 |         size_t byte_num = _reg * V_unit::VLEN / 8 + (_ind * mlen) / 8;
1391 |         char const *byte_ptr = &m_register_file[byte_num];
1392 | 
1393 |         return 0 != (*byte_ptr & (1 << (mlen % 8)));
1394 |     }
1395 | 
1396 |     bool
1397 |     mask_bit(size_t i)const final
1398 |     {
1399 |         return get_mask(v0, i);
1400 |     }
1401 | 
1402 |     Operations&
1403 |     get_op_performer()const final
1404 |     {
1405 |         return *m_op_performer;
1406 |     }
1407 | 
1408 |     Float_operations&
1409 |     get_fop_performer()const final
1410 |     {
1411 |         return *m_fop_performer;
1412 |     }
1413 | 
1414 | private:
1415 |     std::array<char, V_unit::VLEN / 8 * V_unit::NREGS> m_register_file;
1416 | 
1417 |     Operations* m_op_performer;
1418 |     Float_operations* m_fop_performer;
1419 | 
1420 |     size_t m_vstart;
1421 |     size_t m_vl;
1422 |     bool m_ill;
1423 |     size_t m_ew;
1424 |     size_t m_mul;
1425 |     vreg_no m_mask_reg;
1426 | };
1427 | 
1428 | namespace {
1429 | class V_unit_impl  // Instantiable leaf of State_impl; objects are created only through init().
1430 |     : private State_impl
1431 | {
1432 |     // Private constructor; State_impl's default constructor runs implicitly.
1433 |     V_unit_impl()
1434 |     {}
1435 | 
1436 | public:
1437 |     static void init()  // Replaces the calling thread's p_state with a freshly constructed unit.
1438 |     {
1439 |         V_unit_impl *fresh = new V_unit_impl();
1440 |         p_state.reset(fresh);
1441 |     }
1442 | };
1443 | 
1444 | }  // namespace
1445 | 
1446 | V_unit&  // Returns the calling thread's vector unit; p_state must already hold an object (vsetvl() creates it on first call).
1447 | V_unit::instance()
1448 | {
1449 |     return *p_state.get();
1450 | }
1451 | 
1452 | }  // namespace implementation
1453 | 
1454 | size_t  // Applies _vtype to this thread's vector state and returns the vl granted for the requested _avl.
1455 | vsetvl(size_t _avl, size_t _vtype)
1456 | {
1457 |     using implementation::p_state;
1458 |     // Create this thread's state on first use.
1459 |     if (!p_state) {
1460 |         implementation::V_unit_impl::init();
1461 |     }
1462 |     // The ill flag is the top bit of an xreg_type-wide value: shift by the width in bits (not bytes) and mask bitwise.
1463 |     bool const ill = 0 != ((_vtype >> (sizeof(xreg_type) * 8 - 1)) & 0b1);
1464 |     size_t mul = 0;
1465 |     size_t ew = 0;
1466 |     size_t avl = 0;
1467 |     if (!ill) {
1468 |         mul = _vtype & 0b11;         // multiplier encoding: bits 1:0
1469 |         ew = (_vtype >> 2) & 0b111;  // element-width encoding: bits 4:2
1470 |         size_t lmul = 1 << mul;
1471 |         size_t sew = 8 << ew;
1472 | 
1473 |         if (_avl > 0) {
1474 |             size_t vlmax = lmul * implementation::V_unit::VLEN / sew;
1475 |             // vl is _avl up to vlmax, vlmax from 2 * vlmax upwards, and ceil(_avl / 2) in between.
1476 |             if (_avl <= vlmax) {
1477 |                 avl = _avl;
1478 |             } else if (_avl >= 2 * vlmax) {
1479 |                 avl = vlmax;
1480 |             } else {
1481 |                 avl = (_avl + 1) / 2;
1482 |             }
1483 |         }
1484 |     }
1485 |     // An illegal _vtype leaves mul, ew and avl at zero, so the state is reset and the returned vl is 0.
1486 |     p_state->setill(ill);
1487 |     p_state->setmul(mul);
1488 |     p_state->setew(ew);
1489 |     p_state->setvstart(0);
1490 |     return p_state->setvl(avl);
1491 | }
1492 | 
1493 | size_t  // Immediate form: converts _vtypei to size_t and forwards to vsetvl().
1494 | vsetvli(size_t _avl, int16_t _vtypei)
1495 | {
1496 |     return vsetvl(_avl, static_cast<size_t>(_vtypei));
1497 | }
1498 | 
1499 | #if 0
1500 | // Disabled: the mask is always register 0, so this mask-register setter is not compiled.
1501 | void
1502 | vsetmask(vreg_no _size)
1503 | {
1504 |     using implementation::p_state;
1505 | 
1506 |     p_state->set_mask_reg(_size);
1507 | }
1508 | #endif
1509 | 
1510 | }  // namespace spec_0_7
1511 | }  // namespace v
1512 | }  // namespace riscv
1513 | 


--------------------------------------------------------------------------------