├── .github └── workflows │ ├── bench-config.h │ ├── rv32-config.mk │ ├── rv32-run.sh │ ├── rv64-config-hosted.mk │ ├── rv64-config.mk │ ├── rv64-run.sh │ └── validate-bench.yml ├── .gitmodules ├── LICENSE ├── README.md ├── bench ├── LUT4.S ├── LUT4.c ├── LUT6.S ├── LUT6.c ├── Makefile ├── ascii_to_utf16.S ├── ascii_to_utf16.c ├── ascii_to_utf32.S ├── ascii_to_utf32.c ├── base64_encode.S ├── base64_encode.c ├── bench.h ├── byteswap.S ├── byteswap.c ├── chacha20.S ├── chacha20.c ├── config.h ├── hist.S ├── hist.c ├── mandelbrot.S ├── mandelbrot.c ├── memcpy.S ├── memcpy.c ├── memset.S ├── memset.c ├── mergelines.S ├── mergelines.c ├── poly1305.S ├── poly1305.c ├── strlen.S ├── strlen.c ├── template.S ├── utf8_count.S └── utf8_count.c ├── config.mk ├── instructions ├── rvv │ ├── Makefile │ ├── config.h │ ├── gen.S │ └── main.c ├── scalar │ ├── Makefile │ ├── config.h │ ├── main.S │ └── main.c └── xtheadvector │ ├── Makefile │ ├── config.h │ ├── main.S │ └── main.c ├── nolibc.h ├── run.sh ├── single ├── Makefile └── veclibm.c ├── thirdparty ├── rvv-chacha-poly │ ├── CONTRIBUTING.md │ ├── LICENSE │ ├── README.md │ ├── boring.c │ ├── boring.h │ ├── main.c │ ├── test.sh │ ├── vchacha.s │ └── vpoly.s └── rvv-rollback.S └── vector-utf ├── 16to8_gather.c ├── 8toN_gather.c ├── Makefile ├── bench.c ├── rvv-0.7.1 ├── 8to16.S └── 8to32.S ├── scalar.h ├── simdutf.cpp └── tests ├── 16to8.c ├── 8to16.c ├── 8to32.c ├── Makefile └── common.h /.github/workflows/bench-config.h: -------------------------------------------------------------------------------- 1 | #define MAX_MEM (4096*8) 2 | #define NEXT(c) (c + c/3 + 3) 3 | #define VALIDATE 1 4 | #define MIN_REPEATS 2 5 | #define MAX_REPEATS 2 6 | 7 | #define STOP_CYCLES (1024*1024*500) 8 | #define SCALE_mandelbrot(N) ((N)/10) 9 | #define SCALE_mergelines(N) ((N)/10) 10 | #define mandelbrot_ITER 100 11 | -------------------------------------------------------------------------------- /.github/workflows/rv32-config.mk: -------------------------------------------------------------------------------- 1 | WARN=-Wall -Wextra -Wno-unused-function -Wno-unused-parameter 2 | CC=clang-17 3 | CFLAGS=--target=riscv32 -march=rv32gc_zve32f_zfh_zba_zbb_zbs -O3 ${WARN} -nostdlib -fno-builtin -nodefaultlibs -ffreestanding 4 | -------------------------------------------------------------------------------- /.github/workflows/rv32-run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # zfh=true,zvfh=true disabled for now 4 | qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ && \ 5 | qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=256,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ && \ 6 | qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=512,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ && \ 7 | qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=1024,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ &&\ 8 | qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=off,rvv_ma_all_1s=off $@ && \ 9 | qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=1024,rvv_ta_all_1s=off,rvv_ma_all_1s=off $@ 10 | 11 | -------------------------------------------------------------------------------- /.github/workflows/rv64-config-hosted.mk: -------------------------------------------------------------------------------- 1 | WARN=-Wall -Wextra -Wno-unused-function -Wno-unused-parameter 2 | CC=riscv64-linux-gnu-gcc 3 | CFLAGS=-march=rv64gcv_zfh_zba_zbb_zbs -O3 
${WARN} 4 | -------------------------------------------------------------------------------- /.github/workflows/rv64-config.mk: -------------------------------------------------------------------------------- 1 | WARN=-Wall -Wextra -Wno-unused-function -Wno-unused-parameter 2 | CC=clang-17 3 | CFLAGS=--target=riscv64 -march=rv64gcv_zfh_zba_zbb_zbs -O3 ${WARN} -nostdlib -fno-builtin -nodefaultlibs -ffreestanding 4 | -------------------------------------------------------------------------------- /.github/workflows/rv64-run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # zfh=true,zvfh=true disabled for now 4 | qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ && \ 5 | qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=256,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ && \ 6 | qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=512,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ && \ 7 | qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=1024,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ && \ 8 | qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=off,rvv_ma_all_1s=off $@ && \ 9 | qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=1024,rvv_ta_all_1s=off,rvv_ma_all_1s=off $@ 10 | -------------------------------------------------------------------------------- /.github/workflows/validate-bench.yml: -------------------------------------------------------------------------------- 1 | name: Validate bench 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | Tests: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Install packages 15 | run: | 16 | git submodule update --init --recursive 17 | sudo apt-get update -y 18 | sudo apt-get install -y make qemu-user-static clang-17 lld-17 gcc-riscv64-linux-gnu 19 | sed 's/zfh_zvfh//g' -i ./config.mk 20 | - name: Validate RV64 21 | run: | 22 | cp .github/workflows/rv64-config.mk ./config.mk 23 | cp .github/workflows/rv64-run.sh ./run.sh 24 | cp .github/workflows/bench-config.h ./bench/config.h 25 | make -C bench run -j$(nproc) 26 | make -C bench clean 27 | - name: Validate RV32 28 | run: | 29 | cp .github/workflows/rv32-config.mk ./config.mk 30 | cp .github/workflows/rv32-run.sh ./run.sh 31 | cp .github/workflows/bench-config.h ./bench/config.h 32 | make -C bench run -j$(nproc) 33 | make -C bench clean 34 | - name: Build freestanding 64-bit 35 | run: | 36 | cp .github/workflows/rv64-config.mk ./config.mk 37 | make -C bench -j$(nproc) 38 | make -C bench clean 39 | make -C instructions/rvv 40 | make -C instructions/rvv clean 41 | make -C instructions/scalar 42 | make -C instructions/scalar clean 43 | - name: Build freestanding 32-bit 44 | run: | 45 | cp .github/workflows/rv32-config.mk ./config.mk 46 | make -C bench -j$(nproc) 47 | make -C bench clean 48 | make -C instructions/rvv 49 | make -C instructions/rvv clean 50 | make -C instructions/scalar 51 | make -C instructions/scalar clean 52 | - name: Build hosted 64-bit 53 | run: | 54 | cp .github/workflows/rv64-config-hosted.mk ./config.mk 55 | make -C bench -j$(nproc) 56 | make -C bench clean 57 | make -C instructions/rvv 58 | make -C instructions/rvv clean 59 | make -C instructions/scalar 60 | make -C instructions/scalar clean 61 | sed '/CFLAGS/s/$/ -DUSE_PERF_EVENT/' -i ./config.mk 62 | make -C bench -j$(nproc) 63 | make -C bench clean 64 | make -C instructions/rvv 65 | make -C instructions/rvv clean 66 | 
make -C instructions/scalar 67 | make -C instructions/scalar clean 68 | sed 's/-DUSE_PERF_EVENT/-DUSE_PERF_EVENT_SLOW/' -i ./config.mk 69 | make -C bench -j$(nproc) 70 | make -C bench clean 71 | make -C instructions/rvv 72 | make -C instructions/rvv clean 73 | make -C instructions/scalar 74 | make -C instructions/scalar clean 75 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "thirdparty/veclibm"] 2 | path = thirdparty/veclibm 3 | url = https://github.com/rivosinc/veclibm 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Olaf Bernstein 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RISC-V Vector benchmark 2 | 3 | A collection of RISC-V Vector (RVV) benchmarks to help developers write portably performant RVV code. 4 | 5 | Benchmark results can be found at: https://camel-cdr.github.io/rvv-bench-results 6 | 7 | ## Benchmarks ([./bench/](./bench/)) 8 | 9 | Contains a set of benchmarks comparing different implementations of certain algorithms. 10 | 11 | ## Instruction cycle count ([./instructions/](./instructions/)) 12 | 13 | Measures the cycle count of RVV instructions by unrolling and looping over the given instruction repeatedly. 14 | 15 | ## Getting started 16 | 17 | Start by configuring [./config.mk](./config.mk), such that `make` works, and optionally [./run.sh](./run.sh), which allows you to compile and run using `make run`. 18 | 19 | The default configuration should work with all recent clang builds and doesn't require a full cross-compilation toolchain, because it builds in freestanding mode. 20 | This means it will only work on Linux, or a Linux-syscall-compatible OS. 21 | 22 | On recent Linux versions, the performance counters aren't exposed to user space by default; you may have to execute `echo 2 >/proc/sys/kernel/perf_user_access` and append `-DUSE_PERF_EVENT` to the `CFLAGS=...` line in [./config.mk](./config.mk) (if that doesn't work, try `-DUSE_PERF_EVENT_SLOW` instead). 
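For example, on a typical Linux setup the two steps might look like this (a sketch: it assumes root access via `sudo`, and reuses the `sed` invocation from the CI workflow in [.github/workflows/validate-bench.yml](.github/workflows/validate-bench.yml)):

```sh
# expose the performance counters to user space (needs root, resets on reboot)
echo 2 | sudo tee /proc/sys/kernel/perf_user_access

# append the define to the CFLAGS=... line in ./config.mk
sed '/CFLAGS/s/$/ -DUSE_PERF_EVENT/' -i ./config.mk   # or -DUSE_PERF_EVENT_SLOW
```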
23 | 24 | You can configure [./config.mk](./config.mk) to produce a hosted build, or configure it for your own toolchain: add the `-DCUSTOM_HOST` define and implement the unimplemented functions under `#ifdef CUSTOM_HOST` in [./nolibc.h](./nolibc.h). 25 | 26 | XTheadVector isn't supported anymore. 27 | 28 | ### Running benchmarks ([./bench/](./bench/)) 29 | 30 | To run the benchmarks, first look through ([./bench/config.h](./bench/config.h)) and adjust it to your processor (e.g. set `HAS_E64`). If it takes too long to execute, try lowering `MAX_MEM`, which is used to scale the benchmark, and play around with the other constants until it executes in a reasonable amount of time and gives a relatively smooth graph. 31 | 32 | Now you can run the benchmarks using `make run` in the ([./bench/](./bench/)) directory, or `make` to just build the executables. 33 | 34 | ### Measuring cycle count ([./instructions/](./instructions/)) 35 | 36 | To run the cycle count measurement, first configure [instructions/rvv/config.h](instructions/rvv/config.h) for your processor. 37 | 38 | Now you can run the measurement using `make run` in the ([./instructions/rvv/](./instructions/rvv/)) directory, or `make` to just build the executables. 39 | 40 | For XTheadVector, use the ([./instructions/xtheadvector/](./instructions/xtheadvector/)) directory instead. (This isn't maintained anymore.) 41 | 42 | ## Contributing 43 | 44 | Here are some suggestions for things that still need to be done. 45 | 46 | * contribute a measurement of a new CPU to: https://github.com/camel-cdr/rvv-bench-results \ 47 | You can just create an issue with a single JSON file that contains all concatenated [./bench/](./bench/) results (after proper setup, `make run > out.json` should do the trick). \ 48 | * implement non-memory-bound benchmarks 49 | * implement more benchmarks 50 | * better cycle count measurements: throughput vs latency (also: can we figure out the execution port configuration?) 51 | * cycle count for load/stores 52 | * cycle count for vsetvl 53 | 54 | ## License 55 | 56 | This repository is licensed under the MIT [LICENSE](LICENSE). 
57 | 58 | -------------------------------------------------------------------------------- /bench/LUT4.S: -------------------------------------------------------------------------------- 1 | #ifdef MX 2 | 3 | .global MX(LUT4_rvv_vloxei8_) 4 | MX(LUT4_rvv_vloxei8_): 5 | 1: 6 | vsetvli a3, a2, e8, MX(), ta, ma 7 | vle8.v v8, (a1) 8 | vand.vi v8, v8, 15 9 | vloxei8.v v8, (a0), v8 10 | vse8.v v8, (a1) 11 | sub a2, a2, a3 12 | add a1, a1, a3 13 | bnez a2, 1b 14 | ret 15 | 16 | .global MX(LUT4_rvv_vluxei8_) 17 | MX(LUT4_rvv_vluxei8_): 18 | 1: 19 | vsetvli a3, a2, e8, MX(), ta, ma 20 | vle8.v v8, (a1) 21 | vand.vi v8, v8, 15 22 | vluxei8.v v8, (a0), v8 23 | vse8.v v8, (a1) 24 | sub a2, a2, a3 25 | add a1, a1, a3 26 | bnez a2, 1b 27 | ret 28 | 29 | # a0 = lut, a1 = ptr, a2 = len 30 | .global MX(LUT4_rvv_gather_) 31 | MX(LUT4_rvv_gather_): 32 | li t0, 16 33 | vsetvli zero, t0, e8, m1, ta, ma 34 | vle8.v v0, (a0) 35 | 1: 36 | vsetvli a0, a2, e8, MX(), ta, ma 37 | vle8.v v8, (a1) 38 | vand.vi v8, v8, 15 39 | vrgather.vv v16, v0, v8 40 | vse8.v v16, (a1) 41 | sub a2, a2, a0 42 | add a1, a1, a0 43 | bnez a2, 1b 44 | ret 45 | #endif 46 | 47 | #if MX_N == 2 48 | 49 | .macro LUT4_rvv_m1_gathers n 50 | .global LUT4_rvv_m1_gathers_m\n 51 | LUT4_rvv_m1_gathers_m\n: 52 | li t0, 16 53 | vsetvli zero, t0, e8, m1, ta, ma 54 | vle8.v v0, (a0) 55 | 1: 56 | vsetvli a0, a2, e8, m\n, ta, ma 57 | vle8.v v8, (a1) 58 | vand.vi v8, v8, 15 59 | vsetvli t1, x0, e8, m1, ta, ma 60 | vrgather.vv v16, v0, v8 61 | .ifge \n-2 62 | vrgather.vv v17, v0, v9 63 | .ifge \n-4 64 | vrgather.vv v18, v0, v10 65 | vrgather.vv v19, v0, v11 66 | .ifge \n-8 67 | vrgather.vv v20, v0, v12 68 | vrgather.vv v21, v0, v13 69 | vrgather.vv v22, v0, v14 70 | vrgather.vv v23, v0, v15 71 | .endif 72 | .endif 73 | .endif 74 | vsetvli x0, a0, e8, m\n, ta, ma 75 | vse8.v v16, (a1) 76 | sub a2, a2, a0 77 | add a1, a1, a0 78 | bnez a2, 1b 79 | ret 80 | .endm 81 | 82 | LUT4_rvv_m1_gathers 2 83 | #endif 84 | #if MX_N == 4 85 | LUT4_rvv_m1_gathers 4 86 | #endif 87 | #if MX_N == 8 88 | LUT4_rvv_m1_gathers 8 89 | #endif 90 | 91 | 92 | -------------------------------------------------------------------------------- /bench/LUT4.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void 4 | LUT4_scalar(uint8_t lut[16], uint8_t *ptr, size_t n) 5 | { 6 | for (; n--; ++ptr) 7 | *ptr = lut[*ptr & 0xF], BENCH_CLOBBER(); 8 | } 9 | 10 | void 11 | LUT4_scalar_autovec(uint8_t lut[restrict 16], uint8_t *restrict ptr, size_t n) 12 | { 13 | for (; n--; ++ptr) 14 | *ptr = lut[*ptr & 0xF]; 15 | } 16 | 17 | 18 | #define IMPLS(f) \ 19 | f(scalar) \ 20 | f(scalar_autovec) \ 21 | MX(f, rvv_gather) \ 22 | f(rvv_m1_gathers_m2) \ 23 | f(rvv_m1_gathers_m4) \ 24 | f(rvv_m1_gathers_m8) \ 25 | MX(f, rvv_vluxei8) \ 26 | MX(f, rvv_vloxei8) \ 27 | 28 | typedef void Func(uint8_t lut[16], uint8_t *ptr, size_t n); 29 | 30 | #define DECLARE(f) extern Func LUT4_##f; 31 | IMPLS(DECLARE) 32 | 33 | #define EXTRACT(f) { #f, &LUT4_##f }, 34 | Impl impls[] = { IMPLS(EXTRACT) }; 35 | 36 | uint8_t *ptr; 37 | 38 | void init(void) { ptr = (uint8_t*)mem; } 39 | 40 | ux checksum(size_t n) { 41 | ux sum = 0; 42 | for (size_t i = 0; i < n; ++i) 43 | sum = uhash(sum) + ptr[i]; 44 | return sum; 45 | } 46 | 47 | BENCH_BEG(base) { 48 | static uint32_t lut[4] = { 0x4564907f, 0xb8ce2de0, 0xc0f7adf8, 0xa048aa9f }; 49 | bench_memrand(ptr, n * sizeof *ptr); 50 | TIME f((uint8_t*)lut, ptr, n); 51 | } BENCH_END 52 | 53 | Bench benches[] = { 54 | BENCH( impls, 
MAX_MEM, "LUT4", bench_base ) 55 | }; BENCH_MAIN(benches) 56 | 57 | -------------------------------------------------------------------------------- /bench/LUT6.S: -------------------------------------------------------------------------------- 1 | #if MX_N == 4 2 | 3 | .global LUT6_rvv_vloxei8_m4 4 | LUT6_rvv_vloxei8_m4: 5 | vsetvli t0, x0, e8, m4, ta, ma 6 | li t0, 63 7 | vmv.v.x v24, t0 8 | 1: 9 | vsetvli a3, a2, e8, m4, ta, ma 10 | vle8.v v8, (a1) 11 | vand.vv v8, v8, v24 12 | vloxei8.v v8, (a0), v8 13 | vse8.v v8, (a1) 14 | sub a2, a2, a3 15 | add a1, a1, a3 16 | bnez a2, 1b 17 | ret 18 | 19 | .global LUT6_rvv_vluxei8_m4 20 | LUT6_rvv_vluxei8_m4: 21 | vsetvli t0, x0, e8, m4, ta, ma 22 | li t0, 63 23 | vmv.v.x v24, t0 24 | 1: 25 | vsetvli a3, a2, e8, m4, ta, ma 26 | vle8.v v8, (a1) 27 | vand.vv v8, v8, v24 28 | vluxei8.v v8, (a0), v8 29 | vse8.v v8, (a1) 30 | sub a2, a2, a3 31 | add a1, a1, a3 32 | bnez a2, 1b 33 | ret 34 | 35 | # a0 = lut, a1 = ptr, a2 = len 36 | .global LUT6_rvv_gather_m4 37 | LUT6_rvv_gather_m4: 38 | li t0, 64 39 | vsetvli zero, t0, e8, m4, ta, ma 40 | vle8.v v0, (a0) 41 | 42 | vsetvli t0, x0, e8, m4, ta, ma 43 | li t0, 63 44 | vmv.v.x v24, t0 45 | 1: 46 | vsetvli a0, a2, e8, m4, ta, ma 47 | vle8.v v8, (a1) 48 | vand.vv v8, v8, v24 49 | vrgather.vv v16, v0, v8 50 | vse8.v v16, (a1) 51 | sub a2, a2, a0 52 | add a1, a1, a0 53 | bnez a2, 1b 54 | ret 55 | 56 | .global LUT6_rvv_m1m2m4_gathers_m4 57 | LUT6_rvv_m1m2m4_gathers_m4: 58 | li t0, 64 59 | vsetvli zero, t0, e8, m4, ta, ma 60 | vle8.v v0, (a0) 61 | 62 | vsetvli t0, x0, e8, m4, ta, ma 63 | li t0, 63 64 | vmv.v.x v24, t0 65 | 66 | csrr t0, vlenb 67 | srl t0, t0, 4 68 | sltiu t1, t0, 4 69 | sltiu t0, t0, 2 70 | j 0f 71 | 1: 72 | vsetvli t1, x0, e8, m1, ta, ma 73 | vrgather.vv v16, v0, v8 74 | vrgather.vv v17, v0, v9 75 | vrgather.vv v18, v0, v10 76 | vrgather.vv v19, v0, v11 77 | 8: 78 | vsetvli x0, a0, e8, m4, ta, ma 79 | vse8.v v16, (a1) 80 | sub a2, a2, a0 81 | add a1, a1, a0 82 | beqz a2, 9f 83 | 0: 84 | vsetvli a0, a2, e8, m4, ta, ma 85 | vle8.v v8, (a1) 86 | vand.vv v8, v8, v24 87 | beqz t1, 1b 88 | beqz t0, 2f 89 | vrgather.vv v16, v0, v8 90 | j 8b 91 | 2: 92 | vsetvli t1, x0, e8, m2, ta, ma 93 | vrgather.vv v16, v0, v8 94 | vrgather.vv v18, v0, v10 95 | j 8b 96 | 9: 97 | ret 98 | 99 | #endif 100 | 101 | 102 | -------------------------------------------------------------------------------- /bench/LUT6.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void 4 | LUT6_scalar(uint8_t lut[64], uint8_t *ptr, size_t n) 5 | { 6 | for (; n--; ++ptr) 7 | *ptr = lut[*ptr & 63], BENCH_CLOBBER(); 8 | } 9 | 10 | void 11 | LUT6_scalar_autovec(uint8_t lut[restrict 64], uint8_t *restrict ptr, size_t n) 12 | { 13 | for (; n--; ++ptr) 14 | *ptr = lut[*ptr & 63]; 15 | } 16 | 17 | 18 | #define IMPLS(f) \ 19 | f(scalar) \ 20 | f(scalar_autovec) \ 21 | f(rvv_gather_m4) \ 22 | f(rvv_m1m2m4_gathers_m4) \ 23 | f(rvv_vluxei8_m4) \ 24 | f(rvv_vloxei8_m4) \ 25 | 26 | typedef void Func(uint8_t lut[64], uint8_t *ptr, size_t n); 27 | 28 | #define DECLARE(f) extern Func LUT6_##f; 29 | IMPLS(DECLARE) 30 | 31 | #define EXTRACT(f) { #f, &LUT6_##f }, 32 | Impl impls[] = { IMPLS(EXTRACT) }; 33 | 34 | uint8_t *ptr; 35 | 36 | void init(void) { ptr = (uint8_t*)mem; } 37 | 38 | ux checksum(size_t n) { 39 | ux sum = 0; 40 | for (size_t i = 0; i < n; ++i) 41 | sum = uhash(sum) + ptr[i]; 42 | return sum; 43 | } 44 | 45 | BENCH_BEG(base) { 46 | static uint8_t lut[] = 47 | 
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" 48 | "abcdefghijklmnopqrstuvwxyz" 49 | "0123456789" 50 | "+/"; 51 | bench_memrand(ptr, n * sizeof *ptr); 52 | TIME f(lut, ptr, n); 53 | } BENCH_END 54 | 55 | Bench benches[] = { 56 | BENCH( impls, MAX_MEM, "LUT6", bench_base ) 57 | }; BENCH_MAIN(benches) 58 | 59 | -------------------------------------------------------------------------------- /bench/Makefile: -------------------------------------------------------------------------------- 1 | .POSIX: 2 | 3 | include ../config.mk 4 | 5 | EXECS=memcpy memset utf8_count strlen mergelines mandelbrot chacha20 poly1305 ascii_to_utf16 ascii_to_utf32 byteswap LUT4 LUT6 hist base64_encode 6 | 7 | all: ${EXECS} 8 | 9 | .c: $@.S template.S config.h bench.h 10 | ${CC} ${CFLAGS} -o $@ $< -DINC=$@.S template.S 11 | 12 | clean: 13 | rm -f ${EXECS} 14 | 15 | run: all 16 | for i in ${EXECS}; do ../run.sh ./$$i || { printf "\n\n\033[0;31mFAILED\033[0m\n\n"; exit 1; } ; done 17 | 18 | -------------------------------------------------------------------------------- /bench/ascii_to_utf16.S: -------------------------------------------------------------------------------- 1 | #ifdef MX 2 | 3 | #if MX_N == 4 || MX_N == 2 || MX_N == 1 4 | 5 | .global MX(ascii_to_utf16_rvv_vsseg_) 6 | MX(ascii_to_utf16_rvv_vsseg_): 7 | vsetvli t0, x0, e8, MX2(), ta, ma 8 | vmv.v.i v0, 0 9 | 1: 10 | vsetvli t0, a2, e8, MX(), ta, ma 11 | vle8.v v0, (a1) 12 | vsseg2e8.v v0, (a0) 13 | add a1, a1, t0 14 | sub a2, a2, t0 15 | slli t0, t0, 1 16 | add a0, a0, t0 17 | bnez a2, 1b 18 | ret 19 | 20 | 21 | 22 | .global MX(ascii_to_utf16_rvv_ext_) 23 | MX(ascii_to_utf16_rvv_ext_): 24 | 1: 25 | vsetvli t0, a2, e8, MX(), ta, ma 26 | vle8.v v0, (a1) 27 | vsetvli x0, x0, e16, MX2(), ta, ma 28 | vzext.vf2 v8, v0 29 | vse16.v v8, (a0) 30 | add a1, a1, t0 31 | sub a2, a2, t0 32 | slli t0, t0, 1 33 | add a0, a0, t0 34 | bnez a2, 1b 35 | ret 36 | 37 | 38 | .global MX(ascii_to_utf16_rvv_vss_) 39 | MX(ascii_to_utf16_rvv_vss_): 40 | vsetvli t0, x0, e8, MX2(), ta, ma 41 | vmv.v.i v0, 0 42 | li a3, 2 43 | 1: 44 | vsetvli t0, a2, e16, MX2(), ta, ma 45 | vse16.v v0, (a0) 46 | 47 | vsetvli t0, a2, e8, MX(), ta, ma 48 | vle8.v v8, (a1) 49 | vsse8.v v8, (a0), a3 50 | 51 | add a1, a1, t0 52 | sub a2, a2, t0 53 | slli t0, t0, 1 54 | add a0, a0, t0 55 | bnez a2, 1b 56 | ret 57 | 58 | #endif 59 | #endif 60 | 61 | -------------------------------------------------------------------------------- /bench/ascii_to_utf16.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void 4 | ascii_to_utf16_scalar(uint16_t *restrict dest, uint8_t const *restrict src, size_t len) 5 | { 6 | while (len--) BENCH_CLOBBER(), *dest++ = *src++; 7 | } 8 | 9 | void 10 | ascii_to_utf16_scalar_autovec(uint16_t *restrict dest, uint8_t const *restrict src, size_t len) 11 | { 12 | while (len--) *dest++ = *src++; 13 | } 14 | 15 | #define IMPLS(f) \ 16 | f(scalar) f(scalar_autovec) \ 17 | f(rvv_ext_m1) f(rvv_ext_m2) f(rvv_ext_m4) \ 18 | f(rvv_vsseg_m1) f(rvv_vsseg_m2) f(rvv_vsseg_m4) \ 19 | f(rvv_vss_m1) f(rvv_vss_m2) f(rvv_vss_m4) \ 20 | 21 | typedef void Func(uint16_t *restrict dest, uint8_t const *restrict src, size_t len); 22 | 23 | #define DECLARE(f) extern Func ascii_to_utf16_##f; 24 | IMPLS(DECLARE) 25 | 26 | #define EXTRACT(f) { #f, &ascii_to_utf16_##f }, 27 | Impl impls[] = { IMPLS(EXTRACT) }; 28 | 29 | uint16_t *dest; 30 | uint8_t *src; 31 | 32 | void init(void) { } 33 | 34 | ux checksum(size_t n) { 35 | ux sum = 0; 36 | for (size_t i = 0; i < n+9; 
++i) 37 | sum = uhash(sum) + dest[i]; 38 | return sum; 39 | } 40 | 41 | void common(size_t n, size_t dOff, size_t sOff) { 42 | dest = (uint16_t*)mem + dOff/2; 43 | src = (uint8_t*)(dest + 9 + MAX_MEM/3) + sOff; 44 | bench_memrand(src, n+9); 45 | for (size_t i = 0; i < n+9; ++i) src[i] |= 0x7F; 46 | memset(dest, 1, (n+9)*2); 47 | } 48 | 49 | BENCH_BEG(base) { 50 | common(n, bench_urand() & 255, bench_urand() & 255); 51 | TIME f(dest, src, n); 52 | } BENCH_END 53 | 54 | BENCH_BEG(aligned) { 55 | common(n, 0, 0); 56 | TIME f(dest, src, n); 57 | } BENCH_END 58 | 59 | Bench benches[] = { 60 | BENCH( impls, MAX_MEM/3 - 512-9*2, "ascii to utf16", bench_base ), 61 | BENCH( impls, MAX_MEM/3 - 512-9*2, "ascii to utf16 aligned", bench_aligned ), 62 | }; BENCH_MAIN(benches) 63 | 64 | -------------------------------------------------------------------------------- /bench/ascii_to_utf32.S: -------------------------------------------------------------------------------- 1 | #ifdef MX 2 | 3 | #if MX_N == 2 || MX_N == 1 4 | 5 | .global MX(ascii_to_utf32_rvv_vsseg_) 6 | MX(ascii_to_utf32_rvv_vsseg_): 7 | vsetvli t0, x0, e8, MX4(), ta, ma 8 | vmv.v.i v0, 0 9 | 1: 10 | vsetvli t0, a2, e8, MX(), ta, ma 11 | vle8.v v0, (a1) 12 | vsseg4e8.v v0, (a0) 13 | add a1, a1, t0 14 | sub a2, a2, t0 15 | slli t0, t0, 2 16 | add a0, a0, t0 17 | bnez a2, 1b 18 | ret 19 | 20 | 21 | .global MX(ascii_to_utf32_rvv_ext_) 22 | MX(ascii_to_utf32_rvv_ext_): 23 | 1: 24 | vsetvli t0, a2, e8, MX(), ta, ma 25 | vle8.v v0, (a1) 26 | vsetvli x0, x0, e32, MX4(), ta, ma 27 | vzext.vf4 v8, v0 28 | vse32.v v8, (a0) 29 | add a1, a1, t0 30 | sub a2, a2, t0 31 | slli t0, t0, 2 32 | add a0, a0, t0 33 | bnez a2, 1b 34 | ret 35 | 36 | 37 | .global MX(ascii_to_utf32_rvv_vss_) 38 | MX(ascii_to_utf32_rvv_vss_): 39 | vsetvli t0, x0, e8, MX4(), ta, ma 40 | vmv.v.i v0, 0 41 | li a3, 4 42 | 1: 43 | vsetvli t0, a2, e32, MX4(), ta, ma 44 | vse32.v v0, (a0) 45 | 46 | vsetvli t0, a2, e8, MX(), ta, ma 47 | vle8.v v8, (a1) 48 | vsse8.v v8, (a0), a3 49 | 50 | add a1, a1, t0 51 | sub a2, a2, t0 52 | slli t0, t0, 2 53 | add a0, a0, t0 54 | bnez a2, 1b 55 | ret 56 | 57 | #endif 58 | #endif 59 | 60 | -------------------------------------------------------------------------------- /bench/ascii_to_utf32.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void 4 | ascii_to_utf32_scalar(uint32_t *restrict dest, uint8_t const *restrict src, size_t len) 5 | { 6 | while (len--) BENCH_CLOBBER(), *dest++ = *src++; 7 | } 8 | 9 | void 10 | ascii_to_utf32_scalar_autovec(uint32_t *restrict dest, uint8_t const *restrict src, size_t len) 11 | { 12 | while (len--) *dest++ = *src++; 13 | } 14 | 15 | #define IMPLS(f) \ 16 | f(scalar) f(scalar_autovec) \ 17 | f(rvv_ext_m1) f(rvv_ext_m2) \ 18 | f(rvv_vsseg_m1) f(rvv_vsseg_m2) \ 19 | f(rvv_vss_m1) f(rvv_vss_m2) \ 20 | 21 | typedef void Func(uint32_t *restrict dest, uint8_t const *restrict src, size_t len); 22 | 23 | #define DECLARE(f) extern Func ascii_to_utf32_##f; 24 | IMPLS(DECLARE) 25 | 26 | #define EXTRACT(f) { #f, &ascii_to_utf32_##f }, 27 | Impl impls[] = { IMPLS(EXTRACT) }; 28 | 29 | uint32_t *dest; 30 | uint8_t *src; 31 | 32 | void init(void) { } 33 | 34 | ux checksum(size_t n) { 35 | ux sum = 0; 36 | for (size_t i = 0; i < n+9; ++i) 37 | sum = uhash(sum) + dest[i]; 38 | return sum; 39 | } 40 | 41 | void common(size_t n, size_t dOff, size_t sOff) { 42 | dest = (uint32_t*)mem + dOff/4; 43 | src = (uint8_t*)(dest + 9 + MAX_MEM/5) + sOff; 44 | bench_memrand(src, n+9); 
45 | for (size_t i = 0; i < n+9; ++i) src[i] |= 0x7F; 46 | memset(dest, 1, (n+9)*4); 47 | } 48 | 49 | BENCH_BEG(base) { 50 | common(n, bench_urand() & 255, bench_urand() & 255); 51 | TIME f(dest, src, n); 52 | } BENCH_END 53 | 54 | BENCH_BEG(aligned) { 55 | common(n, 0, 0); 56 | TIME f(dest, src, n); 57 | } BENCH_END 58 | 59 | Bench benches[] = { 60 | BENCH( impls, MAX_MEM/5 - 512-9*2, "ascii to utf32", bench_base ), 61 | BENCH( impls, MAX_MEM/5 - 512-9*2, "ascii to utf32 aligned", bench_aligned ), 62 | }; BENCH_MAIN(benches) 63 | 64 | -------------------------------------------------------------------------------- /bench/base64_encode.S: -------------------------------------------------------------------------------- 1 | // Code generated using clang-20 from: 2 | // https://github.com/camel-cdr/rvv-playground/blob/main/base64-encode.c 3 | // which was slighly modified to remove a GPR spill: 4 | // https://godbolt.org/z/vqYMv4r9c 5 | 6 | #if MX_N == 4 7 | 8 | .global b64_encode_rvv_LUT64 9 | b64_encode_rvv_LUT64: 10 | mv a4, a2 11 | mv a2, a1 12 | vsetvli a6, zero, e8, m1, ta, ma 13 | slli a5, a6, 2 14 | bgeu a4, a5, .LBB0_3 15 | mv a1, a0 16 | .LBB0_2: 17 | sub a0, a1, a0 18 | mv a3, a4 19 | tail b64_encode_scalar_tail 20 | .LBB0_3: 21 | vid.v v24 22 | li t2, 3 23 | lui t3, 4128 24 | lui a7, 96 25 | lui t0, 128 26 | vsetvli zero, a5, e8, m4, ta, ma 27 | vid.v v20 28 | li a1, 64 29 | vsetvli zero, a1, e8, m4, ta, ma 30 | vle8.v v8, (a3) 31 | srli t1, a6, 2 32 | addi a7, a7, 10 33 | addi t0, t0, 4 34 | slli a1, t1, 1 35 | vsetvli a3, zero, e32, m4, ta, ma 36 | vmv.v.x v12, a7 37 | slli a3, t1, 3 38 | vmv.v.x v16, t0 39 | add a7, a1, t1 40 | sub t0, a3, a1 41 | add t1, t1, a3 42 | li a1, 63 43 | vsetvli zero, zero, e8, m1, ta, ma 44 | vsrl.vi v24, v24, 2 45 | addi t3, t3, 1 46 | vsetvli zero, a5, e8, m4, ta, ma 47 | vand.vi v20, v20, 1 48 | vmsne.vi v0, v20, 0 49 | vmv.v.x v20, a1 50 | vsetvli a3, zero, e8, m1, ta, ma 51 | vmul.vx v24, v24, t2 52 | vsetvli zero, a6, e32, m1, ta, ma 53 | vadd.vx v7, v24, t3 54 | slli a3, a6, 1 55 | add t3, a3, a6 56 | bgeu a1, a6, .LBB0_9 57 | vsetvli a1, zero, e16, m2, ta, ma 58 | vid.v v10 59 | lui a1, 32769 60 | vsrl.vi v10, v10, 2 61 | slli a1, a1, 21 62 | vmul.vx v10, v10, t2 63 | addi a1, a1, 1 64 | vsetvli zero, a6, e64, m2, ta, ma 65 | vadd.vx v10, v10, a1 66 | li t2, 257 67 | mv a1, a0 68 | j .LBB0_7 69 | .LBB0_5: 70 | vrgather.vv v24, v9, v7 71 | vrgather.vv v25, v26, v7 72 | vrgather.vv v26, v27, v7 73 | vrgather.vv v27, v28, v7 74 | .LBB0_6: 75 | vsetvli zero, a5, e16, m4, ta, ma 76 | vsrl.vv v28, v24, v12 77 | vsll.vv v24, v24, v16 78 | sub a4, a4, t3 79 | add a2, a2, t3 80 | vsetvli zero, a5, e8, m4, ta, ma 81 | vmerge.vvm v24, v28, v24, v0 82 | vand.vv v24, v24, v20 83 | vsetvli a3, zero, e8, m1, ta, ma 84 | vrgather.vv v28, v8, v24 85 | vrgather.vv v29, v8, v25 86 | vrgather.vv v30, v8, v26 87 | vrgather.vv v31, v8, v27 88 | vsetvli zero, a5, e8, m4, ta, ma 89 | vse8.v v28, (a1) 90 | add a1, a1, a5 91 | bltu a4, a5, .LBB0_2 92 | .LBB0_7: 93 | vsetvli a3, zero, e8, m1, ta, ma 94 | vle8.v v9, (a2) 95 | add a3, a2, a7 96 | vle8.v v26, (a3) 97 | add a3, a2, t0 98 | vle8.v v27, (a3) 99 | add a3, a2, t1 100 | vle8.v v28, (a3) 101 | bltu a6, t2, .LBB0_5 102 | vrgatherei16.vv v24, v9, v10 103 | vrgatherei16.vv v25, v26, v10 104 | vrgatherei16.vv v26, v27, v10 105 | vrgatherei16.vv v27, v28, v10 106 | j .LBB0_6 107 | .LBB0_9: 108 | li t2, 31 109 | vsetvli a1, zero, e8, m2, ta, ma 110 | mv a1, a0 111 | j .LBB0_12 112 | .LBB0_10: 113 | vsetvli a3, zero, 
e8, m2, ta, ma 114 | vrgather.vv v24, v8, v28 115 | vrgather.vv v26, v8, v30 116 | .LBB0_11: 117 | vsetvli zero, a5, e8, m4, ta, ma 118 | vse8.v v24, (a1) 119 | sub a4, a4, t3 120 | add a2, a2, t3 121 | add a1, a1, a5 122 | bltu a4, a5, .LBB0_2 123 | .LBB0_12: 124 | vsetvli a3, zero, e8, m1, ta, ma 125 | vle8.v v25, (a2) 126 | add a3, a2, a7 127 | vle8.v v26, (a3) 128 | add a3, a2, t0 129 | vle8.v v27, (a3) 130 | add a3, a2, t1 131 | vle8.v v28, (a3) 132 | vrgather.vv v24, v25, v7 133 | vrgather.vv v25, v26, v7 134 | vrgather.vv v26, v27, v7 135 | vrgather.vv v27, v28, v7 136 | vsetvli zero, a5, e16, m4, ta, ma 137 | vsrl.vv v28, v24, v12 138 | vsll.vv v24, v24, v16 139 | vsetvli zero, a5, e8, m4, ta, ma 140 | vmerge.vvm v24, v28, v24, v0 141 | vand.vv v28, v24, v20 142 | bltu t2, a6, .LBB0_10 143 | vrgather.vv v24, v8, v28 144 | j .LBB0_11 145 | 146 | .global b64_encode_rvv_seg_LUT64 147 | b64_encode_rvv_seg_LUT64: 148 | mv a4, a2 149 | mv a2, a1 150 | vsetvli a7, zero, e8, m1, ta, ma 151 | slli a5, a7, 2 152 | bgeu a4, a5, .LBB1_2 153 | mv a1, a0 154 | sub a0, a0, a0 155 | mv a3, a4 156 | tail b64_encode_scalar_tail 157 | .LBB1_2: 158 | li a6, 64 159 | vsetvli zero, a6, e8, m4, ta, ma 160 | vle8.v v8, (a3) 161 | li a3, 63 162 | vsetvli a1, zero, e8, m1, ta, ma 163 | vmv.v.x v12, a3 164 | slli a3, a7, 1 165 | add t0, a3, a7 166 | bltu a7, a6, .LBB1_6 167 | li a6, 4 168 | li a7, 16 169 | mv a1, a0 170 | .LBB1_4: 171 | vlseg3e8.v v9, (a2) 172 | sub a4, a4, t0 173 | add a2, a2, t0 174 | vand.vv v13, v11, v12 175 | vsrl.vi v11, v11, 6 176 | vsrl.vi v14, v10, 4 177 | vsrl.vi v15, v9, 2 178 | vmacc.vx v11, a6, v10 179 | vmacc.vx v14, a7, v9 180 | vrgather.vv v16, v8, v15 181 | vand.vv v9, v11, v12 182 | vand.vv v10, v14, v12 183 | vrgather.vv v17, v8, v10 184 | vrgather.vv v18, v8, v9 185 | vrgather.vv v19, v8, v13 186 | vsseg4e8.v v16, (a1) 187 | add a1, a1, a5 188 | bgeu a4, a5, .LBB1_4 189 | .LBB1_5: 190 | sub a0, a1, a0 191 | mv a3, a4 192 | tail b64_encode_scalar_tail 193 | .LBB1_6: 194 | li a1, 31 195 | bgeu a1, a7, .LBB1_9 196 | vsetvli a1, zero, e8, m2, ta, ma 197 | li a6, 4 198 | li a7, 16 199 | mv a1, a0 200 | .LBB1_8: 201 | vsetvli a3, zero, e8, m1, ta, ma 202 | vlseg3e8.v v13, (a2) 203 | sub a4, a4, t0 204 | add a2, a2, t0 205 | vand.vv v11, v15, v12 206 | vsrl.vi v10, v15, 6 207 | vsrl.vi v15, v14, 4 208 | vmacc.vx v10, a6, v14 209 | vmacc.vx v15, a7, v13 210 | vand.vv v10, v10, v12 211 | vand.vv v17, v15, v12 212 | vsrl.vi v16, v13, 2 213 | vsetvli a3, zero, e8, m2, ta, ma 214 | vrgather.vv v20, v8, v16 215 | vrgather.vv v22, v8, v10 216 | vsetvli a3, zero, e8, m1, ta, ma 217 | vsseg4e8.v v20, (a1) 218 | add a1, a1, a5 219 | bgeu a4, a5, .LBB1_8 220 | j .LBB1_5 221 | .LBB1_9: 222 | li a6, 4 223 | li a7, 16 224 | mv a1, a0 225 | .LBB1_10: 226 | vlseg3e8.v v13, (a2) 227 | sub a4, a4, t0 228 | add a2, a2, t0 229 | vand.vv v19, v15, v12 230 | vsrl.vi v15, v15, 6 231 | vmacc.vx v15, a6, v14 232 | vand.vv v18, v15, v12 233 | vsrl.vi v14, v14, 4 234 | vmacc.vx v14, a7, v13 235 | vand.vv v17, v14, v12 236 | vsrl.vi v16, v13, 2 237 | vsetvli zero, a5, e8, m4, ta, ma 238 | vrgather.vv v20, v8, v16 239 | vsetvli a3, zero, e8, m1, ta, ma 240 | vsseg4e8.v v20, (a1) 241 | add a1, a1, a5 242 | bgeu a4, a5, .LBB1_10 243 | j .LBB1_5 244 | 245 | .global b64_encode_rvv_LUT16 246 | b64_encode_rvv_LUT16: 247 | mv a4, a2 248 | mv a2, a1 249 | vsetvli t3, zero, e8, m1, ta, ma 250 | slli a5, t3, 2 251 | bgeu a4, a5, .LBB2_2 252 | mv a1, a0 253 | sub a0, a0, a0 254 | mv a3, a4 255 | tail 
b64_encode_scalar_tail 256 | .LBB2_2: 257 | li t2, 3 258 | lui a6, 96 259 | lui a7, 128 260 | vsetvli zero, a5, e8, m4, ta, ma 261 | vid.v v8 262 | li t0, 63 263 | addi a3, a3, 64 264 | srli a1, t3, 2 265 | addi a6, a6, 10 266 | addi a7, a7, 4 267 | vand.vi v20, v8, 1 268 | vmv.v.x v12, t0 269 | vsetivli zero, 16, e8, m1, ta, ma 270 | vle8.v v9, (a3) 271 | slli t1, a1, 1 272 | slli t0, a1, 3 273 | vsetvli a3, zero, e32, m4, ta, ma 274 | vmv.v.x v16, a6 275 | vsetvli zero, a5, e8, m4, ta, ma 276 | vmsne.vi v8, v20, 0 277 | vsetvli a3, zero, e32, m4, ta, ma 278 | vmv.v.x v20, a7 279 | add a6, t1, a1 280 | sub a7, t0, t1 281 | add t0, t0, a1 282 | slli a3, t3, 1 283 | li a1, 257 284 | add t4, a3, t3 285 | bgeu t3, a1, .LBB2_6 286 | vsetvli zero, zero, e8, m1, ta, ma 287 | vid.v v10 288 | lui a1, 4128 289 | li t1, 51 290 | vsrl.vi v10, v10, 2 291 | addi a1, a1, 1 292 | vmul.vx v10, v10, t2 293 | vsetvli zero, t3, e32, m1, ta, ma 294 | vadd.vx v10, v10, a1 295 | li t2, 26 296 | mv a1, a0 297 | .LBB2_4: 298 | vsetvli a3, zero, e8, m1, ta, ma 299 | vmv1r.v v0, v8 300 | vle8.v v11, (a2) 301 | add a3, a2, a6 302 | vle8.v v26, (a3) 303 | add a3, a2, a7 304 | vle8.v v27, (a3) 305 | add a3, a2, t0 306 | sub a4, a4, t4 307 | add a2, a2, t4 308 | vle8.v v28, (a3) 309 | vrgather.vv v24, v11, v10 310 | vrgather.vv v25, v26, v10 311 | vrgather.vv v26, v27, v10 312 | vrgather.vv v27, v28, v10 313 | vsetvli zero, a5, e16, m4, ta, ma 314 | vsrl.vv v28, v24, v16 315 | vsll.vv v24, v24, v20 316 | vsetvli zero, a5, e8, m4, ta, ma 317 | vmerge.vvm v24, v28, v24, v0 318 | vand.vv v24, v24, v12 319 | vmsltu.vx v0, v24, t2 320 | vssubu.vx v28, v24, t1 321 | vmerge.vim v28, v28, 13, v0 322 | vsetvli a3, zero, e8, m1, ta, ma 323 | vrgather.vv v4, v9, v28 324 | vrgather.vv v5, v9, v29 325 | vrgather.vv v6, v9, v30 326 | vrgather.vv v7, v9, v31 327 | vsetvli zero, a5, e8, m4, ta, ma 328 | vadd.vv v24, v24, v4 329 | vse8.v v24, (a1) 330 | add a1, a1, a5 331 | bgeu a4, a5, .LBB2_4 332 | .LBB2_5: 333 | sub a0, a1, a0 334 | mv a3, a4 335 | tail b64_encode_scalar_tail 336 | .LBB2_6: 337 | vsetvli zero, zero, e16, m2, ta, ma 338 | vid.v v10 339 | lui a1, 32769 340 | li t1, 51 341 | vsrl.vi v10, v10, 2 342 | slli a1, a1, 21 343 | vmul.vx v10, v10, t2 344 | addi a1, a1, 1 345 | vsetvli zero, t3, e64, m2, ta, ma 346 | vadd.vx v10, v10, a1 347 | li t2, 26 348 | mv a1, a0 349 | .LBB2_7: 350 | vsetvli a3, zero, e8, m1, ta, ma 351 | vmv1r.v v0, v8 352 | vle8.v v25, (a2) 353 | add a3, a2, a6 354 | vle8.v v26, (a3) 355 | add a3, a2, a7 356 | vle8.v v27, (a3) 357 | add a3, a2, t0 358 | sub a4, a4, t4 359 | add a2, a2, t4 360 | vle8.v v28, (a3) 361 | vrgatherei16.vv v24, v25, v10 362 | vrgatherei16.vv v25, v26, v10 363 | vrgatherei16.vv v26, v27, v10 364 | vrgatherei16.vv v27, v28, v10 365 | vsetvli zero, a5, e16, m4, ta, ma 366 | vsrl.vv v28, v24, v16 367 | vsll.vv v24, v24, v20 368 | vsetvli zero, a5, e8, m4, ta, ma 369 | vmerge.vvm v24, v28, v24, v0 370 | vand.vv v24, v24, v12 371 | vmsltu.vx v0, v24, t2 372 | vssubu.vx v28, v24, t1 373 | vmerge.vim v28, v28, 13, v0 374 | vsetvli a3, zero, e8, m1, ta, ma 375 | vrgather.vv v4, v9, v28 376 | vrgather.vv v5, v9, v29 377 | vrgather.vv v6, v9, v30 378 | vrgather.vv v7, v9, v31 379 | vsetvli zero, a5, e8, m4, ta, ma 380 | vadd.vv v24, v24, v4 381 | vse8.v v24, (a1) 382 | add a1, a1, a5 383 | bgeu a4, a5, .LBB2_7 384 | j .LBB2_5 385 | 386 | .global b64_encode_rvv_seg_LUT16 387 | b64_encode_rvv_seg_LUT16: 388 | mv a4, a2 389 | mv a2, a1 390 | vsetvli a1, zero, e8, m1, ta, ma 391 | slli a5, 
a1, 2 392 | bgeu a4, a5, .LBB3_2 393 | mv a1, a0 394 | sub a0, a0, a0 395 | mv a3, a4 396 | tail b64_encode_scalar_tail 397 | .LBB3_2: 398 | li a6, 63 399 | addi a7, a3, 64 400 | slli a3, a1, 1 401 | vmv.v.x v8, a6 402 | vsetivli zero, 16, e8, m1, ta, ma 403 | vle8.v v9, (a7) 404 | add t2, a3, a1 405 | li a6, 4 406 | li a7, 16 407 | li t0, 51 408 | li t1, 26 409 | mv a1, a0 410 | .LBB3_3: 411 | vsetvli a3, zero, e8, m1, ta, ma 412 | vlseg3e8.v v10, (a2) 413 | sub a4, a4, t2 414 | add a2, a2, t2 415 | vand.vv v15, v12, v8 416 | vsrl.vi v12, v12, 6 417 | vsrl.vi v13, v11, 4 418 | vmacc.vx v12, a6, v11 419 | vmacc.vx v13, a7, v10 420 | vand.vv v14, v12, v8 421 | vand.vv v13, v13, v8 422 | vsrl.vi v12, v10, 2 423 | vsetvli zero, a5, e8, m4, ta, ma 424 | vmsltu.vx v0, v12, t1 425 | vssubu.vx v16, v12, t0 426 | vmerge.vim v16, v16, 13, v0 427 | vsetvli a3, zero, e8, m1, ta, ma 428 | vrgather.vv v20, v9, v16 429 | vrgather.vv v21, v9, v17 430 | vrgather.vv v22, v9, v18 431 | vrgather.vv v23, v9, v19 432 | vsetvli zero, a5, e8, m4, ta, ma 433 | vadd.vv v12, v12, v20 434 | vsetvli a3, zero, e8, m1, ta, ma 435 | vsseg4e8.v v12, (a1) 436 | add a1, a1, a5 437 | bgeu a4, a5, .LBB3_3 438 | sub a0, a1, a0 439 | mv a3, a4 440 | tail b64_encode_scalar_tail 441 | 442 | #endif 443 | 444 | -------------------------------------------------------------------------------- /bench/base64_encode.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | size_t 4 | b64_encode_scalar(uint8_t *dst, const uint8_t *src, size_t length, const uint8_t LUTs[64+16]) 5 | { 6 | uint8_t *dstBeg = dst; 7 | for (; length >= 3; length -= 3, src += 3, dst += 4) { 8 | uint32_t u32 = src[0] << 16 | src[1] << 8 | src[2]; 9 | dst[0] = LUTs[(u32 >> 18) & 63]; 10 | dst[1] = LUTs[(u32 >> 12) & 63]; 11 | dst[2] = LUTs[(u32 >> 6) & 63]; 12 | dst[3] = LUTs[(u32 >> 0) & 63]; 13 | } 14 | if (length > 0) { 15 | uint32_t u32 = src[0] << 8 | (length > 1 ? src[1] : 0); 16 | *dst++ = LUTs[(u32 >> 10) & 63]; 17 | *dst++ = LUTs[(u32 >> 4) & 63]; 18 | *dst++ = length > 1 ? 
LUTs[(u32 << 2) & 63] : '='; 19 | *dst++ = '='; 20 | } 21 | return dst - dstBeg; 22 | } 23 | 24 | static uint8_t base64LUTs[64 + 16] = 25 | "ABCDEFGHIJKLMNOPQRSTUVWXYZ" 26 | "abcdefghijklmnopqrstuvwxyz" 27 | "0123456789" 28 | "+/" 29 | "\x47\xfc\xfc\xfc\xfc\xfc\xfc\xfc\xfc\xfc\xfc\xed\xf0\x41" 30 | // 'a'-26, 10x '0' - 52, '+' - 62, '/' - 63, 'A' 31 | ; 32 | 33 | /* used to prevent GPR spill in vectorized implementations */ 34 | size_t 35 | b64_encode_scalar_tail(size_t prefix, uint8_t *dst, const uint8_t *src, size_t length) 36 | { 37 | return prefix + b64_encode_scalar(dst, src, length, base64LUTs); 38 | } 39 | 40 | 41 | #define IMPLS(f) \ 42 | f(scalar) \ 43 | f(rvv_LUT64) f(rvv_LUT16) \ 44 | f(rvv_seg_LUT64) f(rvv_seg_LUT16) 45 | 46 | typedef size_t Func(uint8_t *dst, const uint8_t *src, size_t length, const uint8_t LUTs[64+16]); 47 | 48 | #define DECLARE(f) extern Func b64_encode_##f; 49 | IMPLS(DECLARE) 50 | 51 | #define EXTRACT(f) { #f, &b64_encode_##f }, 52 | Impl impls[] = { IMPLS(EXTRACT) }; 53 | 54 | uint8_t *dest, *src; 55 | size_t last; 56 | 57 | void init(void) { } 58 | 59 | ux checksum(size_t n) { 60 | ux sum = last; 61 | for (size_t i = 0; i < last+9; ++i) 62 | sum = uhash(sum) + dest[i]; 63 | return sum; 64 | } 65 | 66 | BENCH_BEG(base) { 67 | src = mem; 68 | dest = mem + MAX_MEM/3; 69 | memset(dest, 0, n*2+9); 70 | TIME last = f(dest, src, n, base64LUTs); 71 | } BENCH_END 72 | 73 | Bench benches[] = { 74 | BENCH( impls, MAX_MEM/3, "base64 encode", bench_base ), 75 | }; BENCH_MAIN(benches) 76 | 77 | -------------------------------------------------------------------------------- /bench/bench.h: -------------------------------------------------------------------------------- 1 | #include "config.h" 2 | #include "../nolibc.h" 3 | 4 | #ifndef BENCH_NEXT 5 | # define BENCH_NEXT NEXT 6 | #endif 7 | 8 | #define MX(f,F) f(F##_m1) f(F##_m2) f(F##_m4) f(F##_m8) 9 | #define STR(x) STR_(x) 10 | #define STR_(x) #x 11 | 12 | #if defined(__clang__) || defined(__GNUC__) || defined(__INTEL_COMPILER) 13 | 14 | # define BENCH_CLOBBER() ({__asm__ volatile("":::"memory");}) 15 | # define BENCH_VOLATILE(x) ({__asm__ volatile("" : "+g"(x) : "g"(x) : "memory");}) 16 | # define BENCH_VOLATILE_REG(x) ({__asm__ volatile("" : "+r"(x) : "r"(x) : "memory");}) 17 | # define BENCH_VOLATILE_MEM(x) ({__asm__ volatile("" : "+m"(x) : "m"(x) : "memory");}) 18 | 19 | #define BENCH_MAY_ALIAS __attribute__((__may_alias__)) 20 | 21 | #else 22 | 23 | # define BENCH_CLOBBER() 24 | # define BENCH_CLOBBER_WITH(x) (bench__use_ptr(&(x)), BENCH_CLOBBER()) 25 | # define BENCH_CLOBBER_WITH_REG(x) (bench__use_ptr(&(x)), BENCH_CLOBBER()) 26 | # define BENCH_CLOBBER_WITH_MEM(x) (bench__use_ptr(&(x)), BENCH_CLOBBER()) 27 | static void bench_use_ptr(char const volatile *x) {} 28 | 29 | #define BENCH_MAY_ALIAS 30 | 31 | #endif 32 | 33 | 34 | static int 35 | compare_ux(void const *a, void const *b) 36 | { 37 | ux A = *(ux*)a, B = *(ux*)b; 38 | return A < B ? -1 : A > B ? 
1 : 0; 39 | } 40 | 41 | static URand randState = { 123, 456, 789 }; 42 | static ux bench_urand(void) { return urand(&randState); } 43 | static float bench_urandf(void) { return urandf(&randState); } 44 | static void bench_memrand(void *ptr, size_t n) { return memrand(&randState, ptr, n); } 45 | 46 | typedef struct { 47 | char const *name; void *func; 48 | } Impl; 49 | typedef struct { 50 | Impl *impls; 51 | size_t nImpls; 52 | size_t N; 53 | char const *name; 54 | ux (*func)(void *, size_t); 55 | } Bench; 56 | 57 | static unsigned char *mem = 0; 58 | 59 | void bench_main(void); 60 | ux checksum(size_t n); 61 | void init(void); 62 | 63 | #if __STDC_HOSTED__ && !defined(CUSTOM_HOST) 64 | # include 65 | #else 66 | static ux heap[1 + MAX_MEM / sizeof(ux)]; 67 | #endif 68 | 69 | 70 | int 71 | main(void) 72 | { 73 | 74 | #if __STDC_HOSTED__ && !defined(CUSTOM_HOST) 75 | mem = malloc(MAX_MEM); 76 | #else 77 | mem = (unsigned char*)heap; 78 | #endif 79 | 80 | size_t x; 81 | randState.x ^= rv_cycles()*7; 82 | randState.y += rv_cycles() ^ ((uintptr_t)&x + 666*(uintptr_t)mem); 83 | 84 | /* initialize memory */ 85 | bench_memrand(mem, MAX_MEM); 86 | 87 | init(); 88 | bench_main(); 89 | #if __STDC_HOSTED__ && !defined(CUSTOM_HOST) 90 | free(mem); 91 | #endif 92 | return 0; 93 | } 94 | 95 | static fx 96 | bench_time(size_t n, Impl impl, Bench bench) 97 | { 98 | static ux arr[MAX_REPEATS]; 99 | size_t total = 0, repeats = 0; 100 | for (; repeats < MAX_REPEATS; ++repeats) { 101 | total += arr[repeats] = bench.func(impl.func, n); 102 | if (repeats > MIN_REPEATS && total > STOP_CYCLES) 103 | break; 104 | } 105 | #if MAX_REPEATS > 4 106 | qsort(arr, repeats, sizeof *arr, compare_ux); 107 | ux sum = 0, count = 0; 108 | for (size_t i = repeats * 0.2f; i < repeats * 0.8f; ++i, ++count) 109 | sum += arr[i]; 110 | #else 111 | ux sum = 0, count = repeats; 112 | for (size_t i = 0; i < repeats; ++i) 113 | sum += arr[i]; 114 | #endif 115 | return n / ((fx)sum / count); 116 | } 117 | 118 | static void 119 | bench_run(Bench *benches, size_t nBenches) 120 | { 121 | for (Bench *b = benches; b != benches + nBenches; ++b) { 122 | print("{\ntitle: \"")(s,b->name)("\",\n"); 123 | print("labels: [\"0\","); 124 | for (size_t i = 0; i < b->nImpls; ++i) 125 | print("\"")(s,b->impls[i].name)("\","); 126 | print("],\n"); 127 | 128 | size_t N = b->N; 129 | print("data: [\n["); 130 | for (size_t n = 1; n < N; n = BENCH_NEXT(n)) 131 | print(u,n)(","); 132 | print("],\n")(flush,); 133 | 134 | for (Impl *i = b->impls; i != b->impls + b->nImpls; ++i) { 135 | print("["); 136 | for (size_t n = 1; n < N; n = BENCH_NEXT(n)) { 137 | #if VALIDATE 138 | ux si = 0, s0 = 0; 139 | if (i != b->impls) { 140 | URand seed = randState; 141 | (void)b->func(i->func, n); 142 | si = checksum(n); 143 | 144 | randState = seed; 145 | (void)b->func(b->impls[0].func, n); 146 | s0 = checksum(n); 147 | } 148 | 149 | if (si != s0) { 150 | print("ERROR: ")(s,i->name)(" in ")(s,b->name)(" at ")(u,n)(flush,); 151 | exit(EXIT_FAILURE); 152 | } 153 | #endif 154 | 155 | print(f,bench_time(n, *i, *b))(",")(flush,); 156 | } 157 | print("],\n")(flush,); 158 | } 159 | print("]\n},\n"); 160 | } 161 | } 162 | 163 | #define TIME \ 164 | for (ux beg = rv_cycles(), _once = 1; _once; \ 165 | rv_fencei(), \ 166 | _cycles += rv_cycles() - beg, _once = 0) 167 | 168 | #define BENCH_BEG(name) \ 169 | ux bench_##name(void *_func, size_t n) { \ 170 | Func *f = _func; ux _cycles = 0; 171 | #define BENCH_END return _cycles; } 172 | 173 | #define BENCH(impls, ...) 
{ impls, ARR_LEN(impls), __VA_ARGS__ } 174 | 175 | #define BENCH_MAIN(benches) \ 176 | void bench_main(void) { \ 177 | bench_run(benches, ARR_LEN(benches)); \ 178 | } 179 | 180 | -------------------------------------------------------------------------------- /bench/byteswap.S: -------------------------------------------------------------------------------- 1 | 2 | #if defined(MX) && __riscv_zvbb 3 | .global MX(byteswap32_rvv_vrev8_) 4 | MX(byteswap32_rvv_vrev8_): 5 | 1: 6 | vsetvli t0, a1, e32, MX(), ta, ma 7 | vle32.v v0, (a0) 8 | vrev8.v v8, v0 9 | vse32.v v8, (a0) 10 | sub a1, a1, t0 11 | slli t1, t0, 2 12 | add a0, a0, t1 13 | bnez a1, 1b 14 | ret 15 | #endif 16 | 17 | #if MX_N == 4 || MX_N == 2 || MX_N == 1 18 | 19 | # a0 = ptr, a1 = len 20 | .global MX(byteswap32_rvv_gatherei16_) 21 | MX(byteswap32_rvv_gatherei16_): 22 | vsetvli t0, x0, e16, MX2(), ta, ma 23 | vid.v v0 24 | vand.vi v8, v0, 3 25 | vrsub.vi v8, v8, 3 26 | vsrl.vi v0, v0, 2 27 | vsll.vi v0, v0, 2 28 | vadd.vv v0, v0, v8 # i/8*8 + (7-1%8) 29 | 1: 30 | vsetvli t0, a1, e32, MX(), ta, ma 31 | vle32.v v8, (a0) 32 | slli t1, t0, 2 33 | vsetvli x0, t1, e8, MX(), ta, ma 34 | vrgatherei16.vv v16, v8, v0 35 | vsetvli x0, t0, e32, MX(), ta, ma 36 | vse32.v v16, (a0) 37 | sub a1, a1, t0 38 | add a0, a0, t1 39 | bnez a1, 1b 40 | ret 41 | #endif 42 | 43 | #if MX_N == 2 44 | 45 | .macro byteswap32_rvv_m1_gatherei16s n 46 | .global byteswap32_rvv_m1_gatherei16s_m\n 47 | byteswap32_rvv_m1_gatherei16s_m\n: 48 | vsetvli t0, x0, e16, MX(), ta, ma 49 | vid.v v0 50 | vand.vi v8, v0, 3 51 | vrsub.vi v8, v8, 3 52 | vsrl.vi v0, v0, 2 53 | vsll.vi v0, v0, 2 54 | vadd.vv v0, v0, v8 # i/8*8 + (7-1%8) 55 | 1: 56 | vsetvli t0, a1, e32, m\n, ta, ma 57 | vle32.v v8, (a0) 58 | vsetvli t1, x0, e8, m1, ta, ma 59 | vrgatherei16.vv v16, v8, v0 60 | .ifge \n-2 61 | vrgatherei16.vv v17, v9, v0 62 | .ifge \n-4 63 | vrgatherei16.vv v18, v10, v0 64 | vrgatherei16.vv v19, v11, v0 65 | .ifge \n-8 66 | vrgatherei16.vv v20, v12, v0 67 | vrgatherei16.vv v21, v13, v0 68 | vrgatherei16.vv v22, v14, v0 69 | vrgatherei16.vv v23, v15, v0 70 | .endif 71 | .endif 72 | .endif 73 | vsetvli x0, t0, e32, m\n, ta, ma 74 | vse32.v v16, (a0) 75 | sub a1, a1, t0 76 | slli t0, t0, 2 77 | add a0, a0, t0 78 | bnez a1, 1b 79 | ret 80 | .endm 81 | 82 | byteswap32_rvv_m1_gatherei16s 2 83 | #endif 84 | #if MX_N == 4 85 | byteswap32_rvv_m1_gatherei16s 4 86 | #endif 87 | #if MX_N == 8 88 | byteswap32_rvv_m1_gatherei16s 8 89 | #endif 90 | 91 | -------------------------------------------------------------------------------- /bench/byteswap.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void 4 | byteswap32_scalar(uint32_t *ptr, size_t n) 5 | { 6 | for (uint8_t *p = (uint8_t*)ptr; n--; p += 4) { 7 | uint8_t p0 = p[0], p1 = p[1], p2 = p[2], p3 = p[3]; 8 | p[3] = p0; BENCH_CLOBBER(); 9 | p[2] = p1; BENCH_CLOBBER(); 10 | p[1] = p2; BENCH_CLOBBER(); 11 | p[0] = p3; BENCH_CLOBBER(); 12 | } 13 | } 14 | 15 | void 16 | byteswap32_scalar_autovec(uint32_t *ptr, size_t n) 17 | { 18 | for (uint8_t *p = (uint8_t*)ptr; n--; p += 4) { 19 | uint8_t p0 = p[0], p1 = p[1], p2 = p[2], p3 = p[3]; 20 | p[3] = p0; 21 | p[2] = p1; 22 | p[1] = p2; 23 | p[0] = p3; 24 | } 25 | } 26 | 27 | #if __riscv_zbb 28 | void 29 | byteswap32_SWAR_rev8(uint32_t *ptr, size_t n) 30 | { 31 | while (n--) { 32 | *ptr = __builtin_bswap32(*ptr); 33 | ++ptr; 34 | BENCH_CLOBBER(); 35 | } 36 | } 37 | #define REV8(f) f(SWAR_rev8) 38 | #else 39 | #define REV8(f) 40 | #endif 41 
| 42 | 43 | /* we don't support these on XTheadVector */ 44 | #ifndef __riscv_vector 45 | #define IMPLS_RVV(f) 46 | #else 47 | #define IMPLS_RVV(f) \ 48 | f(rvv_gatherei16_m1) \ 49 | f(rvv_gatherei16_m2) \ 50 | f(rvv_gatherei16_m4) \ 51 | f(rvv_m1_gatherei16s_m2) \ 52 | f(rvv_m1_gatherei16s_m4) \ 53 | f(rvv_m1_gatherei16s_m8) 54 | #endif 55 | 56 | #if __riscv_zvbb 57 | #define IMPLS_ZVBB(f) MX(f,rvv_vrev8) 58 | #else 59 | #define IMPLS_ZVBB(f) 60 | #endif 61 | 62 | 63 | #define IMPLS(f) \ 64 | f(scalar) \ 65 | f(scalar_autovec) \ 66 | REV8(f) \ 67 | IMPLS_ZVBB(f) \ 68 | IMPLS_RVV(f) 69 | 70 | typedef void Func(uint32_t *ptr, size_t n); 71 | 72 | #define DECLARE(f) extern Func byteswap32_##f; 73 | IMPLS(DECLARE) 74 | 75 | #define EXTRACT(f) { #f, &byteswap32_##f }, 76 | Impl impls[] = { IMPLS(EXTRACT) }; 77 | 78 | uint32_t *ptr; 79 | 80 | void init(void) { ptr = (uint32_t*)mem; } 81 | 82 | ux checksum(size_t n) { 83 | ux sum = 0; 84 | for (size_t i = 0; i < n; ++i) 85 | sum = uhash(sum) + ptr[i]; 86 | return sum; 87 | } 88 | 89 | BENCH_BEG(base) { 90 | bench_memrand(ptr, n * sizeof *ptr); 91 | TIME f(ptr, n); 92 | } BENCH_END 93 | 94 | Bench benches[] = { 95 | BENCH( impls, MAX_MEM/4, "byteswap32", bench_base ) 96 | }; BENCH_MAIN(benches) 97 | 98 | -------------------------------------------------------------------------------- /bench/chacha20.S: -------------------------------------------------------------------------------- 1 | #ifndef MX 2 | #if __riscv_xlen != 32 3 | #include "../thirdparty/rvv-chacha-poly/vchacha.s" 4 | #endif 5 | #endif 6 | -------------------------------------------------------------------------------- /bench/chacha20.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | #if __riscv_xlen != 32 3 | #include "../thirdparty/rvv-chacha-poly/boring.h" 4 | 5 | uint8_t *dest, *src; 6 | uint8_t key[32], nonce[12]; 7 | uint32_t counter; 8 | 9 | 10 | extern void vector_chacha20( 11 | uint8_t *out, const uint8_t *in, 12 | size_t in_len, const uint8_t key[32], 13 | const uint8_t nonce[12], uint32_t counter); 14 | 15 | static void 16 | chacha20_boring(void *restrict dest, void const *restrict src, size_t n) { 17 | boring_chacha20(dest, src, n, key, nonce, counter); 18 | } 19 | 20 | static void 21 | chacha20_rvv(void *restrict dest, void const *restrict src, size_t n) { 22 | vector_chacha20(dest, src, n, key, nonce, counter); 23 | } 24 | 25 | typedef void *Func(void *restrict dest, void const *restrict src, size_t n); 26 | 27 | Impl impls[] = { 28 | { "boring", &chacha20_boring }, 29 | { "rvv", &chacha20_rvv }, 30 | }; 31 | 32 | void init(void) { 33 | bench_memrand(key, sizeof key); 34 | bench_memrand(nonce, sizeof nonce); 35 | counter = 0; 36 | } 37 | 38 | ux checksum(size_t n) { 39 | ux sum = 0; 40 | for (size_t i = 0; i < n+16; ++i) 41 | sum = uhash(sum) + mem[i]; 42 | return sum; 43 | } 44 | 45 | BENCH_BEG(aligned) { 46 | memset(mem, 0, n+16); 47 | TIME f(mem, mem + MAX_MEM/2 + 16, n); 48 | } BENCH_END 49 | 50 | Bench benches[] = { 51 | BENCH( impls, MAX_MEM/2 - 16, "chacha20 aligned", bench_aligned ) 52 | }; BENCH_MAIN(benches) 53 | 54 | 55 | #include "../thirdparty/rvv-chacha-poly/boring.c" 56 | #else 57 | void init(void) {} 58 | Impl impls[] = {}; 59 | Bench benches[] = {}; 60 | BENCH_MAIN(benches) 61 | #endif 62 | -------------------------------------------------------------------------------- /bench/config.h: -------------------------------------------------------------------------------- 1 | /* the maximum number 
of bytes to allocate, minimum of 4096 */ 2 | #define MAX_MEM (1024*1024*32) 3 | /* the byte count for the next run */ 4 | #define NEXT(c) (c + c/7 + 3) 5 | 6 | /* minimum number of repeats, to sample median from */ 7 | #define MIN_REPEATS 10 8 | /* maxium number of repeats, executed until more than STOP_TIME has elapsed */ 9 | #define MAX_REPEATS 64 10 | 11 | /* stop repeats early afer this many cycles have elapsed */ 12 | #define STOP_CYCLES (1024*1024*500) 13 | 14 | /* validate against reference implementation on the first repetition */ 15 | #define VALIDATE 1 16 | 17 | /* custom scaling factors for benchmarks, these are used to make sure each 18 | * benchmark approximately takes the same amount of time. */ 19 | 20 | #define SCALE_mandelbrot(N) ((N)/10) 21 | #define SCALE_mergelines(N) ((N)/10) 22 | 23 | /* benchmark specific configurations */ 24 | #define mandelbrot_ITER 100 25 | -------------------------------------------------------------------------------- /bench/hist.S: -------------------------------------------------------------------------------- 1 | #if 0 2 | 3 | void 4 | hist_rvv_assume_no_conflict(uint32_t *hist, float *x, float *y, size_t n) 5 | { 6 | for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { 7 | vl = __riscv_vsetvl_e32m8(n); 8 | vfloat32m8_t vx = __riscv_vle32_v_f32m8(x, vl); 9 | vfloat32m8_t vy = __riscv_vle32_v_f32m8(y, vl); 10 | vfloat32m8_t vsq = __riscv_vfmacc(__riscv_vfmul(vx, vx, vl), vy, vy, vl); 11 | vfloat32m8_t v = __riscv_vfsqrt(vsq, vl); 12 | vuint32m8_t vidx = __riscv_vminu(__riscv_vfcvt_rtz_xu(v, vl), 100, vl); 13 | vidx = __riscv_vsll(vidx, 2, vl); 14 | vuint32m8_t vcnt =__riscv_vluxei32(hist, vidx, vl); 15 | vcnt = __riscv_vadd(vcnt, 1, vl); 16 | __riscv_vsuxei32(hist, vidx, vcnt, vl); 17 | } 18 | } 19 | 20 | void 21 | hist_rvv_slidedown(uint32_t *hist, float *x, float *y, size_t n) 22 | { 23 | for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { 24 | vl = __riscv_vsetvl_e32m8(n); 25 | vfloat32m8_t vx = __riscv_vle32_v_f32m8(x, vl); 26 | vfloat32m8_t vy = __riscv_vle32_v_f32m8(y, vl); 27 | vfloat32m8_t vsq = __riscv_vfmacc(__riscv_vfmul(vx, vx, vl), vy, vy, vl); 28 | vfloat32m8_t v = __riscv_vfsqrt(vsq, vl); 29 | vuint16m4_t vidx = __riscv_vminu(__riscv_vfncvt_rtz_xu(v, vl), 100, vl); 30 | 31 | for (size_t i = 0; i < vl; ++i) { 32 | size_t idx = __riscv_vmv_x(__riscv_vslidedown(vidx, i, 1)); 33 | ++hist[idx]; 34 | } 35 | } 36 | } 37 | #endif 38 | 39 | #ifdef MX 40 | 41 | .global MX(LUT4_rvv_vloxei8_) 42 | MX(LUT4_rvv_vloxei8_): 43 | 1: 44 | vsetvli a3, a2, e8, MX(), ta, ma 45 | vle8.v v8, (a1) 46 | vand.vi v8, v8, 15 47 | vloxei8.v v8, (a0), v8 48 | vse8.v v8, (a1) 49 | sub a2, a2, a3 50 | add a1, a1, a3 51 | bnez a2, 1b 52 | ret 53 | 54 | /* assumes no conflicts, which causes the wrong result */ 55 | .global MX(hist_rvv_assume_no_conflict_) 56 | MX(hist_rvv_assume_no_conflict_): 57 | li a4, 100 58 | 1: 59 | vsetvli a5, a3, e32, m8, ta, ma 60 | vle32.v v8, (a1) 61 | vle32.v v16, (a2) 62 | vfmul.vv v8, v8, v8 63 | vfmacc.vv v8, v16, v16 64 | vfsqrt.v v8, v8 65 | vfcvt.rtz.xu.f.v v8, v8 66 | vminu.vx v8, v8, a4 67 | vsll.vi v8, v8, 2 68 | vluxei32.v v16, (a0), v8 69 | vadd.vi v16, v16, 1 70 | vsuxei32.v v16, (a0), v8 71 | sub a3, a3, a5 72 | slli a5, a5, 2 73 | add a1, a1, a5 74 | add a2, a2, a5 75 | bnez a3, 1b 76 | ret 77 | 78 | 79 | 80 | .global MX(hist_rvv_slidedown_) 81 | MX(hist_rvv_slidedown_): 82 | li a6, 100 83 | j 2f 84 | 1: 85 | sub a3, a3, a7 86 | slli a5, a7, 2 87 | add a1, a1, a5 88 | add a2, a2, a5 89 | beqz a3, 4f 90 | 2: 91 | vsetvli 
a7, a3, e32, MX(), ta, ma 92 | beqz a7, 1b 93 | vle32.v v8, (a1) 94 | vle32.v v16, (a2) 95 | li a4, 0 96 | vfmul.vv v8, v8, v8 97 | vfmacc.vv v8, v16, v16 98 | vfsqrt.v v8, v8 99 | vsetvli zero, zero, e16, MXf2(), ta, ma 100 | vfncvt.rtz.xu.f.w v16, v8 101 | vminu.vx v8, v16, a6 102 | vsll.vi v8, v8, 2 103 | vsetivli zero, 1, e16, MXf2(), ta, ma 104 | 3: 105 | vslidedown.vx v12, v8, a4 106 | vmv.x.s a5, v12 107 | add t0, a0, a5 108 | lw a5, 0(t0) 109 | addi a5, a5, 1 110 | addi a4, a4, 1 111 | sw a5, 0(t0) 112 | bne a7, a4, 3b 113 | j 1b 114 | 4: 115 | ret 116 | 117 | 118 | #endif 119 | 120 | 121 | #if MX_N == 1 122 | 123 | .global MX(hist_rvv_dup_entries_) 124 | MX(hist_rvv_dup_entries_): 125 | vsetvli a6, zero, e32, m1, ta, ma 126 | beqz a3, 2f 127 | slli a5, a6, 2 128 | vmv.v.x v8, a5 129 | vid.v v9 130 | addi a5, a6, -1 131 | vand.vx v9, v9, a5 132 | vsll.vi v9, v9, 2 133 | li a5, 100 134 | 1: 135 | vsetvli a4, a3, e32, m1, ta, ma 136 | vle32.v v10, (a1) 137 | vle32.v v11, (a2) 138 | vfmul.vv v10, v10, v10 139 | vfmacc.vv v10, v11, v11 140 | vfsqrt.v v10, v10 141 | vfcvt.rtz.xu.f.v v10, v10 142 | vminu.vx v10, v10, a5 143 | vmadd.vv v10, v8, v9 144 | vluxei32.v v11, (a0), v10 145 | vadd.vi v11, v11, 1 146 | vsuxei32.v v11, (a0), v10 147 | sub a3, a3, a4 148 | slli a4, a4, 2 149 | add a1, a1, a4 150 | add a2, a2, a4 151 | bnez a3, 1b 152 | 2: 153 | vsetvli a1, zero, e32, m1, ta, ma 154 | vmv.v.i v8, 0 155 | slli a4, a6, 2 156 | addi a1, a0, 400 157 | mv a2, a0 158 | vsetvli a3, zero, e32, m1, ta, ma 159 | 3: 160 | vle32.v v9, (a0) 161 | vredsum.vs v9, v9, v8 162 | vmv.x.s t0, v9 163 | sw t0, (a2) 164 | addi a2, a2, 4 165 | add a0, a0, a4 166 | bne a2, a1, 3b 167 | ret 168 | 169 | #endif 170 | 171 | #if MX_N == 2 172 | 173 | .global MX(hist_rvv_dup_entries_) 174 | MX(hist_rvv_dup_entries_): 175 | vsetvli a6, zero, e32, m1, ta, ma 176 | beqz a3, 2f 177 | slli a5, a6, 2 178 | slli a4, a6, 1 179 | vsetvli zero, a4, e32, m2, ta, ma 180 | vmv.v.x v8, a5 181 | vid.v v10 182 | addi a4, a6, -1 183 | vand.vx v10, v10, a4 184 | vsll.vi v10, v10, 2 185 | li a7, 100 186 | 1: 187 | vsetvli a4, a3, e32, m2, ta, ma 188 | vle32.v v12, (a1) 189 | vle32.v v14, (a2) 190 | vfmul.vv v12, v12, v12 191 | vfmacc.vv v12, v14, v14 192 | vfsqrt.v v12, v12 193 | vfcvt.rtz.xu.f.v v12, v12 194 | vminu.vx v12, v12, a7 195 | vmadd.vv v12, v8, v10 196 | vsetvli a5, a4, e32, m1, ta, ma 197 | vluxei32.v v14, (a0), v12 198 | sub a5, a4, a5 199 | vadd.vi v14, v14, 1 200 | vsuxei32.v v14, (a0), v12 201 | vsetvli zero, a5, e32, m1, ta, ma 202 | vluxei32.v v12, (a0), v13 203 | vadd.vi v12, v12, 1 204 | vsuxei32.v v12, (a0), v13 205 | sub a3, a3, a4 206 | slli a4, a4, 2 207 | add a1, a1, a4 208 | add a2, a2, a4 209 | bnez a3, 1b 210 | 2: 211 | vsetvli a1, zero, e32, m1, ta, ma 212 | vmv.v.i v8, 0 213 | slli a4, a6, 2 214 | addi a1, a0, 400 215 | mv a2, a0 216 | vsetvli a3, zero, e32, m1, ta, ma 217 | 3: 218 | vle32.v v9, (a0) 219 | vredsum.vs v9, v9, v8 220 | vmv.x.s t0, v9 221 | sw t0, (a2) 222 | addi a2, a2, 4 223 | add a0, a0, a4 224 | bne a2, a1, 3b 225 | ret 226 | 227 | #endif 228 | 229 | #if MX_N == 4 230 | 231 | .global MX(hist_rvv_dup_entries_) 232 | MX(hist_rvv_dup_entries_): 233 | vsetvli a5, zero, e32, m1, ta, ma 234 | slli a7, a5, 2 235 | beqz a3, 2f 236 | vsetvli zero, a7, e32, m4, ta, ma 237 | vmv.v.x v8, a7 238 | vid.v v12 239 | addi a5, a5, -1 240 | vand.vx v12, v12, a5 241 | vsll.vi v12, v12, 2 242 | li a6, 100 243 | 1: 244 | vsetvli a5, a3, e32, m4, ta, ma 245 | vle32.v v16, (a1) 246 | vle32.v v20, (a2) 247 | 
vfmul.vv v16, v16, v16 248 | vfmacc.vv v16, v20, v20 249 | vfsqrt.v v16, v16 250 | vfcvt.rtz.xu.f.v v16, v16 251 | vminu.vx v16, v16, a6 252 | vmadd.vv v16, v8, v12 253 | vsetvli a4, a5, e32, m1, ta, ma 254 | vluxei32.v v20, (a0), v16 255 | sub a4, a5, a4 256 | vadd.vi v20, v20, 1 257 | vsuxei32.v v20, (a0), v16 258 | vsetvli t0, a4, e32, m1, ta, ma 259 | vluxei32.v v16, (a0), v17 260 | sub a4, a4, t0 261 | vadd.vi v16, v16, 1 262 | vsuxei32.v v16, (a0), v17 263 | vsetvli t0, a4, e32, m1, ta, ma 264 | vluxei32.v v16, (a0), v18 265 | sub a4, a4, t0 266 | vadd.vi v16, v16, 1 267 | vsuxei32.v v16, (a0), v18 268 | vsetvli zero, a4, e32, m1, ta, ma 269 | vluxei32.v v16, (a0), v19 270 | vadd.vi v16, v16, 1 271 | vsuxei32.v v16, (a0), v19 272 | sub a3, a3, a5 273 | slli a5, a5, 2 274 | add a1, a1, a5 275 | add a2, a2, a5 276 | bnez a3, 1b 277 | 2: 278 | vsetvli a1, zero, e32, m1, ta, ma 279 | vmv.v.i v8, 0 280 | addi a1, a0, 400 281 | mv a2, a0 282 | 3: 283 | vsetvli a3, zero, e32, m1, ta, ma 284 | vle32.v v9, (a0) 285 | vredsum.vs v9, v9, v8 286 | vsetivli zero, 1, e32, m1, ta, ma 287 | vse32.v v9, (a2) 288 | addi a2, a2, 4 289 | add a0, a0, a7 290 | bne a2, a1, 3b 291 | ret 292 | 293 | #endif 294 | -------------------------------------------------------------------------------- /bench/hist.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void 4 | hist_scalar(uint32_t *hist, float *x, float *y, size_t n) 5 | { 6 | for (size_t i = 0; i < n; ++i) { 7 | float dist = x[i]*x[i] + y[i]*y[i]; 8 | __asm__ ("fsqrt.s %0, %0\n" : "+f"(dist)); 9 | size_t idx = dist; 10 | idx = idx > 100 ? 100 : dist; 11 | ++hist[idx]; 12 | } 13 | } 14 | 15 | #define IMPLS(f) \ 16 | f(scalar) \ 17 | MX(f, rvv_slidedown) \ 18 | MX(f, rvv_assume_no_conflict) \ 19 | f(rvv_dup_entries_m1) \ 20 | f(rvv_dup_entries_m2) \ 21 | f(rvv_dup_entries_m4) \ 22 | 23 | typedef void Func(uint32_t *hist, float *x, float *y, size_t n); 24 | 25 | #define DECLARE(f) extern Func hist_##f; 26 | IMPLS(DECLARE) 27 | 28 | #define EXTRACT(f) { #f, &hist_##f }, 29 | Impl impls[] = { IMPLS(EXTRACT) }; 30 | 31 | static uint32_t hist[100 * (1<<16>>4)]; 32 | float *inx, *iny; 33 | 34 | void init(void) { 35 | inx = (float*)mem; 36 | iny = (float*)(mem + MAX_MEM/2); 37 | } 38 | 39 | ux checksum(size_t n) { 40 | size_t sum = 0; 41 | for (size_t i = 0; i < 100; ++i) 42 | sum = hist[i]; 43 | return sum <= n; // sanity check for no_conflict 44 | } 45 | 46 | BENCH_BEG(base) { 47 | n /= sizeof(float); 48 | memset(hist, 0, sizeof hist); 49 | float max = 70.71; // approx. 
sqrtf(100*100/2); 50 | for (size_t i = 0; i < n; ++i) { 51 | inx[i] = bench_urandf() * 2 * max - max; 52 | iny[i] = bench_urandf() * 2 * max - max; 53 | } 54 | TIME f(hist, inx, iny, n); 55 | } BENCH_END 56 | 57 | Bench benches[] = { 58 | BENCH( impls, MAX_MEM/2, "hist", bench_base) 59 | }; BENCH_MAIN(benches) 60 | 61 | -------------------------------------------------------------------------------- /bench/mandelbrot.S: -------------------------------------------------------------------------------- 1 | #if 0 2 | 3 | void 4 | mandelbrot_rvv(size_t width, size_t maxIter, uint32_t *res) 5 | { 6 | size_t VL2 = __riscv_vsetvlmax_e32m2(); 7 | vfloat32m2_t v4 = __riscv_vfmv_v_f_f32m2(4, VL2); 8 | vuint32m2_t vid = __riscv_vid_v_u32m2(VL2); 9 | vfloat32m2_t cx, cy, zx, zy, zx2, zy2; 10 | 11 | for (size_t y = 0; y < width; ++y) { 12 | cy = __riscv_vfmv_v_f_f32m2(y, VL2); 13 | cy = __riscv_vfadd(__riscv_vfmul(cy, 2.0f / width, VL2), -1, VL2); 14 | 15 | for (size_t vl, x = 0, n = width; n > 0; n -= vl, res += vl, x += vl) { 16 | vl = __riscv_vsetvl_e32m2(n); 17 | 18 | cx = __riscv_vfcvt_f(__riscv_vadd(vid, x, vl), vl); 19 | cx = __riscv_vfadd(__riscv_vfmul(cx, 2.0f / width, vl), -1.5f, vl); 20 | 21 | size_t iter = 0; 22 | vuint32m2_t viter = __riscv_vmv_v_x_u32m2(0, vl); 23 | vbool16_t mask = __riscv_vmset_m_b16(vl); 24 | zx = zy = zx2 = zy2 = __riscv_vfmv_v_f_f32m2(0, vl); 25 | do { 26 | mask = __riscv_vmflt(__riscv_vfadd(zx2, zy2, vl), v4, vl); 27 | viter = __riscv_vadc(viter, 0, mask, vl); 28 | zy = __riscv_vfmacc(cy, __riscv_vfadd(zx, zx, vl), zy, vl); 29 | zx = __riscv_vfadd(__riscv_vfsub(zx2, zy2, vl), cx, vl); 30 | zx2 = __riscv_vfmul(zx, zx, vl); 31 | zy2 = __riscv_vfmul(zy, zy, vl); 32 | ++iter; 33 | } while (iter < maxIter && __riscv_vfirst(mask, vl) >= 0); 34 | __riscv_vse32(res, viter, vl); 35 | } 36 | } 37 | } 38 | #endif 39 | 40 | #if MX_N > 0 && MX_N <= 2 41 | 42 | #if IF_VF16(1)+0 43 | .global MX(mandelbrot_rvv_f16_) # generated by clang 44 | .balign 2 45 | MX(rvv_f16_m1p5): 46 | .half 0xbe00 # half -1.5 47 | MX(rvv_f16_m1): 48 | .half 0xbc00 # half -1 49 | MX(rvv_f16_p4): 50 | .half 0x4400 # half 4 51 | MX(mandelbrot_rvv_f16_): 52 | beqz a0, 9f 53 | li a6, 0 54 | vsetvli a3, zero, e16, m2, ta, ma 55 | la a3, MX(rvv_f16_p4) 56 | fcvt.s.wu fa5, a0 57 | flh fa4, (a3) 58 | lui a3, 262144 59 | fmv.w.x fa3, a3 60 | la a3, MX(rvv_f16_m1) 61 | fdiv.s fa3, fa3, fa5 62 | flh fa5, (a3) 63 | la a3, MX(rvv_f16_m1p5) 64 | vfmv.v.f v12, fa4 65 | flh fa4, (a3) 66 | addi a3, a1, -1 67 | sltu a1, a1, a3 68 | addi a1, a1, -1 69 | and a1, a1, a3 70 | vid.v v14 71 | fcvt.h.s fa3, fa3 72 | addi a7, a1, 1 73 | j 2f 74 | 1: 75 | addi a6, a6, 1 76 | beq a6, a0, 9f 77 | 2: 78 | li a4, 0 79 | fcvt.h.wu fa2, a6 80 | vsetvli a1, zero, e16, m2, ta, ma 81 | vfmv.v.f v8, fa2 82 | vfmul.vf v8, v8, fa3 83 | vfadd.vf v16, v8, fa5 84 | mv a5, a0 85 | j 4f 86 | 3: 87 | vsetvli zero, zero, e32, m4, ta, ma 88 | vse32.v v8, (a2) 89 | sub a5, a5, t0 90 | slli a1, t0, 2 91 | add a2, a2, a1 92 | add a4, a4, t0 93 | beqz a5, 1b 94 | 4: 95 | vsetvli t0, a5, e16, m2, ta, ma 96 | vadd.vx v8, v14, a4 97 | vfcvt.f.xu.v v8, v8 98 | vfmul.vf v8, v8, fa3 99 | vfadd.vf v18, v8, fa4 100 | vmv.v.i v20, 0 101 | vmv.v.i v22, 0 102 | vmv.v.i v24, 0 103 | vmv.v.i v26, 0 104 | vsetvli zero, zero, e32, m4, ta, ma 105 | vmv.v.i v8, 0 106 | mv a1, a7 107 | 5: 108 | vsetvli zero, zero, e16, m2, ta, ma 109 | vfadd.vv v28, v24, v20 110 | vmflt.vv v0, v28, v12 111 | addi a1, a1, -1 112 | vsetvli zero, zero, e32, m4, ta, ma 113 | vadc.vim v8, v8, 0, 
v0 114 | beqz a1, 3b 115 | vsetvli zero, zero, e16, m2, ta, ma 116 | vfadd.vv v26, v26, v26 117 | vfsub.vv v20, v24, v20 118 | vfmadd.vv v22, v26, v16 119 | vfadd.vv v26, v20, v18 120 | vfmul.vv v20, v22, v22 121 | vfirst.m a3, v0 122 | vfmul.vv v24, v26, v26 123 | bgez a3, 5b 124 | j 3b 125 | 9: 126 | ret 127 | #endif 128 | 129 | .global MX(mandelbrot_rvv_f32_) # generated by clang 130 | MX(mandelbrot_rvv_f32_): 131 | beqz a0, 9f 132 | li a6, 0 133 | vsetvli a3, zero, e32, MX(), ta, ma 134 | lui a3, 264192 135 | fcvt.s.wu fa5, a0 136 | vmv.v.x v8, a3 137 | lui a3, 262144 138 | fmv.w.x fa4, a3 139 | fdiv.s fa5, fa4, fa5 140 | addi a3, a1, -1 141 | sltu a1, a1, a3 142 | addi a1, a1, -1 143 | and a1, a1, a3 144 | lui a3, 784384 145 | fmv.w.x fa4, a3 146 | lui a3, 785408 147 | fmv.w.x fa3, a3 148 | vid.v v10 149 | addi a7, a1, 1 150 | j 2f 151 | 1: 152 | addi a6, a6, 1 153 | beq a6, a0, 9f 154 | 2: 155 | li a4, 0 156 | fcvt.s.wu fa2, a6 157 | vsetvli a1, zero, e32, MX(), ta, ma 158 | vfmv.v.f v12, fa2 159 | vfmul.vf v12, v12, fa5 160 | vfadd.vf v12, v12, fa4 161 | mv a5, a0 162 | j 4f 163 | 3: 164 | vse32.v v14, (a2) 165 | sub a5, a5, t0 166 | slli a1, t0, 2 167 | add a2, a2, a1 168 | add a4, a4, t0 169 | beqz a5, 1b 170 | 4: 171 | vsetvli t0, a5, e32, MX(), ta, ma 172 | vadd.vx v14, v10, a4 173 | vmv.v.i v18, 0 174 | vfcvt.f.xu.v v14, v14 175 | vfmul.vf v14, v14, fa5 176 | vfadd.vf v16, v14, fa3 177 | vmv.v.i v14, 0 178 | mv a1, a7 179 | vmv.v.i v22, 0 180 | vmv.v.i v20, 0 181 | vmv.v.i v24, 0 182 | 5: 183 | vfadd.vv v26, v22, v18 184 | vmflt.vv v0, v26, v8 185 | addi a1, a1, -1 186 | vadc.vim v14, v14, 0, v0 187 | beqz a1, 3b 188 | vfadd.vv v24, v24, v24 189 | vfsub.vv v18, v22, v18 190 | vfmadd.vv v20, v24, v12 191 | vfadd.vv v24, v18, v16 192 | vfmul.vv v18, v20, v20 193 | vfirst.m a3, v0 194 | vfmul.vv v22, v24, v24 195 | bgez a3, 5b 196 | j 3b 197 | 9: 198 | ret 199 | 200 | #if IF_VF64(1)+0 201 | .balign 8 202 | .global MX(mandelbrot_rvv_f64_) # generated by clang 203 | MX(rvv_f64_m1p5): 204 | .quad 0xbff8000000000000 # double -1.5 205 | MX(rvv_f64_m1): 206 | .quad 0xbff0000000000000 # double -1 207 | MX(rvv_f64_p4): 208 | .quad 0x4010000000000000 # double 4 209 | MX(mandelbrot_rvv_f64_): 210 | beqz a0, 9f 211 | li a6, 0 212 | vsetvli a3, zero, e64, m2, ta, ma 213 | la a3, MX(rvv_f64_p4) 214 | fcvt.s.wu fa5, a0 215 | fld fa4, (a3) 216 | lui a3, 262144 217 | fmv.w.x fa3, a3 218 | la a3, MX(rvv_f64_m1) 219 | fdiv.s fa3, fa3, fa5 220 | fld fa5, (a3) 221 | la a3, MX(rvv_f64_m1p5) 222 | vfmv.v.f v8, fa4 223 | fld fa4, (a3) 224 | addi a3, a1, -1 225 | sltu a1, a1, a3 226 | addi a1, a1, -1 227 | and a1, a1, a3 228 | vid.v v10 229 | fcvt.d.s fa3, fa3 230 | addi a7, a1, 1 231 | j 2f 232 | 1: 233 | addi a6, a6, 1 234 | beq a6, a0, 9f 235 | 2: 236 | li a4, 0 237 | fcvt.d.wu fa2, a6 238 | vsetvli a1, zero, e64, m2, ta, ma 239 | vfmv.v.f v12, fa2 240 | vfmul.vf v12, v12, fa3 241 | vfadd.vf v12, v12, fa5 242 | mv a5, a0 243 | j 4f 244 | 3: 245 | vsetvli zero, zero, e32, m1, ta, ma 246 | vse32.v v24, (a2) 247 | sub a5, a5, t0 248 | slli a1, t0, 2 249 | add a2, a2, a1 250 | add a4, a4, t0 251 | beqz a5, 1b 252 | 4: 253 | vsetvli t0, a5, e64, m2, ta, ma 254 | vadd.vx v14, v10, a4 255 | vfcvt.f.xu.v v14, v14 256 | vfmul.vf v14, v14, fa3 257 | vfadd.vf v14, v14, fa4 258 | vmv.v.i v16, 0 259 | vmv.v.i v18, 0 260 | vmv.v.i v20, 0 261 | vmv.v.i v22, 0 262 | vsetvli zero, zero, e32, m1, ta, ma 263 | vmv.v.i v24, 0 264 | mv a1, a7 265 | 5: 266 | vsetvli zero, zero, e64, m2, ta, ma 267 | vfadd.vv v26, v20, v16 
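# inner Mandelbrot iteration, same scheme as the C reference at the top of this file:
# v26 = zx^2 + zy^2 was just computed, v0 flags the lanes still below 4.0 (in v8),
# and vadc.vim increments the 32-bit iteration counts in v24 only for those lanes.
# The remaining ops update zy = 2*zx*zy + cy (cy in v12) and zx = zx^2 - zy^2 + cx
# (cx in v14), then recompute the squares; the loop exits once no lane is active
# (vfirst.m returns -1) or the iteration budget in a1 reaches zero.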
268 | vmflt.vv v0, v26, v8 269 | addi a1, a1, -1 270 | vsetvli zero, zero, e32, m1, ta, ma 271 | vadc.vim v24, v24, 0, v0 272 | beqz a1, 3b 273 | vsetvli zero, zero, e64, m2, ta, ma 274 | vfadd.vv v22, v22, v22 275 | vfsub.vv v16, v20, v16 276 | vfmadd.vv v18, v22, v12 277 | vfadd.vv v22, v16, v14 278 | vfmul.vv v16, v18, v18 279 | vfirst.m a3, v0 280 | vfmul.vv v20, v22, v22 281 | bgez a3, 5b 282 | j 3b 283 | 9: 284 | ret 285 | #endif 286 | 287 | #endif 288 | 289 | 290 | -------------------------------------------------------------------------------- /bench/mandelbrot.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void 4 | mandelbrot_scalar_f32(size_t width, size_t maxIter, uint32_t *res) 5 | { 6 | for (size_t y = 0; y < width; ++y) 7 | for (size_t x = 0; x < width; ++x) { 8 | float cx = x * 2.0f / width - 1.5; 9 | float cy = y * 2.0f / width - 1; 10 | size_t iter = 0; 11 | float zx = 0, zy = 0, zxS = 0, zyS = 0; 12 | 13 | BENCH_VOLATILE_REG(cy); 14 | while (zxS + zyS <= 4 && iter < maxIter) { 15 | zxS = zxS - zyS + cx; 16 | zy = 2 * zx * zy + cy; 17 | zx = zxS; 18 | zxS = zx*zx; 19 | zyS = zy*zy; 20 | ++iter; 21 | } 22 | *res++ = iter; 23 | } 24 | } 25 | 26 | #if __riscv_flen == 64 27 | void 28 | mandelbrot_scalar_f64(size_t width, size_t maxIter, uint32_t *res) 29 | { 30 | for (size_t y = 0; y < width; ++y) 31 | for (size_t x = 0; x < width; ++x) { 32 | double cx = x * 2.0 / width - 1.5; 33 | double cy = y * 2.0 / width - 1; 34 | size_t iter = 0; 35 | double zx = 0, zy = 0, zxS = 0, zyS = 0; 36 | 37 | BENCH_VOLATILE_REG(cy); 38 | while (zxS + zyS <= 4 && iter < maxIter) { 39 | zxS = zxS - zyS + cx; 40 | zy = 2 * zx * zy + cy; 41 | zx = zxS; 42 | zxS = zx*zx; 43 | zyS = zy*zy; 44 | ++iter; 45 | } 46 | *res++ = iter; 47 | } 48 | } 49 | #endif 50 | 51 | #define IMPLS(f) \ 52 | f(scalar_f32) \ 53 | IF_F64(f(scalar_f64)) \ 54 | IF_VF16(f(rvv_f16_m1)) \ 55 | IF_VF16(f(rvv_f16_m2)) \ 56 | f(rvv_f32_m1) \ 57 | f(rvv_f32_m2) \ 58 | IF_VF64(f(rvv_f64_m1)) \ 59 | IF_VF64(f(rvv_f64_m2)) \ 60 | 61 | typedef void Func(size_t width, size_t maxIter, uint32_t *res); 62 | 63 | #define DECLARE(f) extern Func mandelbrot_##f; 64 | IMPLS(DECLARE) 65 | 66 | #define EXTRACT(f) { #f, &mandelbrot_##f }, 67 | Impl impls[] = { IMPLS(EXTRACT) }; 68 | 69 | void init(void) { } 70 | 71 | /* disabled, because of rounding errors, please independently verify */ 72 | ux checksum(size_t n) { 73 | #if 0 74 | double sum = 0; 75 | uint32_t *ptr = (uint32_t*)mem; 76 | n = usqrt(n); 77 | for (size_t i = 0; i < n*n; ++i) 78 | sum += *ptr++; 79 | print("<")(f,sum/(n*n+1))(">"); 80 | #endif 81 | return 0; 82 | } 83 | 84 | BENCH_BEG(base) { 85 | n = usqrt(n); 86 | TIME f(n, mandelbrot_ITER, (uint32_t*)mem); 87 | } BENCH_END 88 | 89 | Bench benches[] = { 90 | BENCH( 91 | impls, 92 | SCALE_mandelbrot(MAX_MEM / 4), 93 | "mandelbrot "STR(mandelbrot_ITER), 94 | bench_base 95 | ) 96 | }; BENCH_MAIN(benches) 97 | 98 | -------------------------------------------------------------------------------- /bench/memcpy.S: -------------------------------------------------------------------------------- 1 | #if 0 2 | void *memcpy_rvv(void *restrict dest, void const *restrict src, size_t n) { 3 | unsigned char *d = dest; 4 | unsigned char const *s = src; 5 | for (size_t vl; n > 0; n -= vl, s += vl, d += vl) { 6 | vl = __riscv_vsetvl_e8m8(n); 7 | vuint8m8_t vec_src = __riscv_vle8_v_u8m8(s, vl); 8 | __riscv_vse8_v_u8m8(d, vec_src, vl); 9 | } 10 | return dest; 11 | } 12 | #endif 13 | 
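#if 0
/* Added sketch, not part of the original file: an intrinsics rendering of the
 * memcpy_rvv_align_dest_ routine below, assuming e8/m8 and a power-of-two VLMAX
 * so that (-dest) & (vlmax-1) gives the number of head bytes needed to align the
 * destination.  The function and variable names here are illustrative only. */
void *memcpy_rvv_align_dest_sketch(void *restrict dest, void const *restrict src, size_t n) {
	unsigned char *d = dest;
	unsigned char const *s = src;
	size_t vlmax = __riscv_vsetvlmax_e8m8();
	if (n >= vlmax) {
		/* copy the unaligned head so the stores in the main loop are vlmax-aligned */
		size_t head = -(uintptr_t)d & (vlmax - 1);
		vuint8m8_t v = __riscv_vle8_v_u8m8(s, head);
		__riscv_vse8_v_u8m8(d, v, head);
		d += head; s += head; n -= head;
	}
	for (size_t vl; n > 0; n -= vl, s += vl, d += vl) {
		/* aligned main loop, with the tail handled by the final short vl */
		vl = __riscv_vsetvl_e8m8(n);
		vuint8m8_t v = __riscv_vle8_v_u8m8(s, vl);
		__riscv_vse8_v_u8m8(d, v, vl);
	}
	return dest;
}
#endif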
14 | 15 | #ifdef MX 16 | 17 | # a0 = dest, a1 = src, a2 = len 18 | .global MX(memcpy_rvv_) 19 | MX(memcpy_rvv_): 20 | mv a3, a0 21 | 1: 22 | vsetvli t0, a2, e8, MX(), ta, ma 23 | vle8.v v0, (a1) 24 | add a1, a1, t0 25 | sub a2, a2, t0 26 | vse8.v v0, (a3) 27 | add a3, a3, t0 28 | bnez a2, 1b 29 | ret 30 | 31 | .global MX(memcpy_rvv_align_dest_) 32 | MX(memcpy_rvv_align_dest_): 33 | mv a3, a0 34 | vsetvli t0, zero, e8, MX(), ta, ma # vlenb 35 | bltu a2, t0, 2f # len < vlenb 36 | # align dest to vlenb 37 | sub t1, zero, a0 38 | addi t2, t0, -1 39 | and t1, t1, t2 #align = (-dest) & (vlenb-1) 40 | vsetvli t0, t1, e8, MX(), ta, ma 41 | 1: 42 | vle8.v v0, (a1) 43 | add a1, a1, t0 44 | sub a2, a2, t0 45 | vse8.v v0, (a3) 46 | add a3, a3, t0 47 | 2: 48 | vsetvli t0, a2, e8, MX(), ta, ma 49 | bnez a2, 1b 50 | ret 51 | 52 | .global MX(memcpy_rvv_align_src_) 53 | MX(memcpy_rvv_align_src_): 54 | mv a3, a0 55 | vsetvli t0, zero, e8, MX(), ta, ma # vlen 56 | bltu a2, t0, 2f # len < vlen 57 | # align src to vlen 58 | sub t1, zero, a1 59 | addi t2, t0, -1 60 | and t1, t1, t2 # align = (-src) & (vlen-1) 61 | vsetvli t0, t1, e8, MX(), ta, ma 62 | 1: 63 | vle8.v v0, (a1) 64 | add a1, a1, t0 65 | sub a2, a2, t0 66 | vse8.v v0, (a3) 67 | add a3, a3, t0 68 | 2: 69 | vsetvli t0, a2, e8, MX(), ta, ma 70 | bnez a2, 1b 71 | ret 72 | 73 | # combination of memcpy_rvv_align_dest and memcpy_rvv 74 | .global MX(memcpy_rvv_align_dest_hybrid_) 75 | MX(memcpy_rvv_align_dest_hybrid_): 76 | mv a3, a0 77 | vsetvli t0, zero, e8, MX(), ta, ma # vlen 78 | slli t1, t0, 8 # skip costly division for more values 79 | bltu a2, t1, 2f # len < vlen 80 | sub t1, zero, a0 81 | addi t2, t0, -1 82 | and t1, t1, t2 # align = (-dest) & (vlen-1) 83 | vsetvli t0, t1, e8, MX(), ta, ma # align dest to vlen 84 | 1: 85 | vle8.v v0, (a1) 86 | add a1, a1, t0 87 | sub a2, a2, t0 88 | vse8.v v0, (a3) 89 | add a3, a3, t0 90 | 2: 91 | vsetvli t0, a2, e8, MX(), ta, ma 92 | bnez a2, 1b 93 | ret 94 | 95 | 96 | .global MX(memcpy_rvv_tail_) 97 | MX(memcpy_rvv_tail_): 98 | vsetvli t0, a2, e8, MX(), ta, ma 99 | remu a3, a2, t0 # tail = n % vlenb 100 | sub a2, a2, a3 # n -= tail 101 | add a4, a0, a2 # end = dest + n 102 | mv a2, a0 # n = dest 103 | 1: 104 | vle8.v v8, (a1) 105 | add a1, a1, t0 # src += vlenb 106 | vse8.v v8, (a2) 107 | add a2, a2, t0 # dest += vlenb 108 | bltu a2, a4, 1b # dest < end 109 | # copy tail 110 | vsetvli zero, a3, e8, MX(), ta, ma 111 | vle8.v v8, (a1) 112 | vse8.v v8, (a2) 113 | ret 114 | 115 | # this is supposed to test how well the implementation handles 116 | # operations with an vl smaller than VLMAX 117 | .global MX(memcpy_rvv_128_) 118 | MX(memcpy_rvv_128_): 119 | li t0, 128/8 120 | bgt a2, t0, 1f 121 | mv t0, a2 122 | 1: 123 | vsetvli t0, t0, e8, MX(), ta, ma 124 | remu a3, a2, t0 # tail = n % vlenb 125 | sub a2, a2, a3 # n -= tail 126 | add a4, a0, a2 # end = dest + n 127 | mv a2, a0 # n = dest 128 | 1: 129 | vle8.v v8, (a1) 130 | add a1, a1, t0 # src += vlenb 131 | vse8.v v8, (a2) 132 | add a2, a2, t0 # dest += vlenb 133 | bltu a2, a4, 1b # dest < end 134 | # copy tail 135 | vsetvli zero, a3, e8, MX(), ta, ma 136 | vle8.v v8, (a1) 137 | vse8.v v8, (a2) 138 | ret 139 | 140 | #endif 141 | 142 | -------------------------------------------------------------------------------- /bench/memcpy.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void * 4 | memcpy_scalar(void *restrict dest, void const *restrict src, size_t n) 5 | { 6 | unsigned char *d = dest; 7 | 
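	/* note (added): BENCH_CLOBBER() below is presumably an optimization barrier from
	 * bench.h that keeps the compiler from autovectorizing this byte loop, in contrast
	 * to the _autovec variant that follows. */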
unsigned char const *s = src; 8 | while (n--) *d++ = *s++, BENCH_CLOBBER(); 9 | return dest; 10 | } 11 | 12 | void * 13 | memcpy_scalar_autovec(void *restrict dest, void const *restrict src, size_t n) 14 | { 15 | unsigned char *d = dest; 16 | unsigned char const *s = src; 17 | while (n--) *d++ = *s++; 18 | return dest; 19 | } 20 | 21 | /* https://git.musl-libc.org/cgit/musl/tree/src/string/memcpy.c */ 22 | void * 23 | memcpy_musl(void *restrict dest, void const *restrict src, size_t n) 24 | { 25 | unsigned char *d = dest; 26 | unsigned char const *s = src; 27 | 28 | #ifdef __GNUC__ 29 | 30 | #if __BYTE_ORDER == __LITTLE_ENDIAN 31 | #define LS >> 32 | #define RS << 33 | #else 34 | #define LS << 35 | #define RS >> 36 | #endif 37 | 38 | typedef uint32_t __attribute__((__may_alias__)) u32; 39 | uint32_t w, x; 40 | 41 | for (; (uintptr_t)s % 4 && n; n--) *d++ = *s++; 42 | 43 | if ((uintptr_t)d % 4 == 0) { 44 | for (; n>=16; s+=16, d+=16, n-=16) { 45 | *(u32 *)(d+0) = *(u32 *)(s+0); 46 | *(u32 *)(d+4) = *(u32 *)(s+4); 47 | *(u32 *)(d+8) = *(u32 *)(s+8); 48 | *(u32 *)(d+12) = *(u32 *)(s+12); 49 | } 50 | if (n&8) { 51 | *(u32 *)(d+0) = *(u32 *)(s+0); 52 | *(u32 *)(d+4) = *(u32 *)(s+4); 53 | d += 8; s += 8; 54 | } 55 | if (n&4) { 56 | *(u32 *)(d+0) = *(u32 *)(s+0); 57 | d += 4; s += 4; 58 | } 59 | if (n&2) { 60 | *d++ = *s++; *d++ = *s++; 61 | } 62 | if (n&1) { 63 | *d = *s; 64 | } 65 | return dest; 66 | } 67 | 68 | if (n >= 32) switch ((uintptr_t)d % 4) { 69 | case 1: 70 | w = *(u32 *)s; 71 | *d++ = *s++; 72 | *d++ = *s++; 73 | *d++ = *s++; 74 | n -= 3; 75 | for (; n>=17; s+=16, d+=16, n-=16) { 76 | x = *(u32 *)(s+1); 77 | *(u32 *)(d+0) = (w LS 24) | (x RS 8); 78 | w = *(u32 *)(s+5); 79 | *(u32 *)(d+4) = (x LS 24) | (w RS 8); 80 | x = *(u32 *)(s+9); 81 | *(u32 *)(d+8) = (w LS 24) | (x RS 8); 82 | w = *(u32 *)(s+13); 83 | *(u32 *)(d+12) = (x LS 24) | (w RS 8); 84 | } 85 | break; 86 | case 2: 87 | w = *(u32 *)s; 88 | *d++ = *s++; 89 | *d++ = *s++; 90 | n -= 2; 91 | for (; n>=18; s+=16, d+=16, n-=16) { 92 | x = *(u32 *)(s+2); 93 | *(u32 *)(d+0) = (w LS 16) | (x RS 16); 94 | w = *(u32 *)(s+6); 95 | *(u32 *)(d+4) = (x LS 16) | (w RS 16); 96 | x = *(u32 *)(s+10); 97 | *(u32 *)(d+8) = (w LS 16) | (x RS 16); 98 | w = *(u32 *)(s+14); 99 | *(u32 *)(d+12) = (x LS 16) | (w RS 16); 100 | } 101 | break; 102 | case 3: 103 | w = *(u32 *)s; 104 | *d++ = *s++; 105 | n -= 1; 106 | for (; n>=19; s+=16, d+=16, n-=16) { 107 | x = *(u32 *)(s+3); 108 | *(u32 *)(d+0) = (w LS 8) | (x RS 24); 109 | w = *(u32 *)(s+7); 110 | *(u32 *)(d+4) = (x LS 8) | (w RS 24); 111 | x = *(u32 *)(s+11); 112 | *(u32 *)(d+8) = (w LS 8) | (x RS 24); 113 | w = *(u32 *)(s+15); 114 | *(u32 *)(d+12) = (x LS 8) | (w RS 24); 115 | } 116 | break; 117 | } 118 | if (n&16) { 119 | *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; 120 | *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; 121 | *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; 122 | *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; 123 | } 124 | if (n&8) { 125 | *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; 126 | *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; 127 | } 128 | if (n&4) { 129 | *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; 130 | } 131 | if (n&2) { 132 | *d++ = *s++; *d++ = *s++; 133 | } 134 | if (n&1) { 135 | *d = *s; 136 | } 137 | return dest; 138 | #endif 139 | 140 | while (n--) { *d++ = *s++; BENCH_CLOBBER(); } 141 | return dest; 142 | } 143 | 144 | #define memcpy_libc memcpy 145 | 146 | #define IMPLS(f) \ 147 | IFHOSTED(f(libc)) \ 148 | f(musl) \ 149 
| f(scalar) \ 150 | f(scalar_autovec) \ 151 | MX(f, rvv) \ 152 | MX(f, rvv_align_dest) \ 153 | MX(f, rvv_align_src) \ 154 | MX(f, rvv_align_dest_hybrid) \ 155 | MX(f, rvv_tail) \ 156 | MX(f, rvv_128) \ 157 | 158 | typedef void *Func(void *restrict dest, void const *restrict src, size_t n); 159 | 160 | #define DECLARE(f) extern Func memcpy_##f; 161 | IMPLS(DECLARE) 162 | 163 | #define EXTRACT(f) { #f, &memcpy_##f }, 164 | Impl impls[] = { IMPLS(EXTRACT) }; 165 | 166 | uint8_t *dest, *src; 167 | ux last; 168 | 169 | void init(void) { } 170 | 171 | ux checksum(size_t n) { 172 | ux sum = last; 173 | for (size_t i = 0; i < n+9; ++i) 174 | sum = uhash(sum) + dest[i]; 175 | return sum; 176 | } 177 | 178 | void common(size_t n, size_t dOff, size_t sOff) { 179 | dest = mem + dOff; src = dest + MAX_MEM/2 + sOff + 9; 180 | memset(dest, 0, n+9); 181 | } 182 | 183 | BENCH_BEG(base) { 184 | common(n, bench_urand() & 255, bench_urand() & 255); 185 | TIME last = (uintptr_t)f(dest, src, n); 186 | } BENCH_END 187 | 188 | BENCH_BEG(aligned) { 189 | common(n, 0, 0); 190 | TIME last = (uintptr_t)f(dest, src, n); 191 | } BENCH_END 192 | 193 | Bench benches[] = { 194 | BENCH( impls, MAX_MEM/2 - 521, "memcpy", bench_base ), 195 | BENCH( impls, MAX_MEM/2 - 521, "memcpy aligned", bench_aligned ) 196 | }; BENCH_MAIN(benches) 197 | 198 | -------------------------------------------------------------------------------- /bench/memset.S: -------------------------------------------------------------------------------- 1 | #if 0 2 | void *memset(void *dst, int n, size_t len) { 3 | unsigned char *d = dst; 4 | vuint8m8_t v = __riscv_vmv_v_x_u8m8((uint8_t)n, __riscv_vsetvlmax_e8m8()); 5 | for (size_t vl; len > 0; len -= vl, d += vl) { 6 | vl = __riscv_vsetvl_e8m8(len); 7 | __riscv_vse8_v_u8m8(d, v, vl); 8 | } 9 | return dst; 10 | } 11 | #endif 12 | 13 | #ifdef MX 14 | 15 | .global MX(memset_rvv_) 16 | MX(memset_rvv_): 17 | vsetvli a3, zero, e8, MX(), ta, ma 18 | vmv.v.x v8, a1 19 | mv a1, a0 20 | 1: 21 | vsetvli a3, a2, e8, MX(), ta, ma 22 | vse8.v v8, (a1) 23 | sub a2, a2, a3 24 | add a1, a1, a3 25 | bnez a2, 1b 26 | ret 27 | 28 | 29 | .global MX(memset_rvv_align_) 30 | MX(memset_rvv_align_): 31 | vsetvli t0, zero, e8, MX(), ta, ma # vlen 32 | vmv.v.x v8, a1 33 | mv a1, a0 34 | vsetvli t0, zero, e8, MX(), ta, ma # vlen 35 | bltu a2, t0, 2f # len < vlen 36 | # align dest to vlen 37 | sub t1, zero, a0 38 | addi t2, t0, -1 39 | and t1, t1, t2 #align = (-dest) & (vlenb-1) 40 | vsetvli t0, t1, e8, MX(), ta, ma 41 | 1: 42 | vse8.v v8, (a1) 43 | sub a2, a2, t0 44 | add a1, a1, t0 45 | 2: 46 | vsetvli t0, a2, e8, MX(), ta, ma 47 | bnez a2, 1b 48 | ret 49 | 50 | .global MX(memset_rvv_tail_) 51 | MX(memset_rvv_tail_): 52 | vsetvli t0, a2, e8, MX(), ta, ma 53 | vmv.v.x v8, a1 54 | remu a3, a2, t0 # tail = n % vlenb 55 | sub a2, a2, a3 # n -= tail 56 | add a4, a0, a2 # end = dest + n 57 | mv a2, a0 # n = dest 58 | 1: 59 | vse8.v v8, (a2) 60 | add a2, a2, t0 # dest += vlenb 61 | bltu a2, a4, 1b # dest < end 62 | # handle tail 63 | vsetvli zero, a3, e8, MX(), ta, ma 64 | vse8.v v8, (a2) 65 | ret 66 | 67 | .global MX(memset_rvv_tail_4x_) 68 | MX(memset_rvv_tail_4x_): 69 | vsetvli t0, a2, e8, MX(), ta, ma 70 | vmv.v.x v8, a1 71 | slli t1, t0, 2 72 | mv a5, a0 73 | mv a3, a2 74 | bltu a2, t1, 2f 75 | remu a3, a2, t1 # tail = n % (vlenb*4) 76 | sub a2, a2, a3 # n -= tail 77 | add a4, a0, a2 # end = dest + n 78 | 1: 79 | vse8.v v8, (a5) 80 | add a5, a5, t0 # dest += vlenb 81 | vse8.v v8, (a5) 82 | add a5, a5, t0 # dest += vlenb 83 | vse8.v v8, 
(a5) 84 | add a5, a5, t0 # dest += vlenb 85 | vse8.v v8, (a5) 86 | add a5, a5, t0 # dest += vlenb 87 | bltu a5, a4, 1b # dest < end 88 | # handle tail 89 | 2: 90 | vsetvli a4, a3, e8, MX(), ta, ma 91 | vse8.v v8, (a5) 92 | sub a3, a3, a4 93 | add a5, a5, a4 94 | bnez a3, 2b 95 | ret 96 | 97 | #endif 98 | -------------------------------------------------------------------------------- /bench/memset.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void * 4 | memset_scalar(void *dest, int c, size_t n) 5 | { 6 | unsigned char *d = dest; 7 | while (n--) *d++ = c, BENCH_CLOBBER(); 8 | return dest; 9 | } 10 | 11 | void * 12 | memset_scalar_autovec(void *dest, int c, size_t n) 13 | { 14 | unsigned char *d = dest; 15 | while (n--) *d++ = c; 16 | return dest; 17 | } 18 | 19 | /* https://git.musl-libc.org/cgit/musl/tree/src/string/memset.c */ 20 | #if __riscv_xlen != 32 21 | void * 22 | memset_musl(void *dest, int c, size_t n) 23 | { 24 | unsigned char *s = dest; 25 | size_t k; 26 | 27 | /* Fill head and tail with minimal branching. Each 28 | * conditional ensures that all the subsequently used 29 | * offsets are well-defined and in the dest region. */ 30 | 31 | if (!n) return dest; 32 | s[0] = c; 33 | s[n-1] = c; 34 | if (n <= 2) return dest; 35 | s[1] = c; 36 | s[2] = c; 37 | s[n-2] = c; 38 | s[n-3] = c; 39 | if (n <= 6) return dest; 40 | s[3] = c; 41 | s[n-4] = c; 42 | if (n <= 8) return dest; 43 | 44 | /* Advance pointer to align it at a 4-byte boundary, 45 | * and truncate n to a multiple of 4. The previous code 46 | * already took care of any head/tail that get cut off 47 | * by the alignment. */ 48 | 49 | k = -(uintptr_t)s & 3; 50 | s += k; 51 | n -= k; 52 | n &= -4; 53 | 54 | #ifdef __GNUC__ 55 | typedef uint32_t __attribute__((__may_alias__)) u32; 56 | typedef uint64_t __attribute__((__may_alias__)) u64; 57 | 58 | u32 c32 = ((u32)-1)/255 * (unsigned char)c; 59 | 60 | /* In preparation to copy 32 bytes at a time, aligned on 61 | * an 8-byte bounary, fill head/tail up to 28 bytes each. 62 | * As in the initial byte-based head/tail fill, each 63 | * conditional below ensures that the subsequent offsets 64 | * are valid (e.g. !(n<=24) implies n>=28). */ 65 | 66 | *(u32 *)(s+0) = c32; 67 | *(u32 *)(s+n-4) = c32; 68 | if (n <= 8) return dest; 69 | *(u32 *)(s+4) = c32; 70 | *(u32 *)(s+8) = c32; 71 | *(u32 *)(s+n-12) = c32; 72 | *(u32 *)(s+n-8) = c32; 73 | if (n <= 24) return dest; 74 | *(u32 *)(s+12) = c32; 75 | *(u32 *)(s+16) = c32; 76 | *(u32 *)(s+20) = c32; 77 | *(u32 *)(s+24) = c32; 78 | *(u32 *)(s+n-28) = c32; 79 | *(u32 *)(s+n-24) = c32; 80 | *(u32 *)(s+n-20) = c32; 81 | *(u32 *)(s+n-16) = c32; 82 | 83 | /* Align to a multiple of 8 so we can fill 64 bits at a time, 84 | * and avoid writing the same bytes twice as much as is 85 | * practical without introducing additional branching. */ 86 | 87 | k = 24 + ((uintptr_t)s & 4); 88 | s += k; 89 | n -= k; 90 | 91 | /* If this loop is reached, 28 tail bytes have already been 92 | * filled, so any remainder when n drops below 32 can be 93 | * safely ignored. */ 94 | 95 | u64 c64 = c32 | ((u64)c32 << 32); 96 | for (; n >= 32; n-=32, s+=32) { 97 | *(u64 *)(s+0) = c64; 98 | *(u64 *)(s+8) = c64; 99 | *(u64 *)(s+16) = c64; 100 | *(u64 *)(s+24) = c64; 101 | } 102 | #else 103 | /* Pure C fallback with no aliasing violations. 
*/ 104 | while (n--) *s++ = c; 105 | #endif 106 | 107 | return dest; 108 | } 109 | #endif 110 | 111 | #define memset_libc memset 112 | 113 | #define IMPLS(f) \ 114 | IFHOSTED(f(libc)) \ 115 | IF64(f(musl)) \ 116 | f(scalar) \ 117 | f(scalar_autovec) \ 118 | MX(f, rvv) \ 119 | MX(f, rvv_align) \ 120 | MX(f, rvv_tail) \ 121 | MX(f, rvv_tail_4x) \ 122 | 123 | typedef void *Func(void *dest, int c, size_t n); 124 | 125 | #define DECLARE(f) extern Func memset_##f; 126 | IMPLS(DECLARE) 127 | 128 | #define EXTRACT(f) { #f, &memset_##f }, 129 | Impl impls[] = { IMPLS(EXTRACT) }; 130 | 131 | uint8_t *dest; 132 | ux last; 133 | char c; 134 | 135 | void init(void) { c = bench_urand(); } 136 | 137 | ux checksum(size_t n) { 138 | ux sum = last; 139 | for (size_t i = 0; i < n+9; ++i) 140 | sum = uhash(sum) + dest[i]; 141 | return sum; 142 | } 143 | 144 | void common(size_t n, size_t off) { 145 | dest = mem + off; 146 | memset(dest, c+3, n+9); 147 | } 148 | 149 | BENCH_BEG(base) { 150 | common(n, bench_urand() & 511); 151 | TIME last = (uintptr_t)f(dest, c, n); 152 | } BENCH_END 153 | 154 | BENCH_BEG(aligned) { 155 | common(n, 0); 156 | TIME last = (uintptr_t)f(dest, c, n); 157 | } BENCH_END 158 | 159 | Bench benches[] = { 160 | BENCH( impls, MAX_MEM - 521, "memset", bench_base ), 161 | BENCH( impls, MAX_MEM - 521, "memset aligned", bench_aligned ) 162 | }; BENCH_MAIN(benches) 163 | 164 | -------------------------------------------------------------------------------- /bench/mergelines.S: -------------------------------------------------------------------------------- 1 | #if 0 2 | size_t 3 | mergelines_rvv_vslide(char *str, size_t len) 4 | { 5 | uint8_t *dest = (uint8_t*)str; 6 | uint8_t *src = (uint8_t*)str; 7 | char last = 0; 8 | 9 | vuint8m8_t v, u, d; 10 | vbool1_t m; 11 | 12 | for (size_t vl, VL; len > 1; ) { 13 | VL = vl = __riscv_vsetvl_e8m8(len); 14 | 15 | char next = len > vl ? 
src[vl] : 0; 16 | v = __riscv_vle8_v_u8m8(src, vl); 17 | u = __riscv_vslide1up_vx_u8m8(v, last, vl); 18 | d = __riscv_vslide1down_vx_u8m8(v, next, vl); 19 | 20 | m = __riscv_vmor_mm_b1(__riscv_vmsne_vx_u8m8_b1(u, '\\', vl), __riscv_vmsne_vx_u8m8_b1(v, '\n', vl), vl); 21 | #if DO_SKIP 22 | if (likely(__riscv_vcpop_m_b1(m, vl) == vl && next != '\n')) 23 | goto skip; 24 | #endif 25 | m = __riscv_vmand_mm_b1( 26 | m, 27 | __riscv_vmor_mm_b1(__riscv_vmsne_vx_u8m8_b1(v, '\\', vl), __riscv_vmsne_vx_u8m8_b1(d, '\n', vl), vl), 28 | vl); 29 | 30 | v = __riscv_vcompress_vm_u8m8(v, m, vl); 31 | vl = __riscv_vcpop_m_b1(m, vl); 32 | skip: 33 | __riscv_vse8_v_u8m8(dest, v, vl); 34 | dest += vl; src += VL; len -= VL; 35 | last = src[-1]; 36 | } 37 | 38 | if (len > 0 && !(last == '\\' && *src == '\n')) *dest++ = *src++; 39 | return (dest - (uint8_t*)str); 40 | } 41 | 42 | size_t 43 | mergelines_rvv_mshift(char *str, size_t count) 44 | { 45 | if (count < 2) return count; 46 | uint8_t *dest = (uint8_t*)str; 47 | uint8_t *src = 1+(uint8_t*)str; 48 | char last = src[-1]; 49 | size_t len = count-1; 50 | 51 | vuint8m8_t v, u, d; 52 | vbool1_t m; 53 | 54 | for (size_t vl, VL; len > 0; dest += vl, src += VL, len -= VL, last = src[-1]) { 55 | vl = VL = __riscv_vsetvl_e8m8(len); 56 | 57 | v = __riscv_vle8_v_u8m8(src, vl); 58 | u = __riscv_vslide1up_vx_u8m8(v, last, vl); 59 | 60 | m = __riscv_vmor_mm_b1( 61 | __riscv_vmsne_vx_u8m8_b1(u, '\\', vl), 62 | __riscv_vmsne_vx_u8m8_b1(v, '\n', vl), vl); 63 | #if DO_SKIP 64 | if (__riscv_vcpop_m_b1(m, vl) == vl) goto skip; 65 | #endif 66 | 67 | vuint8m1_t m1 = __riscv_vreinterpret_v_b1_u8m1(m); 68 | size_t vlmax8 = __riscv_vsetvlmax_e8m1(); 69 | m1 = __riscv_vor_vv_u8m1( 70 | __riscv_vsrl_vx_u8m1(__riscv_vslide1up_vx_u8m1(m1, 0xFF, vlmax8), 7, vlmax8), 71 | __riscv_vsll_vx_u8m1(m1, 1, vlmax8), vlmax8); 72 | m = __riscv_vmand_mm_b1(m, __riscv_vreinterpret_v_u8m1_b1(m1), vl); 73 | 74 | u = __riscv_vcompress_vm_u8m8(u, m, vl); 75 | 76 | vl = __riscv_vcpop_m_b1(m, vl); 77 | VL += (VL^vl)&1&(VL < len); // missing bit in mask, so skip 1 78 | skip: 79 | __riscv_vse8_v_u8m8(dest, u, vl); 80 | } 81 | if (count > 1 && !(src[-2] == '\\' && src[-1] == '\n')) *dest++ = last; 82 | return (dest - (uint8_t*)str); 83 | } 84 | 85 | #endif 86 | 87 | #ifdef MX 88 | 89 | .global MX(mergelines_rvv_vslide_) # generated by clang 90 | MX(mergelines_rvv_vslide_): 91 | li a2, 2 92 | bltu a1, a2, MX(rvv_6) 93 | li t0, 0 94 | li a7, 92 95 | li a6, 1 96 | mv a2, a0 97 | mv a4, a0 98 | j MX(rvv_4) 99 | MX(rvv_2): 100 | add a3, a4, a5 101 | lbu t1, 0(a3) 102 | MX(rvv_3): 103 | vle8.v v8, (a4) 104 | add a3, a4, a5 105 | vslide1up.vx v16, v8, t0 106 | vslide1down.vx v24, v8, t1 107 | vmsne.vx v0, v16, a7 108 | vmsne.vi v16, v8, 10 109 | vmor.mm v16, v0, v16 110 | vmsne.vx v17, v8, a7 111 | vmsne.vi v18, v24, 10 112 | vmor.mm v17, v17, v18 113 | vmand.mm v16, v16, v17 114 | vcompress.vm v24, v8, v16 115 | vcpop.m a4, v16 116 | vsetvli zero, a4, e8, MX(), ta, ma 117 | vse8.v v24, (a2) 118 | lbu t0, -1(a3) 119 | sub a1, a1, a5 120 | add a2, a2, a4 121 | mv a4, a3 122 | bgeu a6, a1, MX(rvv_8) 123 | MX(rvv_4): 124 | vsetvli a5, a1, e8, MX(), ta, ma 125 | bltu a5, a1, MX(rvv_2) 126 | li t1, 0 127 | j MX(rvv_3) 128 | MX(rvv_6): 129 | mv a2, a0 130 | beqz a1, MX(rvv_10) 131 | lbu a1, 0(a0) 132 | mv a2, a0 133 | j MX(rvv_11) 134 | MX(rvv_8): 135 | beqz a1, MX(rvv_10) 136 | lbu a1, 0(a3) 137 | xori a3, t0, 92 138 | xori a4, a1, 10 139 | or a3, a3, a4 140 | bnez a3, MX(rvv_11) 141 | MX(rvv_10): 142 | sub a0, a2, a0 143 
| ret 144 | MX(rvv_11): 145 | addi a3, a2, 1 146 | sb a1, 0(a2) 147 | sub a0, a3, a0 148 | ret 149 | 150 | 151 | .global MX(mergelines_rvv_vslide_skip_) # generated by clang 152 | MX(mergelines_rvv_vslide_skip_): 153 | li a2, 2 154 | bltu a1, a2, MX(rvv_skip_9) 155 | li a5, 0 156 | li a6, 92 157 | li a7, 1 158 | mv t1, a0 159 | mv a3, a0 160 | MX(rvv_skip_2): 161 | vsetvli a4, a1, e8, MX(), ta, ma 162 | bgeu a4, a1, MX(rvv_skip_4) 163 | add a2, a3, a4 164 | lbu t0, 0(a2) 165 | j MX(rvv_skip_5) 166 | MX(rvv_skip_4): 167 | li t0, 0 168 | MX(rvv_skip_5): 169 | vle8.v v8, (a3) 170 | vslide1up.vx v16, v8, a5 171 | vmsne.vx v24, v16, a6 172 | vmsne.vi v16, v8, 10 173 | vmor.mm v16, v24, v16 174 | vcpop.m a2, v16 175 | xor a2, a2, a4 176 | seqz a2, a2 177 | addi a5, t0, -10 178 | snez a5, a5 179 | and a2, a2, a5 180 | beqz a2, MX(rvv_skip_8) 181 | mv a2, a4 182 | MX(rvv_skip_7): 183 | add a3, a3, a4 184 | vsetvli zero, a2, e8, MX(), ta, ma 185 | vse8.v v8, (t1) 186 | lbu a5, -1(a3) 187 | sub a1, a1, a4 188 | add t1, t1, a2 189 | bltu a7, a1, MX(rvv_skip_2) 190 | j MX(rvv_skip_11) 191 | MX(rvv_skip_8): 192 | vslide1down.vx v24, v8, t0 193 | vmsne.vx v17, v8, a6 194 | vmsne.vi v18, v24, 10 195 | vmor.mm v17, v17, v18 196 | vmand.mm v16, v16, v17 197 | vcompress.vm v24, v8, v16 198 | vcpop.m a2, v16 199 | vmv.v.v v8, v24 200 | j MX(rvv_skip_7) 201 | MX(rvv_skip_9): 202 | mv t1, a0 203 | beqz a1, MX(rvv_skip_13) 204 | lbu a1, 0(a0) 205 | mv t1, a0 206 | j MX(rvv_skip_14) 207 | MX(rvv_skip_11): 208 | beqz a1, MX(rvv_skip_13) 209 | lbu a1, 0(a3) 210 | xori a2, a5, 92 211 | xori a3, a1, 10 212 | or a2, a2, a3 213 | bnez a2, MX(rvv_skip_14) 214 | MX(rvv_skip_13): 215 | sub a0, t1, a0 216 | ret 217 | MX(rvv_skip_14): 218 | addi a2, t1, 1 219 | sb a1, 0(t1) 220 | sub a0, a2, a0 221 | ret 222 | 223 | .global MX(mergelines_rvv_mshift_) 224 | MX(mergelines_rvv_mshift_): 225 | li a2, 2 226 | bltu a1, a2, 1f 227 | addi a2, a0, 1 228 | addi a3, a1, -1 229 | lbu a4, 0(a0) 230 | li a6, 92 231 | vsetvli a1, zero, e8, MXf8e8(), ta, ma 232 | li a7, -1 233 | mv t0, a0 234 | 2: 235 | vsetvli a5, a3, e8, MX(), ta, ma 236 | vle8.v v16, (a2) 237 | vslide1up.vx v8, v16, a4 238 | vmsne.vx v24, v8, a6 239 | vmsne.vi v25, v16, 10 240 | vmor.mm v16, v24, v25 241 | vsetvli a4, zero, e8, MXf8e8(), ta, ma 242 | vslide1up.vx v17, v16, a7 243 | vsrl.vi v17, v17, 7 244 | vadd.vv v18, v16, v16 245 | vor.vv v17, v17, v18 246 | vsetvli zero, a5, e8, MX(), ta, ma 247 | vmand.mm v16, v16, v17 248 | vcompress.vm v24, v8, v16 249 | vcpop.m a1, v16 250 | xor t1, a1, a5 251 | sltu a4, a5, a3 252 | and a4, a4, t1 253 | add a5, a5, a4 254 | vsetvli zero, a1, e8, MX(), ta, ma 255 | vse8.v v24, (t0) 256 | add a2, a2, a5 257 | lbu a4, -1(a2) 258 | sub a3, a3, a5 259 | add t0, t0, a1 260 | bnez a3, 2b 261 | lbu a1, -2(a2) 262 | li a3, 92 263 | bne a1, a3, 3f 264 | lbu a1, -1(a2) 265 | li a2, 10 266 | beq a1, a2, 4f 267 | 3: 268 | addi a1, t0, 1 269 | sb a4, 0(t0) 270 | mv t0, a1 271 | 4: 272 | sub a1, t0, a0 273 | 1: 274 | mv a0, a1 275 | ret 276 | 277 | .global MX(mergelines_rvv_mshift_skip_) 278 | MX(mergelines_rvv_mshift_skip_): 279 | li a2, 2 280 | bltu a1, a2, 1f 281 | addi a2, a0, 1 282 | addi a3, a1, -1 283 | lbu t0, 0(a0) 284 | li a7, 92 285 | vsetvli a1, zero, e8, MXf8e8(), ta, ma 286 | li a6, -1 287 | mv a5, a0 288 | j 4f 289 | 2: 290 | vsetvli a4, zero, e8, MXf8e8(), ta, ma 291 | vslide1up.vx v17, v16, a6 292 | vsrl.vi v17, v17, 7 293 | vadd.vv v18, v16, v16 294 | vor.vv v17, v17, v18 295 | vsetvli zero, a1, e8, MX(), ta, ma 296 | 
vmand.mm v16, v16, v17 297 | vcompress.vm v24, v8, v16 298 | vcpop.m t0, v16 299 | xor t1, t0, a1 300 | sltu a4, a1, a3 301 | and a4, a4, t1 302 | add a4, a4, a1 303 | mv a1, t0 304 | vmv.v.v v8, v24 305 | 3: 306 | vsetvli zero, a1, e8, MX(), ta, ma 307 | vse8.v v8, (a5) 308 | add a2, a2, a4 309 | lbu t0, -1(a2) 310 | sub a3, a3, a4 311 | add a5, a5, a1 312 | beqz a3, 5f 313 | 4: 314 | vsetvli a1, a3, e8, MX(), ta, ma 315 | vle8.v v16, (a2) 316 | vslide1up.vx v8, v16, t0 317 | vmsne.vx v24, v8, a7 318 | vmsne.vi v25, v16, 10 319 | vmor.mm v16, v24, v25 320 | vcpop.m a4, v16 321 | bne a4, a1, 2b 322 | mv a4, a1 323 | j 3b 324 | 5: 325 | lbu a1, -2(a2) 326 | li a3, 92 327 | bne a1, a3, 6f 328 | lbu a1, -1(a2) 329 | li a2, 10 330 | beq a1, a2, 7f 331 | 6: 332 | addi a1, a5, 1 333 | sb t0, 0(a5) 334 | mv a5, a1 335 | 7: 336 | sub a1, a5, a0 337 | 1: 338 | mv a0, a1 339 | ret 340 | 341 | 342 | #endif 343 | -------------------------------------------------------------------------------- /bench/mergelines.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | size_t 4 | mergelines_scalar(char *str, size_t len) 5 | { 6 | char *dest = str; 7 | char *src = str; 8 | 9 | while (len > 1) { 10 | if (src[0] == '\\' && src[1] == '\n') 11 | src += 2, len -= 2; 12 | else 13 | *dest++ = *src++, --len; 14 | BENCH_CLOBBER(); 15 | } 16 | if (len > 0) 17 | *dest++ = *src++; 18 | return dest - str; 19 | } 20 | 21 | #define IMPLS(f) \ 22 | f(scalar) \ 23 | MX(f, rvv_vslide) \ 24 | MX(f, rvv_vslide_skip) \ 25 | MX(f, rvv_mshift) \ 26 | MX(f, rvv_mshift_skip) \ 27 | 28 | typedef size_t Func(char *buf, size_t len); 29 | 30 | #define DECLARE(f) extern Func mergelines_##f; 31 | IMPLS(DECLARE) 32 | 33 | #define EXTRACT(f) { #f, &mergelines_##f }, 34 | Impl impls[] = { IMPLS(EXTRACT) }; 35 | 36 | char *str; 37 | ux last; 38 | 39 | void init(void) { } 40 | ux checksum(size_t n) { return last; } 41 | 42 | void common(size_t n, char const *chars, size_t nChars) { 43 | str = (char*)mem + (bench_urand() & 255); 44 | for (size_t i = 0; i < n; ++i) 45 | str[i] = chars[bench_urand() % nChars]; 46 | } 47 | 48 | BENCH_BEG(2_3) { 49 | common(n, "\\\na", 3); 50 | TIME last = (uintptr_t)f(str, n); 51 | } BENCH_END 52 | 53 | BENCH_BEG(2_16) { 54 | common(n, "\\\nabcdefgh", 16); 55 | TIME last = (uintptr_t)f(str, n); 56 | } BENCH_END 57 | 58 | BENCH_BEG(2_32) { 59 | common(n, "\\\nabcdefgh123456789", 32); 60 | TIME last = (uintptr_t)f(str, n); 61 | } BENCH_END 62 | 63 | BENCH_BEG(2_256) { 64 | str = (char*)mem + (bench_urand() & 255); 65 | for (size_t i = 0; i < n; ++i) 66 | str[i] = bench_urand() & 0xff; 67 | TIME last = (uintptr_t)f(str, n); 68 | } BENCH_END 69 | 70 | #define COUNT SCALE_mergelines(MAX_MEM) - 256 71 | Bench benches[] = { 72 | BENCH( impls, COUNT, "mergelines 2/3", bench_2_3 ), 73 | BENCH( impls, COUNT, "mergelines 2/16", bench_2_16 ), 74 | BENCH( impls, COUNT, "mergelines 2/32", bench_2_32 ), 75 | BENCH( impls, COUNT, "mergelines 2/256", bench_2_256 ) 76 | }; BENCH_MAIN(benches) 77 | 78 | -------------------------------------------------------------------------------- /bench/poly1305.S: -------------------------------------------------------------------------------- 1 | #ifndef MX 2 | #if __riscv_xlen != 32 3 | #include "../thirdparty/rvv-chacha-poly/vpoly.s" 4 | #endif 5 | #endif 6 | -------------------------------------------------------------------------------- /bench/poly1305.c: 
-------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | #if __riscv_xlen != 32 3 | #include "../thirdparty/rvv-chacha-poly/boring.h" 4 | 5 | uint8_t *src; 6 | uint8_t key[32], sig[16]; 7 | 8 | extern uint64_t 9 | vector_poly1305(const uint8_t* in, size_t len, 10 | const uint8_t key[32], uint8_t sig[16]); 11 | 12 | static void 13 | poly1305_boring(void const *src, size_t n) { 14 | poly1305_state state; 15 | boring_poly1305_init(&state, key); 16 | boring_poly1305_update(&state, src, n); 17 | boring_poly1305_finish(&state, sig); 18 | } 19 | 20 | static void 21 | poly1305_rvv(void const *src, size_t n) { 22 | vector_poly1305(src, n, key, sig); 23 | } 24 | 25 | typedef void *Func(void const *src, size_t n); 26 | 27 | Impl impls[] = { 28 | { "boring", &poly1305_boring }, 29 | IF_VE64({ "rvv", &poly1305_rvv },) 30 | }; 31 | 32 | void init(void) { 33 | bench_memrand(key, sizeof key); 34 | bench_memrand(sig, sizeof sig); 35 | } 36 | 37 | ux checksum(size_t n) { 38 | ux sum = 0; 39 | for (size_t i = 0; i < ARR_LEN(sig); ++i) 40 | sum = uhash(sum) + sig[i]; 41 | return sum; 42 | } 43 | 44 | BENCH_BEG(aligned) { 45 | for (size_t i = 0; i < 256; ++i) 46 | mem[bench_urand()%n] = bench_urand(); 47 | n = (15+n) & -16; 48 | TIME f(mem, n); 49 | } BENCH_END 50 | 51 | Bench benches[] = { 52 | BENCH( impls, MAX_MEM, "poly1305 aligned", bench_aligned ) 53 | }; BENCH_MAIN(benches) 54 | 55 | 56 | #include "../thirdparty/rvv-chacha-poly/boring.c" 57 | #else 58 | void init(void) {} 59 | Impl impls[] = {}; 60 | Bench benches[] = {}; 61 | BENCH_MAIN(benches) 62 | #endif 63 | -------------------------------------------------------------------------------- /bench/strlen.S: -------------------------------------------------------------------------------- 1 | #if 0 2 | size_t strlen_rvv(char *src) { 3 | size_t vlmax = __riscv_vsetvlmax_e8m8(); 4 | char *p = src; 5 | long first = -1; 6 | size_t vl; 7 | while (first < 0) { 8 | vuint8m8_t v = __riscv_vle8ff_v_u8m8((uint8_t*)p, &vl, vlmax); 9 | first = __riscv_vfirst_m_b1(__riscv_vmseq_vx_u8m8_b1(v, 0, vl), vl); 10 | p += vl; 11 | } 12 | p -= vl - first; 13 | return (size_t)(p - src); 14 | } 15 | 16 | #define PAGE_SIZE 4096 17 | size_t strlen_rvv_page_aligned_(char *src) { 18 | char *p = src; 19 | long first = 0; 20 | 21 | size_t n = 0 - ((uintptr_t)src | -4096); 22 | size_t vl; 23 | for (; n > 0; n -= vl) { 24 | vl = __riscv_vsetvl_e8m8(n); 25 | vuint8m8_t v = __riscv_vle8_v_u8m8((uint8_t*)p, vl); 26 | first = __riscv_vfirst_m_b1(__riscv_vmseq_vx_u8m8_b1(v, 0, vl), vl); 27 | p += vl; 28 | if (first >= 0) { 29 | goto end; 30 | } 31 | } 32 | vl = __riscv_vsetvlmax_e8m8(); 33 | do { 34 | vuint8m8_t v = __riscv_vle8_v_u8m8((uint8_t*)p, vl); 35 | first = __riscv_vfirst_m_b1(__riscv_vmseq_vx_u8m8_b1(v, 0, vl), vl); 36 | p += vl; 37 | } while (first < 0); 38 | end: 39 | p -= vl - first; 40 | return (size_t)(p - src); 41 | } 42 | #endif 43 | 44 | 45 | #ifdef MX 46 | 47 | .global MX(strlen_rvv_) 48 | MX(strlen_rvv_): 49 | mv a3, a0 50 | 1: 51 | vsetvli a1, x0, e8, MX(), ta, ma 52 | vle8ff.v v8, (a3) 53 | csrr a1, vl 54 | vmseq.vi v0, v8, 0 55 | vfirst.m a2, v0 56 | add a3, a3, a1 # end += vl 57 | bltz a2, 1b 58 | add a0, a0, a1 # start += vl 59 | add a3, a3, a2 # end += idx 60 | sub a0, a3, a0 # start - end 61 | ret 62 | 63 | .global MX(strlen_rvv_page_aligned_) # generated by clang 64 | MX(strlen_rvv_page_aligned_): 65 | lui a1, 1048575 66 | or a1, a1, a0 67 | neg a4, a1 68 | mv a1, a0 69 | 1: 70 | vsetvli a2, a4, e8, MX(), ta, ma 
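# a4 was set above to the number of bytes left in the current 4 KiB page, so the
# plain vle8.v loads in this first loop never cross the page boundary and cannot
# fault past the end of the string; once the pointer reaches the page boundary,
# the second loop below loads whole VLMAX chunks, which also stay within a page
# as long as VLMAX <= 4096.  This avoids the fault-only-first vle8ff.v loads
# used by strlen_rvv_ above.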
71 | vle8.v v8, (a1) 72 | vmseq.vi v16, v8, 0 73 | vfirst.m a3, v16 74 | add a1, a1, a2 75 | bgez a3, 1f 76 | sub a4, a4, a2 77 | bnez a4, 1b 78 | vsetvli a2, zero, e8, MX(), ta, ma 79 | 2: 80 | vle8.v v8, (a1) 81 | vmseq.vi v16, v8, 0 82 | vfirst.m a3, v16 83 | add a1, a1, a2 84 | bltz a3, 2b 85 | 1: 86 | sub a1, a1, a2 87 | sub a0, a3, a0 88 | add a0, a0, a1 89 | ret 90 | 91 | #endif 92 | -------------------------------------------------------------------------------- /bench/strlen.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | size_t 4 | strlen_scalar(char const *s) 5 | { 6 | char const *a = s; 7 | while (*s) ++s, BENCH_CLOBBER(); 8 | return s - a; 9 | } 10 | 11 | size_t 12 | strlen_scalar_autovec(char const *s) 13 | { 14 | char const *a = s; 15 | while (*s) ++s; 16 | return s - a; 17 | } 18 | 19 | /* https://git.musl-libc.org/cgit/musl/tree/src/string/strlen.c */ 20 | #define ONES ((size_t)-1/UCHAR_MAX) 21 | #define HIGHS (ONES * (UCHAR_MAX/2+1)) 22 | #define HASZERO(x) (((x)-ONES) & ~(x) & HIGHS) 23 | size_t 24 | strlen_musl(char const *s) 25 | { 26 | char const *a = s; 27 | #ifdef __GNUC__ 28 | typedef size_t __attribute__((__may_alias__)) word; 29 | word const *w; 30 | for (; (uintptr_t)s % sizeof *w; s++) if (!*s) return s-a; 31 | for (w = (void const*)s; !HASZERO(*w); w++); 32 | s = (void const*)w; 33 | #endif 34 | for (; *s; s++); 35 | return s-a; 36 | } 37 | 38 | #define strlen_libc strlen 39 | 40 | #define IMPLS(f) \ 41 | f(scalar) \ 42 | f(scalar_autovec) \ 43 | IFHOSTED(f(libc)) \ 44 | f(musl) \ 45 | MX(f, rvv_page_aligned) \ 46 | MX(f, rvv) \ 47 | 48 | 49 | typedef size_t Func(char const *s); 50 | 51 | #define DECLARE(f) extern Func strlen_##f; 52 | IMPLS(DECLARE) 53 | 54 | #define EXTRACT(f) { #f, &strlen_##f }, 55 | Impl impls[] = { IMPLS(EXTRACT) }; 56 | 57 | ux last; 58 | 59 | void init(void) { 60 | for (size_t i = 0; i < MAX_MEM; ++i) 61 | mem[i] += !mem[i]; // remove null bytes 62 | } 63 | 64 | ux checksum(size_t n) { return last; } 65 | 66 | BENCH_BEG(base) { 67 | char *p = (char*)mem + (bench_urand() % 511); 68 | p[n] = 0; 69 | TIME last = f(p); 70 | p[n] = bench_urand() | 1; 71 | } BENCH_END 72 | 73 | Bench benches[] = { 74 | BENCH( impls, MAX_MEM - 521, "strlen", bench_base ), 75 | }; BENCH_MAIN(benches) 76 | 77 | -------------------------------------------------------------------------------- /bench/template.S: -------------------------------------------------------------------------------- 1 | #define NOLIBC_DEFINE_ONLY 2 | #include "../nolibc.h" 3 | #include "config.h" 4 | .text 5 | .balign 8 6 | 7 | #define CAT_(a,b) a##b 8 | #define CAT(a,b) CAT_(a,b) 9 | 10 | #define STR(x) #x 11 | #define STRe(x) STR(x) 12 | 13 | #define MX_N 0 14 | #include STRe(INC) 15 | 16 | #undef MX_N 17 | 18 | #define MX_N 1 19 | #define MX8(x) x##m8 20 | #define MX4(x) x##m4 21 | #define MX2(x) x##m2 22 | #define MX(x) x##m1 23 | #define MXf2(x) x##mf2 24 | #define MXf4(x) x##mf4 25 | #if __riscv_v_elen >= 64 26 | #define MXf8e8(x) x##mf8 27 | #else 28 | #define MXf8e8(x) x##mf4 29 | #endif 30 | #include STRe(INC) 31 | 32 | #undef MX_N 33 | #undef MX8 34 | #undef MX4 35 | #undef MX2 36 | #undef MX 37 | #undef MXf2 38 | #undef MXf4 39 | #undef MXf8 40 | #undef MXf8e8 41 | 42 | #define MX_N 2 43 | #define MX4(x) x##m8 44 | #define MX2(x) x##m4 45 | #define MX(x) x##m2 46 | #define MXf2(x) x##m1 47 | #define MXf4(x) x##mf2 48 | #define MXf8(x) x##mf4 49 | #define MXf8e8(x) x##mf4 50 | #include STRe(INC) 51 | 52 | #undef 
MX_N 53 | #undef MX4 54 | #undef MX2 55 | #undef MX 56 | #undef MXf2 57 | #undef MXf4 58 | #undef MXf8 59 | #undef MXf8e8 60 | 61 | #define MX_N 4 62 | #define MX2(x) x##m8 63 | #define MX(x) x##m4 64 | #define MXf2(x) x##m2 65 | #define MXf4(x) x##m1 66 | #define MXf8(x) x##mf2 67 | #define MXf8e8(x) x##mf2 68 | #include STRe(INC) 69 | 70 | #undef MX_N 71 | #undef MX2 72 | #undef MX 73 | #undef MXf2 74 | #undef MXf4 75 | #undef MXf8 76 | #undef MXf8e8 77 | 78 | #define MX_N 8 79 | #define MX(x) x##m8 80 | #define MXf2(x) x##m4 81 | #define MXf4(x) x##m2 82 | #define MXf8(x) x##m1 83 | #define MXf8e8(x) x##m1 84 | #include STRe(INC) 85 | 86 | -------------------------------------------------------------------------------- /bench/utf8_count.S: -------------------------------------------------------------------------------- 1 | #if 0 2 | size_t utf8_count_rvv(char const *buf, size_t len) { 3 | size_t sum = 0; 4 | for (size_t vl; len > 0; len -= vl, buf += vl) { 5 | vl = __riscv_vsetvl_e8m8(len); 6 | vint8m8_t v = __riscv_vle8_v_i8m8((void*)buf, vl); 7 | vbool1_t mask = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl); 8 | sum += __riscv_vcpop_m_b1(mask, vl); 9 | } 10 | return sum; 11 | } 12 | #endif 13 | 14 | #ifdef MX 15 | 16 | .global MX(utf8_count_rvv_) 17 | MX(utf8_count_rvv_): 18 | li a2, 0 19 | li a3, -65 20 | 1: 21 | vsetvli a4, a1, e8, MX(), ta, ma 22 | vle8.v v8, (a0) 23 | vmsgt.vx v16, v8, a3 24 | vcpop.m a5, v16 25 | add a2, a2, a5 26 | sub a1, a1, a4 27 | add a0, a0, a4 28 | bnez a1, 1b 29 | mv a0, a2 30 | ret 31 | 32 | .global MX(utf8_count_rvv_align_) 33 | MX(utf8_count_rvv_align_): 34 | mv a2, a0 35 | li a0, 0 36 | li a3, -65 37 | vsetvli t0, zero, e8, MX(), ta, ma # vlen 38 | bltu a1, t0, 2f # len < vlen 39 | # align dest to vlen 40 | sub t1, zero, a2 41 | remu t1, t1, t0 # align = (-dest) % vlen 42 | vsetvli t0, t1, e8, MX(), ta, ma 43 | 1: 44 | vle8.v v8,(a2) 45 | vmsgt.vx v16, v8, a3 46 | vcpop.m a4, v16 47 | add a0, a0, a4 48 | sub a1, a1, t0 49 | add a2, a2, t0 50 | 2: 51 | vsetvli t0, a1, e8, MX(), ta, ma 52 | bnez a1, 1b 53 | ret 54 | 55 | .global MX(utf8_count_rvv_tail_) 56 | MX(utf8_count_rvv_tail_): 57 | vsetvli t0, a1, e8, MX(), ta, ma 58 | remu a2, a1, t0 # tail = n % vlenb 59 | sub a1, a1, a2 # n -= tail 60 | add a3, a0, a1 # end = dest + n 61 | mv a1, a0 # n = dest 62 | li a0, 0 63 | li t1, -65 64 | 1: 65 | vle8.v v8, (a1) 66 | vmsgt.vx v16, v8, t1 67 | vcpop.m t2, v16 68 | add a0, a0, t2 69 | add a1, a1, t0 # src += vlenb 70 | bltu a1, a3, 1b # dest < end 71 | # copy tail 72 | vsetvli zero, a2, e8, MX(), ta, ma 73 | vle8.v v8, (a1) 74 | vmsgt.vx v16, v8, t1 75 | vcpop.m t2, v16 76 | add a0, a0, t2 77 | ret 78 | 79 | # this is supposed to test how well the implementation handles 80 | # operations with an vl smaller than VLMAX 81 | .global MX(utf8_count_rvv_128_) 82 | MX(utf8_count_rvv_128_): 83 | li t0, 128/8 84 | bgt a1, t0, 1f 85 | mv t0, a1 86 | 1: 87 | vsetvli t0, t0, e8, MX(), ta, ma 88 | remu a2, a1, t0 # tail = n % vlenb 89 | sub a1, a1, a2 # n -= tail 90 | add a3, a0, a1 # end = dest + n 91 | mv a1, a0 # n = dest 92 | li a0, 0 93 | li t1, -65 94 | 1: 95 | vle8.v v8, (a1) 96 | vmsgt.vx v16, v8, t1 97 | vcpop.m t2, v16 98 | add a0, a0, t2 99 | add a1, a1, t0 # src += vlenb 100 | bltu a1, a3, 1b # dest < end 101 | # copy tail 102 | vsetvli zero, a2, e8, MX(), ta, ma 103 | vle8.v v8, (a1) 104 | vmsgt.vx v16, v8, t1 105 | vcpop.m t2, v16 106 | add a0, a0, t2 107 | ret 108 | 109 | 110 | .global MX(utf8_count_rvv_4x_) 111 | MX(utf8_count_rvv_4x_): 112 | mv a2, a0 113 | li 
a0, 0 114 | li a6, -65 115 | 1: 116 | vsetvli a4, a1, e8, MX(), ta, ma 117 | vle8.v v8, (a2) 118 | vmsgt.vx v16, v8, a6 119 | vcpop.m a7, v16 120 | sub a1, a1, a4 121 | add a2, a2, a4 122 | vsetvli a4, a1, e8, MX(), ta, ma 123 | vle8.v v8, (a2) 124 | vmsgt.vx v16, v8, a6 125 | vcpop.m a3, v16 126 | sub a1, a1, a4 127 | add a2, a2, a4 128 | vsetvli a4, a1, e8, MX(), ta, ma 129 | vle8.v v8, (a2) 130 | vmsgt.vx v16, v8, a6 131 | vcpop.m a5, v16 132 | sub a1, a1, a4 133 | add a2, a2, a4 134 | vsetvli a4, a1, e8, MX(), ta, ma 135 | vle8.v v8, (a2) 136 | add a0, a0, a7 137 | add a0, a0, a3 138 | add a0, a0, a5 139 | vmsgt.vx v16, v8, a6 140 | vcpop.m a3, v16 141 | add a0, a0, a3 142 | sub a1, a1, a4 143 | add a2, a2, a4 144 | bnez a1, 1b 145 | ret 146 | 147 | // gcc generated from unrolled intrinsics implementation: 148 | // https://godbolt.org/z/q75c6r3Ta 149 | .global MX(utf8_count_rvv_4x_tail_) 150 | MX(utf8_count_rvv_4x_tail_): 151 | vsetvli a5, zero, e8, MX(), ta, ma 152 | slli t3, a5, 2 153 | add a1, a0, a1 154 | add a2, a0, t3 155 | mv a4, a0 156 | bltu a1, a2, 5f 157 | slli t4, a5, 1 158 | add t5, t4, a5 159 | li a0, 0 160 | li a6, -65 161 | 1: 162 | add a3, a5, a4 163 | vsetvli zero, zero, e8, MX(), ta, ma 164 | add a7, t4, a4 165 | vle8.v v8, (a4) 166 | vle8.v v16, (a3) 167 | vmsgt.vx v8, v8, a6 168 | vmsgt.vx v16, v16, a6 169 | vcpop.m a3, v8 170 | vcpop.m t1, v16 171 | add a3, a3, t1 172 | vle8.v v8, (a7) 173 | add a4, t5, a4 174 | vmsgt.vx v8, v8, a6 175 | vcpop.m a7, v8 176 | add a3, a3, a7 177 | vle8.v v8, (a4) 178 | mv a4, a2 179 | vmsgt.vx v8, v8, a6 180 | add a2, a2, t3 181 | vcpop.m a7, v8 182 | add a3, a3, a7 183 | add a0, a0, a3 184 | bgeu a1, a2, 1b 185 | 2: 186 | sub a3, a1, a4 187 | beq a1, a4, 4f 188 | li a2, 0 189 | li a1, -65 190 | 3: 191 | vsetvli a5, a3, e8, MX(), ta, ma 192 | sub a3, a3, a5 193 | vle8.v v8, (a4) 194 | add a4, a4, a5 195 | vmsgt.vx v8, v8, a1 196 | vcpop.m a5, v8 197 | add a2, a2, a5 198 | bne a3, zero, 3b 199 | add a0, a0, a2 200 | 4: 201 | ret 202 | 5: 203 | li a0, 0 204 | j 2b 205 | 206 | 207 | 208 | 209 | #endif 210 | 211 | 212 | 213 | 214 | -------------------------------------------------------------------------------- /bench/utf8_count.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | size_t 4 | utf8_count_scalar(char const *str, size_t len) 5 | { 6 | uint8_t const *p = (uint8_t const*)str; 7 | size_t count = 0; 8 | while (len--) count += (*p++ & 0xc0) != 0x80, BENCH_CLOBBER(); 9 | return count; 10 | } 11 | 12 | size_t 13 | utf8_count_scalar_autovec(char const *str, size_t len) 14 | { 15 | uint8_t const *p = (uint8_t const*)str; 16 | size_t count = 0; 17 | while (len--) count += (*p++ & 0xc0) != 0x80; 18 | return count; 19 | } 20 | 21 | #define GEN_SWAR(name, popc, clobber) \ 22 | size_t \ 23 | utf8_count_##name(char const *str, size_t len) \ 24 | { \ 25 | ux const BENCH_MAY_ALIAS *u; \ 26 | size_t count = 0, tail = 0; \ 27 | \ 28 | uint8_t const *u8 = (uint8_t const*)str; \ 29 | if (len < sizeof *u) { \ 30 | tail = len; \ 31 | goto skip; \ 32 | } \ 33 | \ 34 | tail = sizeof *u - (uintptr_t)str % sizeof *u; \ 35 | \ 36 | len -= tail; \ 37 | while (tail--) \ 38 | count += (*u8++ & 0xC0) != 0x80, clobber; \ 39 | \ 40 | u = (ux const*)u8; \ 41 | tail = len % sizeof *u; \ 42 | \ 43 | for (len /= sizeof *u; len--; ++u) { \ 44 | ux b1 = ~*u & (ux)0x8080808080808080; \ 45 | ux b2 = *u & (ux)0x4040404040404040; \ 46 | count += popc((b1 >> 1) | b2); \ 47 | clobber; \ 48 | } \ 49 | \ 50 | 
u8 = (uint8_t const*)u; \ 51 | skip: \ 52 | while (tail--) \ 53 | count += (*u8++ & 0xC0) != 0x80, clobber; \ 54 | return count; \ 55 | } 56 | 57 | #if __riscv_zbb 58 | GEN_SWAR(SWAR_popc,__builtin_popcountll,BENCH_CLOBBER()) 59 | GEN_SWAR(SWAR_popc_autovec,__builtin_popcountll,(void)0) 60 | # define POPC(f) f(SWAR_popc) f(SWAR_popc_autovec) 61 | #else 62 | # define POPC(f) 63 | #endif 64 | 65 | static inline int 66 | upopcnt(ux x) 67 | { 68 | /* 2-bit sums */ 69 | x -= (x >> 1) & (-(ux)1/3); 70 | /* 4-bit sums */ 71 | x = (x & (-(ux)1/15*3)) + ((x >> 2) & (-(ux)1/15*3)); 72 | /* 8-bit sums */ 73 | x = (x + (x >> 4)) & (-(ux)1/255*15); 74 | BENCH_CLOBBER(); 75 | /* now we can just add the sums together, because can't overflow, 76 | * since there can't be more than 255 bits set */ 77 | x += (x >> 8); /* 16-bit sums */ 78 | x += (x >> 16); /* sum 16-bit sums */ 79 | IF64(x += (x >> 32)); /* sum 32-bit sums */ 80 | return x & 127; 81 | } 82 | 83 | 84 | GEN_SWAR(SWAR_popc_bithack,upopcnt,BENCH_CLOBBER()) 85 | GEN_SWAR(SWAR_popc_bithack_autovec,upopcnt,(void)0) 86 | 87 | 88 | #define IMPLS(f) \ 89 | f(scalar) \ 90 | f(scalar_autovec) \ 91 | POPC(f) \ 92 | f(SWAR_popc_bithack) \ 93 | f(SWAR_popc_bithack_autovec) \ 94 | MX(f, rvv) \ 95 | MX(f, rvv_align) \ 96 | MX(f, rvv_tail) \ 97 | MX(f, rvv_128) \ 98 | MX(f, rvv_4x) \ 99 | MX(f, rvv_4x_tail) \ 100 | 101 | typedef size_t Func(char const *str, size_t len); 102 | 103 | #define DECLARE(f) extern Func utf8_count_##f; 104 | IMPLS(DECLARE) 105 | 106 | #define EXTRACT(f) { #f, &utf8_count_##f }, 107 | Impl impls[] = { IMPLS(EXTRACT) }; 108 | 109 | char *str; 110 | ux last; 111 | 112 | void init(void) { } 113 | ux checksum(size_t n) { return last; } 114 | 115 | void common(size_t n, size_t off) { 116 | str = (char*)mem + off; 117 | bench_memrand(str, n + 9); 118 | } 119 | 120 | BENCH_BEG(base) { 121 | common(n, bench_urand() & 511); 122 | TIME last = (uintptr_t)f(str, n); 123 | } BENCH_END 124 | 125 | BENCH_BEG(aligned) { 126 | common(n, 0); 127 | TIME last = (uintptr_t)f(str, n); 128 | } BENCH_END 129 | 130 | Bench benches[] = { 131 | BENCH( impls, MAX_MEM - 521, "utf8 count", bench_base ), 132 | BENCH( impls, MAX_MEM - 521, "utf8 count aligned", bench_aligned ) 133 | }; BENCH_MAIN(benches) 134 | 135 | 136 | -------------------------------------------------------------------------------- /config.mk: -------------------------------------------------------------------------------- 1 | WARN=-Wall -Wextra -Wno-unused-function -Wno-unused-parameter 2 | 3 | # append -DUSE_PERF_EVENT to CFLAGS, if the cycle csr isn't exposed 4 | # try -DUSE_PERF_EVENT_SLOW if the abvoe doesn't work 5 | 6 | # freestanding using any recent clang build 7 | CC=clang 8 | CFLAGS=--target=riscv64 -march=rv64gcv_zba_zbb_zbs -O3 ${WARN} -nostdlib -fno-builtin -ffreestanding 9 | #CFLAGS=--target=riscv32 -march=rv32gc_zve32f_zba_zbb_zbs -O3 ${WARN} -nostdlib -fno-builtin -ffreestanding 10 | 11 | 12 | # full cross compilation toolchain 13 | #CC=riscv64-linux-gnu-gcc 14 | #CFLAGS=-march=rv64gcv -O3 ${WARN} 15 | 16 | # native build 17 | #CC=cc 18 | #CFLAGS=-march=rv64gcv -O3 ${WARN} 19 | 20 | 21 | -------------------------------------------------------------------------------- /instructions/rvv/Makefile: -------------------------------------------------------------------------------- 1 | .POSIX: 2 | 3 | include ../../config.mk 4 | 5 | all: rvv 6 | 7 | rvv: gen.S main.c config.h 8 | m4 gen.S > main.S 9 | ${CC} ${CFLAGS} main.S main.c -o $@ 10 | rm main.S 11 | 12 | clean: 13 | rm -f rvv 14 
| 15 | run: rvv 16 | ../../run.sh ./rvv 17 | -------------------------------------------------------------------------------- /instructions/rvv/config.h: -------------------------------------------------------------------------------- 1 | #define WARMUP 1000 2 | #define UNROLL 8 // automatically *8 3 | #define LOOP 512 4 | #define RUNS 32 5 | 6 | // processor specific configs 7 | // m8 m4 m2 m1 mf2 mf4 mf8 8 | // SEW: 6310 6310 6310 6310 9 | // 4268... 4268... 4268... 4268... 10 | #define T_A 0b1111111111111111111111111111 // all 11 | #define T_W 0b0000011101110111011101110111 // widen 12 | #define T_WR 0b0111011101110111011101110111 // widen reduction 13 | #define T_N 0b0000011101110111011101110111 // narrow 14 | #define T_F 0b1110111011101110111011101110 // float 15 | #define T_FW 0b0000011001100110011001100110 // float widen 16 | #define T_FWR 0b0110011001100110011001100110 // float widen reduction 17 | #define T_FN 0b0000011001100110011001100110 // float narrow 18 | 19 | #define T_E2 0b1110111011101110111011101110 // extend 2 20 | #define T_E4 0b1100110011001100110011001100 // extend 4 21 | #define T_E8 0b1000100010001000100010001000 // extend 8 22 | #define T_ei16 0b1110111111111111111111111111 // no m8 23 | 24 | // special: 25 | #define T_m1 ((1 << 28) | T_A) // emul<=1 26 | -------------------------------------------------------------------------------- /instructions/rvv/main.c: -------------------------------------------------------------------------------- 1 | #include "../../nolibc.h" 2 | #include "config.h" 3 | 4 | static ux seed = 123456; 5 | 6 | typedef ux (*BenchFunc)(void); 7 | extern size_t bench_count; 8 | extern char bench_names; 9 | extern ux bench_types; 10 | extern BenchFunc bench_mf8, bench_mf4, bench_mf2, bench_m1, bench_m2, bench_m4, bench_m8; 11 | static BenchFunc *benches[] = { &bench_mf8, &bench_mf4, &bench_mf2, &bench_m1, &bench_m2, &bench_m4, &bench_m8 }; 12 | 13 | extern ux run_bench(ux (*bench)(void), ux type, ux vl, ux seed); 14 | 15 | 16 | static int 17 | compare_ux(void const *a, void const *b) 18 | { 19 | return (*(ux*)a > *(ux*)b) - (*(ux*)a < *(ux*)b); 20 | } 21 | 22 | 23 | static void 24 | run_all_types(char const *name, ux bIdx, ux vl, int ta, int ma) 25 | { 26 | ux arr[RUNS]; 27 | 28 | 29 | print("")(s,name)(""); 30 | ux mask = bIdx[&bench_types]; 31 | 32 | ux lmuls[] = { 5, 6, 7, 0, 1, 2, 3 }; 33 | 34 | for (ux sew = 0; sew < 4; ++sew) 35 | for (ux lmul_idx = 0; lmul_idx < 7; ++lmul_idx) { 36 | ux lmul = lmuls[lmul_idx]; 37 | ux vtype = lmul | (sew<<3) | (!!ta << 6) | (!!ma << 7); 38 | 39 | if (!(mask >> (lmul_idx*4 + sew) & 1)) { 40 | print(""); 41 | continue; 42 | } 43 | 44 | ux lmul_val = 1 << lmul_idx; // fixed-point, denum 8 45 | ux sew_val = 1 << (sew + 3); 46 | // > For a given supported fractional LMUL setting, 47 | // > implementations must support SEW settings between SEWMIN 48 | // > and LMUL * ELEN, inclusive. 49 | if (sew_val * 8 > lmul_val * __riscv_v_elen) { 50 | print(""); 51 | continue; 52 | } 53 | 54 | ux emul = lmul_idx; 55 | if (mask == T_W || mask == T_FW || mask == T_N || mask == T_FN) 56 | emul += 1; 57 | if (mask == T_ei16 && sew == 0) 58 | emul = emul < 7 ? 
emul+1 : 7; 59 | if (mask == T_m1) 60 | emul = 4; // m2 61 | BenchFunc bench = benches[emul][bIdx]; 62 | 63 | for (ux i = 0; i < RUNS; ++i) { 64 | arr[i] = run_bench(bench, vtype, vl, seed); 65 | if (~arr[i] == 0) goto skip; 66 | seed = seed*7 + 13; 67 | } 68 | #if RUNS > 4 69 | qsort(arr, RUNS, sizeof *arr, compare_ux); 70 | ux sum = 0, count = 0; 71 | for (ux i = RUNS * 0.2f; i < RUNS * 0.8f; ++i, ++count) 72 | sum += arr[i]; 73 | #else 74 | ux sum = 0, count = RUNS; 75 | for (ux i = 0; i < RUNS; ++i) 76 | sum += arr[i]; 77 | #endif 78 | print("")(fn,2,sum * 1.0f/(UNROLL*LOOP*count*8))(""); 79 | continue; 80 | skip: 81 | print(""); 82 | } 83 | print("\n")(flush,); 84 | } 85 | 86 | int 87 | main(void) 88 | { 89 | size_t x; 90 | seed = rv_cycles(); 91 | seed ^= (uintptr_t)&x; 92 | 93 | ux vlarr[] = { 0, 1 }; 94 | for (ux i = 0; i < 2; ++i) { 95 | for (ux j = 4; j--; ) { 96 | print("\n"); 97 | if (vlarr[i] != 0) 98 | print("vl=")(u,vlarr[i]); 99 | else 100 | print("vl=VLMAX"); 101 | print(s,j & 2 ? " ta" : " tu")(s,j & 1 ? " ma" : " mu")("\n\n"); 102 | char const *name = &bench_names; 103 | for (ux bIdx = 0; bIdx < bench_count; ++bIdx) { 104 | run_all_types(name, bIdx, vlarr[i], j >> 1, j & 1); 105 | while (*name++); 106 | } 107 | } 108 | } 109 | return 0; 110 | } 111 | -------------------------------------------------------------------------------- /instructions/scalar/Makefile: -------------------------------------------------------------------------------- 1 | .POSIX: 2 | 3 | include ../../config.mk 4 | 5 | all: scalar 6 | 7 | scalar: main.S main.c config.h 8 | ${CC} ${CFLAGS} main.S main.c -o $@ 9 | 10 | clean: 11 | rm -f scalar 12 | 13 | run: scalar 14 | ../../run.sh ./scalar 15 | -------------------------------------------------------------------------------- /instructions/scalar/config.h: -------------------------------------------------------------------------------- 1 | #define WARMUP 1000 2 | #define UNROLL 64 3 | #define LOOP 512 4 | #define RUNS 64 5 | 6 | -------------------------------------------------------------------------------- /instructions/scalar/main.c: -------------------------------------------------------------------------------- 1 | #include "../../nolibc.h" 2 | #include "config.h" 3 | 4 | static ux mem[128]; 5 | static ux seed = 123456; 6 | 7 | extern char const benchmark_names; 8 | extern ux (*benchmarks)(void); 9 | extern ux run_bench(ux (*bench)(void), void *ptr, ux seed); 10 | 11 | 12 | static int 13 | compare_ux(void const *a, void const *b) 14 | { 15 | return (*(ux*)a > *(ux*)b) - (*(ux*)a < *(ux*)b); 16 | } 17 | 18 | void 19 | run(char const *name, ux (*bench)(void)) { 20 | ux arr[RUNS]; 21 | 22 | print("")(s,name)(""); 23 | for (ux i = 0; i < RUNS; ++i) { 24 | arr[i] = run_bench(bench, mem, seed); 25 | seed = seed*7 + 13; 26 | } 27 | 28 | #if RUNS > 4 29 | qsort(arr, RUNS, sizeof *arr, compare_ux); 30 | ux sum = 0, count = 0; 31 | for (ux i = RUNS * 0.2f; i < RUNS * 0.8f; ++i, ++count) 32 | sum += arr[i]; 33 | #else 34 | ux sum = 0, count = RUNS; 35 | for (ux i = 0; i < RUNS; ++i) 36 | sum += arr[i]; 37 | #endif 38 | 39 | print("")(fn,2,sum * 1.0f/(UNROLL*LOOP*count))(""); 40 | print("\n")(flush,); 41 | } 42 | 43 | 44 | int 45 | main(void) 46 | { 47 | size_t x; 48 | seed = rv_cycles(); 49 | seed ^= (uintptr_t)&x; 50 | 51 | ux (**it)(void) = &benchmarks; 52 | char const *name = &benchmark_names; 53 | while (*it) { 54 | run(name, *it); 55 | ++it; 56 | while (*name++); 57 | } 58 | return 0; 59 | } 60 | 
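/* The harnesses in instructions/rvv, instructions/scalar, and instructions/xtheadvector
 * all reduce their RUNS samples the same way: sort the measurements, drop the fastest
 * and slowest 20%, and average the middle 60% before dividing by the UNROLL*LOOP
 * iteration count (times 8 for the RVV harness). A minimal self-contained sketch of
 * that trimmed mean follows; it assumes plain uint64_t samples, and the helper names
 * are illustrative rather than taken from the repository. */
#include <stdint.h>
#include <stdlib.h>

static int
cmp_u64(void const *a, void const *b)
{
	uint64_t x = *(uint64_t const*)a, y = *(uint64_t const*)b;
	return (x > y) - (x < y); /* avoids the overflow of a plain subtraction */
}

/* mean of the middle 60% of samples; the float bounds mirror the harnesses'
 * "#if RUNS > 4" branch */
static double
trimmed_mean(uint64_t *samples, size_t n)
{
	double sum = 0;
	size_t count = 0;
	qsort(samples, n, sizeof *samples, cmp_u64);
	for (size_t i = n * 0.2f; i < n * 0.8f; ++i, ++count)
		sum += (double)samples[i];
	return count ? sum / count : 0;
}
/* Trimming both tails keeps the per-iteration figure robust against outliers such as
 * interrupts or warm-up effects, which is presumably why the harnesses only fall back
 * to a plain mean when RUNS <= 4. */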
-------------------------------------------------------------------------------- /instructions/xtheadvector/Makefile: -------------------------------------------------------------------------------- 1 | .POSIX: 2 | 3 | include ../../config.mk 4 | 5 | all: xtheadvector 6 | 7 | xtheadvector: main.S main.c config.h 8 | ${CC} ${CFLAGS} main.S main.c -o $@ 9 | 10 | clean: 11 | rm -f xtheadvector 12 | 13 | run: xtheadvector 14 | ../../run.sh ./xtheadvector 15 | -------------------------------------------------------------------------------- /instructions/xtheadvector/config.h: -------------------------------------------------------------------------------- 1 | #define WARMUP 1000 2 | #define UNROLL 64 3 | #define LOOP 16 4 | #define RUNS 1000 5 | 6 | /* processor specific configs */ 7 | #if 1 8 | // C920/C906 (some boards, e.g. BL808): e8/16/32/64 f16/32/64 9 | #define T_A 0b11111111 10 | #define T_W 0b01110111 11 | #define T_N 0b01110111 12 | #define T_F 0b11101111 13 | #define T_FW 0b01100111 14 | #define T_FN 0b01100111 15 | #else 16 | // C906: e8/16/32 f16/32 17 | #define T_A 0b1111111 18 | #define T_W 0b0110111 19 | #define T_N 0b0110111 20 | #define T_F 0b1101111 21 | #define T_FW 0b0100111 22 | #define T_FN 0b0100111 23 | #endif 24 | -------------------------------------------------------------------------------- /instructions/xtheadvector/main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "config.h" 5 | 6 | #include 7 | #include 8 | 9 | typedef uint64_t u64; 10 | 11 | static u64 seed = 128; 12 | 13 | extern char const *benchmark_names; 14 | extern u64 benchmark_types; 15 | extern u64 (*benchmarks)(void); 16 | extern u64 run_bench(u64 (*bench)(void), u64 type, u64 vl, u64 seed); 17 | 18 | static int 19 | compare_u64(void const *a, void const *b) 20 | { 21 | return *(clock_t*)a - *(clock_t*)b; 22 | } 23 | 24 | void 25 | run_all_types(char const *name, u64 (*bench)(void), u64 types, u64 vl) { 26 | static u64 arr[RUNS]; 27 | 28 | printf("%s", name); 29 | // m1..m8, e8..e64 30 | for (u64 vtype = 0; vtype < 16; ++vtype) { 31 | 32 | if (!((1 << (vtype & 3)) & types) || 33 | !((1 << (vtype >> 2)) & (types >> 4))) { 34 | printf(""); 35 | continue; 36 | } 37 | 38 | for (u64 i = 0; i < RUNS; ++i) { 39 | arr[i] = run_bench(bench, vtype, vl, seed); 40 | seed = seed*7 + 13; 41 | } 42 | 43 | #if RUNS > 4 44 | qsort(arr, RUNS, sizeof *arr, compare_u64); 45 | u64 sum = 0, count = 0; 46 | for (u64 i = RUNS * 0.2f; i < RUNS * 0.8f; ++i, ++count) 47 | sum += arr[i]; 48 | #else 49 | u64 sum = 0, count = RUNS; 50 | for (u64 i = 0; i < RUNS; ++i) 51 | sum += arr[i]; 52 | #endif 53 | printf("%2.1f", sum * 1.0/(UNROLL*LOOP*count)); 54 | } 55 | puts(""); 56 | } 57 | 58 | int 59 | main(void) 60 | { 61 | 62 | int fd = open("/dev/urandom", O_RDONLY); 63 | read(fd, &seed, sizeof seed); 64 | close(fd); 65 | 66 | u64 vlarr[] = { 0, 1 }; 67 | for (u64 i = 0; i < 2; ++i) { 68 | puts(""); 69 | if (vlarr[i] != 0) 70 | printf("vl=%"PRIu64"\n\n", vlarr[i]); 71 | else 72 | puts("vl=VLMAX\n"); 73 | u64 (**it)(void) = &benchmarks; 74 | char const **name = &benchmark_names; 75 | u64 *types = &benchmark_types; 76 | while (*it) { 77 | run_all_types(*name, *it, *types, vlarr[i]); 78 | ++it; 79 | ++name; 80 | ++types; 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # configure 
`make run` to work with your setup 4 | 5 | # local execution 6 | #./$@ 7 | 8 | # using qemu 9 | qemu-riscv64-static -cpu rv64,b=on,v=on,vlen=256,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ 10 | #qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=256,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,zvfh=true $@ 11 | -------------------------------------------------------------------------------- /single/Makefile: -------------------------------------------------------------------------------- 1 | .POSIX: 2 | 3 | include ../config.mk 4 | 5 | EXECS=veclibm 6 | 7 | all: ${EXECS} 8 | 9 | veclibm: veclibm.c 10 | ${CC} ${CFLAGS} -o $@ $< ../thirdparty/veclibm/src/*.c -I ../thirdparty/veclibm/include -lm -Wno-unused -Wno-maybe-uninitialized 11 | 12 | clean: 13 | rm -f ${EXECS} 14 | 15 | -------------------------------------------------------------------------------- /single/veclibm.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #ifndef N 8 | #define N (1024*128) /* change me */ 9 | #endif 10 | 11 | static void 12 | rvvlm_sqrt(size_t x_len, const double *x, double *y) 13 | { 14 | for (size_t vl; x_len > 0; x_len -= vl, x += vl, y += vl) { 15 | vl = __riscv_vsetvl_e64m8(x_len); 16 | vfloat64m8_t v = __riscv_vle64_v_f64m8(x, vl); 17 | __riscv_vse64(y, __riscv_vfsqrt(v, vl), vl); 18 | } 19 | } 20 | 21 | #define APPLY(X) \ 22 | X(exp) X(exp2) X(expm1) X(log) X(log10) X(log2) X(log1p) \ 23 | X(sqrt) X(cbrt) \ 24 | X(sin) X(cos) X(tan) X(asin) X(acos) X(atan) \ 25 | X(sinh) X(cosh) X(tanh) X(asinh) X(acosh) X(atanh) \ 26 | X(erf) X(erfc) X(tgamma) X(lgamma) 27 | 28 | #define DECLARE(f) void rvvlm_##f(size_t x_len, const double *x, double *y); 29 | APPLY(DECLARE) 30 | 31 | #define DEFINE(f) \ 32 | static void lm_##f(size_t x_len, const double *x, double *y) { \ 33 | for (size_t i = 0; i < x_len; ++i) y[i] = f(x[i]); \ 34 | } 35 | APPLY(DEFINE) 36 | struct Func { 37 | void (*rvvlm)(size_t, const double*, double*); 38 | void (*lm)(size_t, const double*, double*); 39 | const char *name; 40 | }; 41 | 42 | struct Func funcs[] = { 43 | #define ENTRY(f) { rvvlm_##f, lm_##f, #f }, 44 | APPLY(ENTRY) 45 | }; 46 | 47 | typedef struct { uint64_t x, y, z; } URand; 48 | 49 | /* RomuDuoJr, see https://romu-random.org/ */ 50 | static inline uint64_t 51 | urand(URand *r) 52 | { 53 | #define ROTL(x,n) (((x) << (n)) | ((x) >> (8*sizeof(x) - (n)))) 54 | uint64_t xp = r->x, yp = r->y, zp = r->z; 55 | r->x = 15241094284759029579u * zp; 56 | r->y = ROTL(yp - xp, 12); 57 | r->z = ROTL(zp - yp, 44); 58 | return xp; 59 | } 60 | 61 | 62 | int 63 | main(void) 64 | { 65 | double *in = malloc(N*sizeof *in), *out = malloc(N*sizeof *out); 66 | URand r = {123, (uintptr_t)&in, (uintptr_t)&out}; 67 | 68 | for (size_t i = 0; i < N; ++i) 69 | in[i] = (urand(&r) >> (64 - 53)) * (1.0 / (1ull << 53)); 70 | 71 | for (size_t i = 0; i < sizeof funcs / sizeof *funcs; ++i) { 72 | size_t beg, end; 73 | struct Func f = funcs[i]; 74 | printf("%s libm: ", f.name); 75 | for (size_t i = 0; i < 3; ++i) { 76 | __asm__ volatile("fence.i"); 77 | __asm__ volatile ("csrr %0, cycle" : "=r"(beg)); 78 | f.lm(N, in, out); 79 | __asm__ volatile("fence.i"); 80 | __asm__ volatile ("csrr %0, cycle" : "=r"(end)); 81 | printf(" %f", ((double)N) / (end-beg)); 82 | } 83 | printf(" elements/cycle\n%s rvvlm:", f.name); 84 | for (size_t i = 0; i < 3; ++i) { 85 | __asm__ volatile("fence.i"); 86 | __asm__ volatile ("csrr %0, cycle" : "=r"(beg)); 87 | f.rvvlm(N, in, out); 88 | 
__asm__ volatile("fence.i"); 89 | __asm__ volatile ("csrr %0, cycle" : "=r"(end)); 90 | printf(" %f", ((double)N) / (end-beg)); 91 | } 92 | printf(" elements/cycle\n"); 93 | } 94 | free(in); 95 | free(out); 96 | return 0; 97 | } 98 | 99 | -------------------------------------------------------------------------------- /thirdparty/rvv-chacha-poly/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | I can accept your patches and contributions to this project with the 4 | following caveats from my employer: 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Community Guidelines 19 | 20 | Treat people with respect. 21 | -------------------------------------------------------------------------------- /thirdparty/rvv-chacha-poly/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /thirdparty/rvv-chacha-poly/README.md: -------------------------------------------------------------------------------- 1 | NOTE: code from https://github.com/edre/rvv-chacha-poly 2 | 3 | RISC-V vector extension implementation of chacha20 and poly1305 4 | cryptographic primitives. 5 | 6 | Chacha20 and poly1305 are simple to vectorize without specialized 7 | instructions. This project implements them in assembly, and verifies them 8 | against the BoringSSL C implementation. 
As expected the executed instruction 9 | count go down a lot, but I don't have real hardware to see if the runtime does 10 | too. 11 | 12 | This is not an officially supported Google product. 13 | 14 | This is a proof of concept crypto library. Those words should sound very scary 15 | together. Don't use this. 16 | -------------------------------------------------------------------------------- /thirdparty/rvv-chacha-poly/boring.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2014, Google Inc. 2 | * 3 | * Permission to use, copy, modify, and/or distribute this software for any 4 | * purpose with or without fee is hereby granted, provided that the above 5 | * copyright notice and this permission notice appear in all copies. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 10 | * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION 12 | * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 13 | * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ 14 | 15 | #include 16 | #include 17 | 18 | void boring_chacha20(uint8_t *out, const uint8_t *in, 19 | size_t in_len, const uint8_t key[32], 20 | const uint8_t nonce[12], uint32_t counter); 21 | 22 | typedef uint8_t poly1305_state[512]; 23 | 24 | void boring_poly1305_init(poly1305_state *state, 25 | const uint8_t key[32]); 26 | 27 | void boring_poly1305_update(poly1305_state *state, 28 | const uint8_t *in, size_t in_len); 29 | 30 | void boring_poly1305_finish(poly1305_state *state, 31 | uint8_t mac[16]); 32 | -------------------------------------------------------------------------------- /thirdparty/rvv-chacha-poly/main.c: -------------------------------------------------------------------------------- 1 | /* Copyright 2020 Google LLC 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License") ; 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * https://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
*/ 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "boring.h" 20 | 21 | void println_hex(uint8_t* data, int size) { 22 | while (size > 0) { 23 | printf("%02x", *data); 24 | data++; 25 | size--; 26 | } 27 | printf("\n"); 28 | } 29 | 30 | // TODO: test the vector doesn't write past the end 31 | // test function with multiple length inputs (optional printing) 32 | // test non-block sized lengths 33 | 34 | extern uint64_t instruction_counter(); 35 | 36 | const char* pass_str = "\x1b[32mPASS\x1b[0m"; 37 | const char* fail_str = "\x1b[31mFAIL\x1b[0m"; 38 | 39 | bool test_chacha(const uint8_t* data, size_t len, const uint8_t key[32], const uint8_t nonce[12], bool verbose) { 40 | extern void vector_chacha20(uint8_t *out, const uint8_t *in, 41 | size_t in_len, const uint8_t key[32], 42 | const uint8_t nonce[12], uint32_t counter); 43 | uint8_t* golden = malloc(len); 44 | memset(golden, 0, len); 45 | uint64_t start = instruction_counter(); 46 | boring_chacha20(golden, data, len, key, nonce, 0); 47 | uint64_t end = instruction_counter(); 48 | uint64_t boring_count = end - start; 49 | 50 | uint8_t* vector = malloc(len + 4); 51 | memset(vector, 0, len+4); 52 | start = instruction_counter(); 53 | vector_chacha20(vector, data, len, key, nonce, 0); 54 | end = instruction_counter(); 55 | 56 | bool pass = memcmp(golden, vector, len) == 0; 57 | 58 | if (verbose || !pass) { 59 | printf("golden: "); 60 | println_hex(golden, 32); 61 | printf("inst_count=%d, inst/byte=%.02f\n", boring_count, (float)(boring_count)/len); 62 | printf("vector: "); 63 | println_hex(vector, 32); 64 | printf("inst_count=%d, inst/byte=%.02f\n", end - start, (float)(end - start)/len); 65 | } 66 | 67 | uint32_t past_end = vector[len]; 68 | if (past_end != 0) { 69 | printf("vector wrote past end %08x\n", past_end); 70 | pass = false; 71 | } 72 | 73 | free(golden); 74 | free(vector); 75 | 76 | return pass; 77 | } 78 | 79 | void test_chachas(FILE* f) { 80 | int len = 1024 - 11; 81 | uint8_t* data = malloc(len); 82 | uint32_t rand = 1; 83 | for (int i = 0; i < len; i++) { 84 | rand *= 101; 85 | rand %= 16777213; // random prime 86 | data[i] = (uint8_t)(rand); 87 | } 88 | uint8_t key[32] = "Setec astronomy;too many secrets"; 89 | uint8_t nonce[12] = "BurnAfterUse"; 90 | int counter = 0; 91 | 92 | bool pass = test_chacha(data, len, key, nonce, true); 93 | 94 | if (pass) { 95 | for (int i = 1, len = 1; len < 1000; len += i++) { 96 | fread(key, 32, 1, f); 97 | fread(nonce, 12, 1, f); 98 | if (!test_chacha(data, len, key, nonce, false)) { 99 | printf("Failed with len=%d\n", len); 100 | pass = false; 101 | break; 102 | } 103 | } 104 | } 105 | 106 | if (pass) { 107 | printf("chacha %s\n", pass_str); 108 | } else { 109 | printf("chacha %s\n", fail_str); 110 | } 111 | } 112 | 113 | bool test_poly(const uint8_t* data, size_t len, const uint8_t key[32], bool verbose) { 114 | extern uint64_t vector_poly1305(const uint8_t* in, size_t len, 115 | const uint8_t key[32], uint8_t sig[16]); 116 | 117 | poly1305_state state; 118 | uint8_t *sig = malloc(16); // gets corrupted if I define it on the stack? 
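// The measurement below (and in test_chacha above) brackets each implementation
// with instruction_counter(), which reads the retired-instruction count via
// rdinstret (see vchacha.s); the printed inst/byte figures come from those
// deltas rather than from any wall-clock timing.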
119 | uint64_t start = instruction_counter(); 120 | boring_poly1305_init(&state, key); 121 | boring_poly1305_update(&state, data, len); 122 | boring_poly1305_finish(&state, sig); 123 | uint64_t end = instruction_counter(); 124 | uint64_t boring_count = end - start; 125 | 126 | uint8_t *sig2 = malloc(16); 127 | start = instruction_counter(); 128 | uint64_t mid = vector_poly1305(data, len, key, sig2); 129 | end = instruction_counter(); 130 | 131 | bool pass = memcmp(sig, sig2, 16) == 0; 132 | 133 | if (verbose || !pass) { 134 | printf("boring mac: "); 135 | println_hex(sig, 16); 136 | printf("inst_count=%d, inst/byte=%.02f\n", boring_count, (float)(boring_count)/len); 137 | printf("vector mac: "); 138 | println_hex(sig2, 16); 139 | printf("precomputation=%d, processing=%d, inst/byte=%.02f\n", 140 | mid - start, end - mid, (float)(end - mid)/len); 141 | } 142 | 143 | free(sig); 144 | free(sig2); 145 | return pass; 146 | } 147 | 148 | void test_polys(FILE* f) { 149 | const int big_len = 1024; 150 | uint8_t *zero = malloc(2000); 151 | uint8_t *max_bits = malloc(big_len); 152 | memset(max_bits, 0xff, big_len); 153 | const uint8_t one[32] = {1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; 154 | const uint8_t key[32] = {1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 255, 155 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; 156 | const uint8_t data[272] = "Setec astronomy;too many secrets"; 157 | bool pass = test_poly(max_bits, big_len, max_bits, true); 158 | 159 | if (!pass) 160 | goto end; 161 | 162 | // random test 163 | const int max_len = 1000; 164 | uint8_t *rand = malloc(max_len); 165 | for (int len = 16; len <= max_len; len += 16) { 166 | fread((uint8_t*)key, 32, 1, f); 167 | fread((uint8_t*)rand, len, 1, f); 168 | if (!test_poly(data, len, key, false)) { 169 | printf("failed random input len=%d\n", len); 170 | pass = false; 171 | break; 172 | } 173 | } 174 | free(rand); 175 | 176 | end: 177 | if (pass) { 178 | printf("poly %s\n", pass_str); 179 | } else { 180 | printf("poly %s\n", fail_str); 181 | } 182 | 183 | free(zero); 184 | free(max_bits); 185 | } 186 | 187 | int main(int argc, uint8_t *argv[]) { 188 | extern uint32_t vlmax_u32(); 189 | printf("VLMAX in blocks: %d\n", vlmax_u32()); 190 | FILE* rand = fopen("/dev/urandom", "r"); 191 | test_chachas(rand); 192 | printf("\n"); 193 | test_polys(rand); 194 | fclose(rand); 195 | } 196 | -------------------------------------------------------------------------------- /thirdparty/rvv-chacha-poly/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License") ; 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # https://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | # Dependencies to be installed and on the PATH: 18 | # https://github.com/riscv/riscv-gnu-toolchain 19 | # https://github.com/riscv/riscv-isa-sim 20 | # configure --prefix=$RISCV --with-varch=v512:e64 21 | # https://github.com/riscv/riscv-pk 22 | 23 | ISA=rv64gcv 24 | 25 | riscv64-unknown-elf-gcc -march=$ISA main.c boring.c vchacha.s vpoly.s -o main -O && 26 | spike --isa=$ISA `which pk` main 27 | -------------------------------------------------------------------------------- /thirdparty/rvv-chacha-poly/vchacha.s: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License") ; 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | .global instruction_counter 16 | .global vector_chacha20 17 | .global vlmax_u32 18 | 19 | instruction_counter: 20 | rdinstret a0 21 | ret 22 | 23 | vlmax_u32: 24 | vsetvli a0, x0, e32, m1, ta, ma 25 | ret 26 | 27 | 28 | # Cell-based implementation strategy: 29 | # v0-v15: Cell vectors. Each element is from a different block 30 | 31 | ## Function initialization 32 | # Using the same order as the boring chacha arguments: 33 | # a0 = uint8_t *out 34 | # a1 = uint8_t *in 35 | # a2 = size_t in_len 36 | # a3 = uint8_t key[32] 37 | # a4 = uint8_t nonce[12] 38 | # a5 = uint32_t counter 39 | vector_chacha20: 40 | # a2 = initial length in bytes 41 | # t3 = remaining 64-byte blocks to mix 42 | # t4 = remaining full blocks to read/write 43 | # (if t3 and t4 are different by one, there is a partial block to manually xor) 44 | # t1 = vl in 64-byte blocks 45 | srli t4, a2, 6 46 | addi t0, a2, 63 47 | srli t3, t0, 6 48 | encrypt_blocks: 49 | # initialize vector state 50 | vsetvli t1, t3, e32, m1, ta, ma 51 | # Load 128 bit constant 52 | li t0, 0x61707865 # "expa" little endian 53 | vmv.v.x v0, t0 54 | li t0, 0x3320646e # "nd 3" little endian 55 | vmv.v.x v1, t0 56 | li t0, 0x79622d32 # "2-by" little endian 57 | vmv.v.x v2, t0 58 | li t0, 0x6b206574 # "te k" little endian 59 | vmv.v.x v3, t0 60 | # Load key 61 | lw t0, 0(a3) 62 | vmv.v.x v4, t0 63 | lw t0, 4(a3) 64 | vmv.v.x v5, t0 65 | lw t0, 8(a3) 66 | vmv.v.x v6, t0 67 | lw t0, 12(a3) 68 | vmv.v.x v7, t0 69 | lw t0, 16(a3) 70 | vmv.v.x v8, t0 71 | lw t0, 20(a3) 72 | vmv.v.x v9, t0 73 | lw t0, 24(a3) 74 | vmv.v.x v10, t0 75 | lw t0, 28(a3) 76 | vmv.v.x v11, t0 77 | # Load counter, and increment for each element 78 | vid.v v12 79 | vadd.vx v12, v12, a5 80 | # Load nonce 81 | lw t0, 0(a4) 82 | vmv.v.x v13, t0 83 | lw t0, 4(a4) 84 | vmv.v.x v14, t0 85 | lw t0, 8(a4) 86 | vmv.v.x v15, t0 87 | 88 | li t2, 10 # loop counter 89 | round_loop: 90 | 91 | .macro vrotl a i r 92 | #if __riscv_zvbb 93 | vror.vi \a, \a, 32-\i 94 | #else 95 | vsll.vi v16, \a, \i 96 | vsrl.vi \a, \a, 32-\i 97 | vor.vv \a, \a, v16 98 | #endif 99 | .endm 100 | 101 | .macro quarterround a b c d 102 | # a += b; d ^= a; d <<<= 16; 103 | vadd.vv \a, \a, \b 104 | vxor.vv \d, \d, \a 105 | vrotl \d, 16, t6 106 | # c += d; b ^= c; b <<<= 12; 107 | vadd.vv \c, \c, \d 108 | 
vxor.vv \b, \b, \c 109 | vrotl \b, 12, t7 110 | # a += b; d ^= a; d <<<= 8; 111 | vadd.vv \a, \a, \b 112 | vxor.vv \d, \d, \a 113 | vrotl \d, 8, t8 114 | # c += d; b ^= c; b <<<= 7; 115 | vadd.vv \c, \c, \d 116 | vxor.vv \b, \b, \c 117 | vrotl \b, 7, t9 118 | .endm 119 | 120 | # Mix columns. 121 | quarterround v0, v4, v8, v12 122 | quarterround v1, v5, v9, v13 123 | quarterround v2, v6, v10, v14 124 | quarterround v3, v7, v11, v15 125 | # Mix diagonals. 126 | quarterround v0, v5, v10, v15 127 | quarterround v1, v6, v11, v12 128 | quarterround v2, v7, v8, v13 129 | quarterround v3, v4, v9, v14 130 | 131 | addi t2, t2, -1 132 | bnez t2, round_loop 133 | 134 | # Add in initial block values. 135 | # 128 bit constant 136 | li t0, 0x61707865 # "expa" little endian 137 | vadd.vx v0, v0, t0 138 | li t0, 0x3320646e # "nd 3" little endian 139 | vadd.vx v1, v1, t0 140 | li t0, 0x79622d32 # "2-by" little endian 141 | vadd.vx v2, v2, t0 142 | li t0, 0x6b206574 # "te k" little endian 143 | vadd.vx v3, v3, t0 144 | # Add key 145 | lw t0, 0(a3) 146 | vadd.vx v4, v4, t0 147 | lw t0, 4(a3) 148 | vadd.vx v5, v5, t0 149 | lw t0, 8(a3) 150 | vadd.vx v6, v6, t0 151 | lw t0, 12(a3) 152 | vadd.vx v7, v7, t0 153 | lw t0, 16(a3) 154 | vadd.vx v8, v8, t0 155 | lw t0, 20(a3) 156 | vadd.vx v9, v9, t0 157 | lw t0, 24(a3) 158 | vadd.vx v10, v10, t0 159 | lw t0, 28(a3) 160 | vadd.vx v11, v11, t0 161 | # Add counter 162 | vid.v v16 163 | vadd.vv v12, v12, v16 164 | vadd.vx v12, v12, a5 165 | # Load nonce 166 | lw t0, 0(a4) 167 | vadd.vx v13, v13, t0 168 | lw t0, 4(a4) 169 | vadd.vx v14, v14, t0 170 | lw t0, 8(a4) 171 | vadd.vx v15, v15, t0 172 | 173 | # load in vector lanes with two strided segment loads 174 | # in case this is the final block, reset vl to full blocks 175 | vsetvli t5, t4, e32, m1, ta, ma 176 | li t0, 64 177 | vlsseg8e32.v v16, (a1), t0 178 | add a1, a1, 32 179 | vlsseg8e32.v v24, (a1), t0 180 | add a1, a1, -32 181 | 182 | # xor in state 183 | vxor.vv v16, v16, v0 184 | vxor.vv v17, v17, v1 185 | vxor.vv v18, v18, v2 186 | vxor.vv v19, v19, v3 187 | vxor.vv v20, v20, v4 188 | vxor.vv v21, v21, v5 189 | vxor.vv v22, v22, v6 190 | vxor.vv v23, v23, v7 191 | vxor.vv v24, v24, v8 192 | vxor.vv v25, v25, v9 193 | vxor.vv v26, v26, v10 194 | vxor.vv v27, v27, v11 195 | vxor.vv v28, v28, v12 196 | vxor.vv v29, v29, v13 197 | vxor.vv v30, v30, v14 198 | vxor.vv v31, v31, v15 199 | 200 | # write back out with 2 strided segment stores 201 | vssseg8e32.v v16, (a0), t0 202 | add a0, a0, 32 203 | vssseg8e32.v v24, (a0), t0 204 | add a0, a0, -32 205 | 206 | # update counters/pointers 207 | slli t0, t5, 6 # current VL in bytes 208 | add a0, a0, t0 # advance output pointer 209 | add a1, a1, t0 # advance input pointer 210 | sub a2, a2, t0 # decrement remaining bytes 211 | sub t3, t3, t1 # decrement remaining blocks 212 | sub t4, t4, t1 # decrement remaining blocks 213 | # TODO: crash if counter overflows 214 | add a5, a5, t1 # increment counter 215 | 216 | # loop again if we have remaining blocks 217 | bne x0, t3, encrypt_blocks 218 | 219 | # we're done if there are no more remaining bytes from a partial block 220 | beq zero, a2, return 221 | 222 | # to get the remaining partial block, we transfer the nth element of 223 | # all the state vectors into contiguous stack memory with vsseg, then 224 | # read them with byte-granularity vl 225 | 226 | # reconstruct vl for all computed blocks 227 | add t0, t3, t1 228 | vsetvli t0, t0, e32, m1, ta, ma 229 | add t0, t0, -1 230 | 231 | #vse.v v4, (a0) 232 | #j return 233 | 234 | # 
use a masked vsseg instead of sliding everything down? 235 | # both options seem like they might touch a lot of vector state... 236 | vslidedown.vx v16, v0, t0 237 | vslidedown.vx v17, v1, t0 238 | vslidedown.vx v18, v2, t0 239 | vslidedown.vx v19, v3, t0 240 | vslidedown.vx v20, v4, t0 241 | vslidedown.vx v21, v5, t0 242 | vslidedown.vx v22, v6, t0 243 | vslidedown.vx v23, v7, t0 244 | vslidedown.vx v24, v8, t0 245 | vslidedown.vx v25, v9, t0 246 | vslidedown.vx v26, v10, t0 247 | vslidedown.vx v27, v11, t0 248 | vslidedown.vx v28, v12, t0 249 | vslidedown.vx v29, v13, t0 250 | vslidedown.vx v30, v14, t0 251 | vslidedown.vx v31, v15, t0 252 | li t0, 1 253 | vsetvli zero, t0, e32, m1, ta, ma 254 | addi t0, sp, -64 255 | addi t1, sp, -32 256 | vsseg8e32.v v16, (t0) 257 | vsseg8e32.v v24, (t1) 258 | 259 | vsetvli a2, a2, e8, m8, ta, ma 260 | vle8.v v0, (a1) 261 | vle8.v v8, (t0) 262 | vxor.vv v0, v0, v8 263 | vse8.v v0, (a0) 264 | 265 | 266 | return: 267 | ret 268 | -------------------------------------------------------------------------------- /thirdparty/rvv-rollback.S: -------------------------------------------------------------------------------- 1 | # rvv-rollback.S -- A minimal benchmarking library 2 | # Olaf Bernstein 3 | # Distributed under the MIT license, see license at the end of the file. 4 | # New versions available at https://gist.github.com/camel-cdr/cfd9ba2b8754b521edf4892fe19c7031 5 | # Conversions taken from https://github.com/RISCVtestbed/rvv-rollback 6 | 7 | .macro vle32.v a:vararg 8 | vlw.v \a 9 | .endm 10 | .macro vle16.v a:vararg 11 | vlh.v \a 12 | .endm 13 | .macro vle8.v a:vararg 14 | vlb.v \a 15 | .endm 16 | .macro vle32ff.v a:vararg 17 | vlwff.v \a 18 | .endm 19 | .macro vle16ff.v a:vararg 20 | vlhff.v \a 21 | .endm 22 | .macro vle8ff.v a:vararg 23 | vlbff.v \a 24 | .endm 25 | .macro vse32.v a:vararg 26 | vsw.v \a 27 | .endm 28 | .macro vse16.v a:vararg 29 | vsh.v \a 30 | .endm 31 | .macro vse8.v a:vararg 32 | vsb.v \a 33 | .endm 34 | .macro vluxei32.v a:vararg 35 | vlxw.v \a 36 | .endm 37 | .macro vluxei16.v a:vararg 38 | vlxh.v \a 39 | .endm 40 | .macro vluxei8.v a:vararg 41 | vlxb.v \a 42 | .endm 43 | .macro vsuxei32.v a:vararg 44 | vsuxw.v \a 45 | .endm 46 | .macro vsuxei16.v a:vararg 47 | vsuxh.v \a 48 | .endm 49 | .macro vsuxei8.v a:vararg 50 | vsuxb.v \a 51 | .endm 52 | .macro vlse32.v a:vararg 53 | vlsw.v \a 54 | .endm 55 | .macro vlse16.v a:vararg 56 | vlsh.v \a 57 | .endm 58 | .macro vlse8.v a:vararg 59 | vlsb.v \a 60 | .endm 61 | .macro vsse32.v a:vararg 62 | vssw.v \a 63 | .endm 64 | .macro vsse16.v a:vararg 65 | vssh.v \a 66 | .endm 67 | .macro vsse8.v a:vararg 68 | vssb.v \a 69 | .endm 70 | .macro vloxei32.v a:vararg 71 | vlxw.v \a 72 | .endm 73 | .macro vloxei16.v a:vararg 74 | vlxh.v \a 75 | .endm 76 | .macro vloxei8.v a:vararg 77 | vlxb.v \a 78 | .endm 79 | .macro vsoxei32.v a:vararg 80 | vsxw.v \a 81 | .endm 82 | .macro vsoxei16.v a:vararg 83 | vsxh.v \a 84 | .endm 85 | .macro vsoxei8.v a:vararg 86 | vsxb.v \a 87 | .endm 88 | .macro vfncvt.xu.f.w a:vararg 89 | vfncvt.xu.f.v \a 90 | .endm 91 | .macro vfncvt.x.f.w a:vararg 92 | vfncvt.x.f.v \a 93 | .endm 94 | .macro vfncvt.f.xu.w a:vararg 95 | vfncvt.f.xu.v \a 96 | .endm 97 | .macro vfncvt.f.x.w a:vararg 98 | vfncvt.f.x.v \a 99 | .endm 100 | .macro vfncvt.f.f.w a:vararg 101 | vfncvt.f.f.v \a 102 | .endm 103 | .macro vfredusum a:vararg 104 | vfredsum \a 105 | .endm 106 | .macro vfwredusum.vs a:vararg 107 | vfwredsum.vs \a 108 | .endm 109 | .macro vnclip.wv a:vararg 110 | vnclip.vv \a 111 | .endm 
112 | .macro vnclip.wx a:vararg 113 | vnclip.vx \a 114 | .endm 115 | .macro vnclip.wi a:vararg 116 | vnclip.vi \a 117 | .endm 118 | .macro vnclipu.wv a:vararg 119 | vnclipu.vv \a 120 | .endm 121 | .macro vnclipu.wx a:vararg 122 | vnclipu.vx \a 123 | .endm 124 | .macro vnclipu.wi a:vararg 125 | vnclipu.vi \a 126 | .endm 127 | .macro vnsra.wv a:vararg 128 | vnsra.vv \a 129 | .endm 130 | .macro vnsra.wx a:vararg 131 | vnsra.vx \a 132 | .endm 133 | .macro vnsra.wi a:vararg 134 | vnsra.vi \a 135 | .endm 136 | .macro vnsrl.wv a:vararg 137 | vnsrl.vv \a 138 | .endm 139 | .macro vnsrl.wx a:vararg 140 | vnsrl.vx \a 141 | .endm 142 | .macro vnsrl.wi a:vararg 143 | vnsrl.vi \a 144 | .endm 145 | .macro vmandn.mm a:vararg 146 | vmandnot.mm \a 147 | .endm 148 | .macro vmorn.mm a:vararg 149 | vmornot.mm \a 150 | .endm 151 | .macro vmmv.m a:vararg 152 | vmcpy.m \a 153 | .endm 154 | .macro vcpop.m a:vararg 155 | vmpopc.m \a 156 | .endm 157 | .macro vpop.m a:vararg 158 | vmpopc.m \a 159 | .endm 160 | .macro vfirst.m a:vararg 161 | vmfirst.m \a 162 | .endm 163 | 164 | .macro define_for_all_nf prefix suffix prefix2 suffix2 165 | .macro \prefix\()2\suffix a:vararg 166 | \prefix2\()2\suffix2 \a 167 | .endm 168 | .macro \prefix\()3\suffix a:vararg 169 | \prefix2\()3\suffix2 \a 170 | .endm 171 | .macro \prefix\()4\suffix a:vararg 172 | \prefix2\()4\suffix2 \a 173 | .endm 174 | .macro \prefix\()5\suffix a:vararg 175 | \prefix2\()5\suffix2 \a 176 | .endm 177 | .macro \prefix\()6\suffix a:vararg 178 | \prefix2\()6\suffix2 \a 179 | .endm 180 | .macro \prefix\()7\suffix a:vararg 181 | \prefix2\()7\suffix2 \a 182 | .endm 183 | .macro \prefix\()8\suffix a:vararg 184 | \prefix2\()8\suffix2 \a 185 | .endm 186 | .endm 187 | define_for_all_nf vlseg e8.v vlseg b.v 188 | define_for_all_nf vlseg e16.v vlseg h.v 189 | define_for_all_nf vlseg e32.v vlseg w.v 190 | 191 | define_for_all_nf vsseg e8.v vsseg b.v 192 | define_for_all_nf vsseg e16.v vsseg h.v 193 | define_for_all_nf vsseg e32.v vsseg w.v 194 | 195 | define_for_all_nf vlsseg e8.v vlsseg bu.v 196 | define_for_all_nf vlsseg e16.v vlsseg hu.v 197 | define_for_all_nf vlsseg e32.v vlsseg wu.v 198 | 199 | define_for_all_nf vssseg e8.v vssseg b.v 200 | define_for_all_nf vssseg e16.v vssseg h.v 201 | define_for_all_nf vssseg e32.v vssseg w.v 202 | 203 | define_for_all_nf vloxseg e8.v vlxseg b.v 204 | define_for_all_nf vloxseg e16.v vlxseg h.v 205 | define_for_all_nf vloxseg e32.v vlxseg w.v 206 | define_for_all_nf vluxseg e8.v vlxseg b.v 207 | define_for_all_nf vluxseg e16.v vlxseg h.v 208 | define_for_all_nf vluxseg e32.v vlxseg w.v 209 | 210 | define_for_all_nf vsoxseg e8.v vsxseg b.v 211 | define_for_all_nf vsoxseg e16.v vsxseg h.v 212 | define_for_all_nf vsoxseg e32.v vsxseg w.v 213 | define_for_all_nf vsuxseg e8.v vsxseg b.v 214 | define_for_all_nf vsuxseg e16.v vsxseg h.v 215 | define_for_all_nf vsuxseg e32.v vsxseg w.v 216 | 217 | 218 | .macro vsetvl0p7 rd, rs1, rs2, T=1, M=1 219 | vsetvl \rd, \rs1, \rs2 220 | .endm 221 | .macro vsetvli0p7 rd, rs1, e=e8, m=m1, T=1, M=1 222 | .ifc \m, mf2 223 | NOT SUPPORTED IN rvv0.7 224 | .endif 225 | .ifc \m, mf4 226 | NOT SUPPORTED IN rvv0.7 227 | .endif 228 | .ifc \m, mf8 229 | NOT SUPPORTED IN rvv0.7 230 | .endif 231 | vsetvli \rd, \rs1, \e, \m 232 | .endm 233 | 234 | #define vsetvl vsetvl0p7 235 | #define vsetvli vsetvli0p7 236 | 237 | 238 | 239 | # Copyright (c) 2023 Olaf Berstein 240 | # Permission is hereby granted, free of charge, to any person obtaining a copy 241 | # of this software and associated documentation files (the 
"Software"), to deal 242 | # in the Software without restriction, including without limitation the rights 243 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 244 | # copies of the Software, and to permit persons to whom the Software is 245 | # furnished to do so, subject to the following conditions: 246 | # The above copyright notice and this permission notice shall be included in 247 | # all copies or substantial portions of the Software. 248 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 249 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 250 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 251 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 252 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 253 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 254 | # SOFTWARE. 255 | 256 | -------------------------------------------------------------------------------- /vector-utf/16to8_gather.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | size_t utf16_to_utf8_scalar(uint16_t const *src, size_t count, char *dest); 4 | 5 | size_t 6 | utf16_to_utf8_rvv(uint16_t const *src, size_t count, char *dest) 7 | { 8 | size_t n = count; 9 | char *const destBeg = dest; 10 | size_t vl8m4 = __riscv_vsetvlmax_e8m4(); 11 | vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2(__riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4); 12 | 13 | for (size_t vl, vlOut; n > 0; ) { 14 | 15 | vl = __riscv_vsetvl_e16m2(n); 16 | 17 | vuint16m2_t v = __riscv_vle16_v_u16m2(src, vl); 18 | vbool8_t m234 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x80-1, vl); 19 | 20 | if (__riscv_vfirst_m_b8(m234,vl) < 0) { /* 1 byte utf8 */ 21 | vlOut = vl; 22 | __riscv_vse8_v_u8m1((uint8_t*)dest, __riscv_vncvt_x_x_w_u8m1(v, vlOut), vlOut); 23 | n -= vl, src += vl, dest += vlOut; 24 | continue; 25 | } 26 | 27 | vbool8_t m34 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x800-1, vl); 28 | 29 | if (__riscv_vfirst_m_b8(m34,vl) < 0) { /* 1/2 byte utf8 */ 30 | /* 0: [ aaa|aabbbbbb] 31 | * 1: [aabbbbbb| ] vsll 8 32 | * 2: [ | aaaaa] vsrl 6 33 | * 3: [00111111|00111111] 34 | * 4: [ bbbbbb|000aaaaa] (1|2)&3 35 | * 5: [11000000|11000000] 36 | * 6: [10bbbbbb|110aaaaa] 4|5 */ 37 | vuint16m2_t twoByte = 38 | __riscv_vand_vx_u16m2(__riscv_vor_vv_u16m2( 39 | __riscv_vsll_vx_u16m2(v, 8, vl), 40 | __riscv_vsrl_vx_u16m2(v, 6, vl), 41 | vl), 0b0011111100111111, vl); 42 | v = __riscv_vor_vx_u16m2_mu(m234, v, twoByte, 0b1000000011000000, vl); 43 | vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(v); 44 | 45 | /* Every high byte that is zero should be compressed 46 | * low bytes should never be compressed, so we set them 47 | * to all ones, and then create a non-zero bytes mask */ 48 | vbool4_t mcomp = __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2(__riscv_vor_vx_u16m2(v, 0xFF, vl)), 0, vl*2); 49 | vlOut = __riscv_vcpop_m_b4(mcomp, vl*2); 50 | 51 | vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl*2); 52 | __riscv_vse8_v_u8m2((uint8_t*)dest, vout, vlOut); 53 | 54 | n -= vl, src += vl, dest += vlOut; 55 | continue; 56 | } 57 | 58 | //vbool8_t sur = __riscv_vmsgtu_vx_u16m2_b8(v, 0xD800-1, vl); 59 | vbool8_t sur = __riscv_vmseq_vx_u16m2_b8(__riscv_vand_vx_u16m2(v, 0xF800, vl), 0xD800, vl); 60 | long first = __riscv_vfirst_m_b8(sur, vl); 61 | size_t tail = vl - first; 62 | vl = first < 0 ? 
vl : first; 63 | 64 | if (vl > 0) { /* 1/2/3 byte utf8 */ 65 | /* in: [aaaabbbb|bbcccccc] 66 | * v1: [0bcccccc| ] vsll 8 67 | * v1: [10cccccc| ] vsll 8 & 0b00111111 | 0b10000000 68 | * v2: [ |110bbbbb] vsrl 6 & 0b00111111 | 0b11000000 69 | * v2: [ |10bbbbbb] vsrl 6 & 0b00111111 | 0b10000000 70 | * v3: [ |1110aaaa] vsrl 12 | 0b11100000 71 | * 1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc] 72 | * 2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc] 73 | * 3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb] [10cccccc] 74 | */ 75 | vuint16m2_t v1, v2, v3, v12; 76 | v1 = __riscv_vor_vx_u16m2_mu(m234, v, __riscv_vand_vx_u16m2(v, 0b00111111, vl), 0b10000000, vl); 77 | v1 = __riscv_vsll_vx_u16m2(v1, 8, vl); 78 | 79 | v2 = __riscv_vor_vx_u16m2(__riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 6, vl), 0b00111111, vl), 0b10000000, vl); 80 | v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34,vl), v2, v2, 0b01000000, vl); 81 | v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 12, vl), 0b11100000, vl); 82 | v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl); 83 | 84 | vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1<<8, vl); 85 | vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl); 86 | vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123); 87 | 88 | vbool2_t mcomp = __riscv_vmor_mm_b2(m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl*4), vl*4); 89 | vlOut = __riscv_vcpop_m_b2(mcomp, vl*4); 90 | 91 | vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl*4); 92 | __riscv_vse8_v_u8m4((uint8_t*)dest, vout, vlOut); 93 | 94 | n -= vl, src += vl, dest += vlOut; 95 | } 96 | 97 | if (tail) while (n) { 98 | uint16_t word = *src; 99 | if((word & 0xFF80)==0) { 100 | break; 101 | } else if((word & 0xF800)==0) { 102 | break; 103 | } else if ((word & 0xF800) != 0xD800) { 104 | break; 105 | } else { 106 | // must be a surrogate pair 107 | if (n <= 1) return 0; 108 | uint16_t diff = word - 0xD800; 109 | if (diff > 0x3FF) return 0; 110 | uint16_t diff2 = src[1] - 0xDC00; 111 | if (diff2 > 0x3FF) return 0; 112 | 113 | uint32_t value = ((diff + 0x40) << 10) + diff2 ; 114 | // uint32_t value = (diff << 10) + diff2 + 0x10000; 115 | 116 | // will generate four UTF-8 bytes 117 | // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX 118 | *dest++ = (char)((value>>18) | 0b11110000); 119 | *dest++ = (char)(((value>>12) & 0b111111) | 0b10000000); 120 | *dest++ = (char)(((value>>6) & 0b111111) | 0b10000000); 121 | *dest++ = (char)((value & 0b111111) | 0b10000000); 122 | src += 2; 123 | n-=2; 124 | } 125 | } 126 | } 127 | 128 | return (size_t)(dest - destBeg); 129 | } 130 | 131 | -------------------------------------------------------------------------------- /vector-utf/8toN_gather.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | #if TO_16 5 | # define uintOut_t uint16_t 6 | # define utf8_to_utf32_scalar utf8_to_utf16_scalar 7 | # define utf8_to_utf32_rvv utf8_to_utf16_rvv 8 | #else 9 | # define uintOut_t uint32_t 10 | #endif 11 | 12 | size_t utf8_to_utf32_scalar(char const *src, size_t count, uintOut_t *dest); 13 | 14 | size_t 15 | utf8_to_utf32_rvv(char const *src, size_t count, uintOut_t *dest) 16 | { 17 | size_t tail = 3; 18 | if (count < tail) return utf8_to_utf32_scalar(src, count, dest); 19 | 20 | /* validate first three bytes */ 21 | { 22 | size_t idx = tail; 23 | while (idx < count && (src[idx] >> 6) == 0b10) 24 | ++idx; 25 | uintOut_t buf[10]; 26 | if (idx > tail + 3 || !utf8_to_utf32_scalar(src, idx, buf)) 27 | 
return 0; 28 | } 29 | 30 | size_t n = count - tail; 31 | uintOut_t *destBeg = dest; 32 | 33 | static const uint64_t err1m[] = { 0x0202020202020202, 0x4915012180808080 }; 34 | static const uint64_t err2m[] = { 0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB }; 35 | static const uint64_t err3m[] = { 0x0101010101010101, 0X01010101BABAAEE6 }; 36 | 37 | const vuint8m1_t err1tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2)); 38 | const vuint8m1_t err2tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2)); 39 | const vuint8m1_t err3tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2)); 40 | 41 | const size_t vl8m1 = __riscv_vsetvlmax_e8m1(); 42 | const size_t vl16m2 = __riscv_vsetvlmax_e16m2(); 43 | 44 | #if TO_16 45 | size_t vl8m2 = __riscv_vsetvlmax_e8m2(); 46 | const vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(__riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2); 47 | #endif 48 | 49 | for (size_t vl, vlOut; n > 0; n -= vl, src += vl, dest += vlOut) { 50 | 51 | vl = __riscv_vsetvl_e8m2(n); 52 | 53 | vuint8m2_t v0 = __riscv_vle8_v_u8m2((uint8_t const*)src, vl); 54 | uint64_t max = __riscv_vmv_x_s_u8m1_u8(__riscv_vredmaxu_vs_u8m2_u8m1(v0, __riscv_vmv_s_x_u8m1(0, vl), vl)); 55 | 56 | uint8_t next0 = src[vl+0]; 57 | uint8_t next1 = src[vl+1]; 58 | uint8_t next2 = src[vl+2]; 59 | 60 | /* fast path: ASCII */ 61 | if ((max|next0|next1|next2) < 0b10000000) { 62 | vlOut = vl; 63 | #if TO_16 64 | __riscv_vse16_v_u16m4(dest, __riscv_vzext_vf2_u16m4(v0, vlOut), vlOut); 65 | #else 66 | __riscv_vse32_v_u32m8(dest, __riscv_vzext_vf4_u32m8(v0, vlOut), vlOut); 67 | #endif 68 | continue; 69 | } 70 | 71 | /* see "Validating UTF-8 In Less Than One Instruction Per Byte" 72 | * https://arxiv.org/abs/2010.03090 */ 73 | vuint8m2_t v1 = __riscv_vslide1down_vx_u8m2(v0, next0, vl); 74 | vuint8m2_t v2 = __riscv_vslide1down_vx_u8m2(v1, next1, vl); 75 | vuint8m2_t v3 = __riscv_vslide1down_vx_u8m2(v2, next2, vl); 76 | 77 | vuint8m2_t s1 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2(__riscv_vreinterpret_v_u8m2_u16m2(v2), 4, vl16m2)); 78 | vuint8m2_t s3 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2(__riscv_vreinterpret_v_u8m2_u16m2(v3), 4, vl16m2)); 79 | 80 | vuint8m2_t idx2 = __riscv_vand_vx_u8m2(v2, 0xF, vl); 81 | vuint8m2_t idx1 = __riscv_vand_vx_u8m2(s1, 0xF, vl); 82 | vuint8m2_t idx3 = __riscv_vand_vx_u8m2(s3, 0xF, vl); 83 | 84 | #define VRGATHER_u8m1x2(tbl, idx) \ 85 | __riscv_vset_v_u8m1_u8m2(__riscv_vlmul_ext_v_u8m1_u8m2( \ 86 | __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 0), vl8m1)), 1, \ 87 | __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 1), vl8m1)); 88 | 89 | vuint8m2_t err1 = VRGATHER_u8m1x2(err1tbl, idx1); 90 | vuint8m2_t err2 = VRGATHER_u8m1x2(err2tbl, idx2); 91 | vuint8m2_t err3 = VRGATHER_u8m1x2(err3tbl, idx3); 92 | vint8m2_t errs = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vand_vv_u8m2(__riscv_vand_vv_u8m2(err1, err2, vl), err3, vl)); 93 | 94 | vbool4_t is_3 = __riscv_vmsgtu_vx_u8m2_b4(v1, 0b11100000-1, vl); 95 | vbool4_t is_4 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b11110000-1, vl); 96 | vbool4_t is_34 = __riscv_vmor_mm_b4(is_3, is_4, vl); 97 | vbool4_t err34 = __riscv_vmxor_mm_b4(is_34, __riscv_vmslt_vx_i8m2_b4(errs, 0, vl), vl); 98 | vbool4_t errm = __riscv_vmor_mm_b4(__riscv_vmsgt_vx_i8m2_b4(errs, 0, vl), err34, vl); 99 | if (__riscv_vfirst_m_b4(errm , vl) >= 0) 100 | return 0; 101 | 102 | /* decoding */ 103 | 104 | /* mask of non continuation bytes */ 105 | vbool4_t m = 
__riscv_vmsgt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v0), -65, vl); 106 | vlOut = __riscv_vcpop_m_b4(m, vl); 107 | 108 | /* extract first and second bytes */ 109 | vuint8m2_t b1 = __riscv_vcompress_vm_u8m2(v0, m, vl); 110 | vuint8m2_t b2 = __riscv_vcompress_vm_u8m2(v1, m, vl); 111 | 112 | /* fast path: one and two byte */ 113 | if (max < 0b11100000) { 114 | b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut); 115 | 116 | vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut); 117 | b1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut); 118 | 119 | vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(b1, __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1<<6, m1, vlOut), vlOut); 120 | b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut); 121 | #if TO_16 122 | __riscv_vse16_v_u16m4(dest, b12, vlOut); 123 | #else 124 | __riscv_vse32_v_u32m8(dest, __riscv_vzext_vf2_u32m8(b12, vlOut), vlOut); 125 | #endif 126 | continue; 127 | } 128 | 129 | /* fast path: one, two and three byte */ 130 | if (max < 0b11110000) { 131 | vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl); 132 | 133 | b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut); 134 | b3 = __riscv_vand_vx_u8m2(b3, 0b00111111, vlOut); 135 | 136 | vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut); 137 | vbool4_t m3 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b11011111, vlOut); 138 | 139 | vuint8m2_t t1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut); 140 | b1 = __riscv_vand_vx_u8m2_mu(m3, t1, b1, 15, vlOut); 141 | 142 | vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(b1, __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1<<6, m1, vlOut), vlOut); 143 | b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut); 144 | vuint16m4_t b123 = __riscv_vwaddu_wv_u16m4_mu(m3, b12, __riscv_vsll_vx_u16m4_mu(m3, b12, b12, 6, vlOut), b3, vlOut); 145 | #if TO_16 146 | __riscv_vse16_v_u16m4(dest, b123, vlOut); 147 | #else 148 | __riscv_vse32_v_u32m8(dest, __riscv_vzext_vf2_u32m8(b123, vlOut), vlOut); 149 | #endif 150 | continue; 151 | } 152 | 153 | 154 | /* extract third and fourth bytes */ 155 | vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl); 156 | vuint8m2_t b4 = __riscv_vcompress_vm_u8m2(v3, m, vl); 157 | 158 | #define M1_COMMON(idx) \ 159 | vuint8m1_t c1 = __riscv_vget_v_u8m2_u8m1(b1, idx); \ 160 | vuint8m1_t c2 = __riscv_vget_v_u8m2_u8m1(b2, idx); \ 161 | vuint8m1_t c3 = __riscv_vget_v_u8m2_u8m1(b3, idx); \ 162 | vuint8m1_t c4 = __riscv_vget_v_u8m2_u8m1(b4, idx); \ 163 | /* remove prefix from trailing bytes */ \ 164 | c2 = __riscv_vand_vx_u8m1(c2, 0b00111111, vlOut); \ 165 | c3 = __riscv_vand_vx_u8m1(c3, 0b00111111, vlOut); \ 166 | c4 = __riscv_vand_vx_u8m1(c4, 0b00111111, vlOut); \ 167 | /* remove prefix from leading bytes 168 | * 169 | * We shift left and then right by the number of bytes in the prefix, 170 | * which can be calculated as follows: 171 | * x max(x-10, 0) 172 | * 0xxx -> 0000-0111 -> sift by 0 or 1 -> 0 173 | * 10xx -> 1000-1011 -> don't care 174 | * 110x -> 1100,1101 -> sift by 3 -> 2,3 175 | * 1110 -> 1110 -> sift by 4 -> 4 176 | * 1111 -> 1111 -> sift by 5 -> 5 177 | * 178 | * vssubu.vx v, 10, (max(x-10, 0)) almost gives us what we want, we 179 | * just need to manually detect and handle the one special case: 180 | */ \ 181 | vuint8m1_t shift = __riscv_vsrl_vx_u8m1(c1, 4, vlOut); \ 182 | shift = __riscv_vmerge_vxm_u8m1(__riscv_vssubu_vx_u8m1(shift, 10, vlOut), 3, __riscv_vmseq_vx_u8m1_b8(shift, 12, vlOut), vlOut); \ 183 | \ 184 | c1 = __riscv_vsll_vv_u8m1(c1, shift, vlOut); \ 185 | c1 = __riscv_vsrl_vv_u8m1(c1, shift, 
vlOut); \ 186 | /* unconditionally widen and combine to c1234 */ \ 187 | vuint16m2_t c34 = __riscv_vwaddu_wv_u16m2(__riscv_vwmulu_vx_u16m2(c3,1<<6, vlOut), c4, vlOut); \ 188 | vuint16m2_t c12 = __riscv_vwaddu_wv_u16m2(__riscv_vwmulu_vx_u16m2(c1,1<<6, vlOut), c2, vlOut); \ 189 | vuint32m4_t c1234 = __riscv_vwaddu_wv_u32m4(__riscv_vwmulu_vx_u32m4(c12, 1 << 12, vlOut), c34, vlOut); \ 190 | /* derive required right-shift amount from `shift` to reduce 191 | * c1234 to the required number of bytes */ \ 192 | c1234 = __riscv_vsrl_vv_u32m4(c1234, __riscv_vzext_vf4_u32m4( \ 193 | __riscv_vmul_vx_u8m1(__riscv_vrsub_vx_u8m1(__riscv_vssubu_vx_u8m1(shift, 2, vlOut), 3, vlOut), 6, vlOut), \ 194 | vlOut), vlOut); 195 | 196 | #define DOWN __riscv_vreinterpret_v_u32m4_u16m4 197 | #define UP __riscv_vreinterpret_v_u16m4_u32m4 198 | 199 | #if !TO_16 200 | #define M1_STORE \ 201 | size_t vlDest = vlOut; \ 202 | __riscv_vse32_v_u32m4(dest, c1234, vlDest); 203 | #else 204 | #define M1_STORE \ 205 | /* convert [000000000000aaaa|aaaaaabbbbbbbbbb] 206 | * to [110111bbbbbbbbbb|110110aaaaaaaaaa] */ \ 207 | vuint32m4_t sur = __riscv_vsub_vx_u32m4(c1234, 0x10000, vlOut); \ 208 | sur = __riscv_vor_vv_u32m4( \ 209 | __riscv_vsll_vx_u32m4(sur, 16, vlOut), \ 210 | __riscv_vsrl_vx_u32m4(sur, 10, vlOut), \ 211 | vlOut); \ 212 | sur = __riscv_vand_vx_u32m4(sur, 0x3FF03FF, vlOut); \ 213 | sur = __riscv_vor_vx_u32m4(sur, 0xDC00D800, vlOut); \ 214 | /* merge 1 byte c1234 and 2 byte sur */ \ 215 | vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(c1234, 0xFFFF, vlOut); \ 216 | c1234 = __riscv_vmerge_vvm_u32m4(c1234, sur, m4, vlOut); \ 217 | /* compress and store */ \ 218 | vbool4_t mOut = __riscv_vmor_mm_b4(__riscv_vmsne_vx_u16m4_b4(DOWN(c1234), 0, vlOut*2), m4even, vlOut*2); \ 219 | c1234 = UP(__riscv_vcompress_vm_u16m4(DOWN(c1234), mOut, vlOut*2)); \ 220 | size_t vlDest = __riscv_vcpop_m_b4(mOut, vlOut*2); \ 221 | __riscv_vse16_v_u16m4(dest, DOWN(c1234), vlDest); 222 | #endif 223 | 224 | /* Unrolling this manually reduces register pressure and allows 225 | * us to terminate early. */ 226 | { 227 | size_t vlOutm2 = vlOut; 228 | vlOut = __riscv_vsetvl_e8m1(vlOut < vl8m1 ? 
vlOut : vl8m1); 229 | M1_COMMON(0) 230 | M1_STORE 231 | if (vlOutm2 == vlOut) { 232 | vlOut = vlDest; 233 | continue; 234 | } 235 | 236 | dest += vlDest; 237 | vlOut = vlOutm2 - vlOut; 238 | } 239 | { 240 | M1_COMMON(1) 241 | M1_STORE 242 | vlOut = vlDest; 243 | } 244 | 245 | #undef M1_COMMON 246 | #undef M1_STORE 247 | #undef DOWN 248 | #undef UP 249 | } 250 | 251 | /* validate the last character and reparse it + tail */ 252 | if (count > tail) { 253 | if ((src[0] >> 6) == 0b10) 254 | --dest; 255 | while ((src[0] >> 6) == 0b10 && tail < count) 256 | --src, ++tail; 257 | #if TO_16 258 | /* go back one more, when on high surrogate */ 259 | if (dest[-1] >= 0xD800 && dest[-1] <= 0xDBFF) 260 | --dest; 261 | #endif 262 | } 263 | size_t ret = utf8_to_utf32_scalar(src, tail, dest); 264 | if (ret == 0) return 0; 265 | return (size_t)(dest - destBeg) + ret; 266 | } 267 | 268 | #undef uintOut_t 269 | #undef utf8_to_utf32_scalar 270 | #undef utf8_to_utf32_rvv 271 | 272 | -------------------------------------------------------------------------------- /vector-utf/Makefile: -------------------------------------------------------------------------------- 1 | .POSIX: 2 | 3 | include ../config.mk 4 | 5 | BENCHS=bench_8to32 bench_8to16 bench_16to8 6 | 7 | all: ${BENCHS} 8 | 9 | bench_8to16: bench.c 8toN_gather.c 10 | ${CC} ${CFLAGS} -DNAME=utf8_to_utf16 -DTO_16=1 8toN_gather.c bench.c -o $@ 11 | 12 | bench_8to32: bench.c 8toN_gather.c 13 | ${CC} ${CFLAGS} -DNAME=utf8_to_utf32 8toN_gather.c bench.c -o $@ 14 | 15 | bench_16to8: bench.c 16to8_gather.c 16 | ${CC} ${CFLAGS} -DNAME=utf16_to_utf8 16to8_gather.c bench.c -o $@ 17 | 18 | clean: 19 | rm -f ${BENCHS} 20 | 21 | 22 | -------------------------------------------------------------------------------- /vector-utf/bench.c: -------------------------------------------------------------------------------- 1 | #define NOLIBC_MAIN 2 | #include "../nolibc.h" 3 | #include "scalar.h" 4 | 5 | size_t utf8_to_utf16_rvv(char const *src, size_t n, uint16_t *dest); 6 | size_t utf8_to_utf32_rvv(char const *src, size_t n, uint32_t *dest); 7 | size_t utf16_to_utf8_rvv(const uint16_t *src, size_t count, char *dest); 8 | 9 | 10 | #define MAX_IN (1024*1024*4) 11 | static uint64_t in[MAX_IN]; 12 | static uint64_t out[MAX_IN * 4]; 13 | 14 | #define NUM_REPEATS 30000 15 | 16 | #define PCAT(a,b) a##b 17 | #define CAT(a,b) PCAT(a,b) 18 | #define RVV CAT(NAME, _rvv) 19 | #define SCALAR CAT(NAME, _scalar) 20 | 21 | #define SCALE_utf8_to_utf16 1 22 | #define SCALE_utf8_to_utf32 1 23 | #define SCALE_utf16_to_utf8 2 24 | #define SCALE CAT(SCALE_, NAME) 25 | 26 | int 27 | main(void) 28 | { 29 | size_t inSize = memread(in, sizeof in); 30 | if (inSize == 0) { 31 | print("No input provided, please pipe it into the program\n"); 32 | return 1; 33 | } 34 | for (size_t s = 1; s; ) { 35 | s = memread((uint8_t*)in + inSize, (sizeof in) - inSize); 36 | inSize += s; 37 | } 38 | 39 | uint64_t beg, end; 40 | 41 | beg = rv_cycles(); 42 | for (size_t j = 0; j < NUM_REPEATS; ++j) 43 | SCALAR((void*)in, inSize / SCALE, (void*)out); 44 | end = rv_cycles(); 45 | 46 | double scalar_bc = inSize*(double)NUM_REPEATS / (end - beg); 47 | 48 | beg = rv_cycles(); 49 | for (size_t j = 0; j < NUM_REPEATS; ++j) 50 | RVV((void*)in, inSize / SCALE, (void*)out); 51 | end = rv_cycles(); 52 | 53 | double rvv_bc = inSize*(double)NUM_REPEATS / (end - beg); 54 | 55 | print("scalar: ")(f,scalar_bc)(" b/c rvv: ")(f,rvv_bc)(" b/c speedup: ")(f,rvv_bc/scalar_bc)("x\n"); 56 | 57 | return 0; 58 | } 59 | 60 | 
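Aside on the leading-byte handling in 8toN_gather.c above: as its comment describes, the prefix length of the first byte is derived from the high nibble as max(nibble - 10, 0), with the 110x case forced to 3, and the prefix bits are then cleared by shifting left and right by that amount. A minimal standalone C sketch of the same idea (illustrative only, not part of the repository; the helper name is made up):

#include <assert.h>
#include <stdint.h>

/* Clear the UTF-8 prefix bits of a leading byte, mirroring the
 * vssubu/vmerge/vsll/vsrl sequence in 8toN_gather.c. */
static uint8_t strip_prefix(uint8_t lead)
{
	uint8_t nib = lead >> 4;
	/* 0xxxxxxx -> 0, 110xxxxx -> 3, 1110xxxx -> 4, 11110xxx -> 5 */
	uint8_t shift = nib == 12 ? 3 : (nib > 10 ? nib - 10 : 0);
	return (uint8_t)((uint8_t)(lead << shift) >> shift);
}

int main(void)
{
	assert(strip_prefix(0x41) == 0x41); /* ASCII: unchanged      */
	assert(strip_prefix(0xC3) == 0x03); /* 110xxxxx: 5 data bits */
	assert(strip_prefix(0xE2) == 0x02); /* 1110xxxx: 4 data bits */
	assert(strip_prefix(0xF0) == 0x00); /* 11110xxx: 3 data bits */
	return 0;
}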
-------------------------------------------------------------------------------- /vector-utf/rvv-0.7.1/8to16.S: -------------------------------------------------------------------------------- 1 | 2 | #ifndef __riscv_v 3 | # include "../../thirdparty/rvv-rollback.S" 4 | #endif 5 | .text 6 | .balign 8 7 | 8 | // Changes from original codegen (clang-18): 9 | // * rvv-rollback for direct translation 10 | // * vsetivli -> vsetvl 11 | // * vle64 -> vle32 12 | // * vmvNr -> vmv.v.v & 2 vsetvli 13 | // * vzext.vN -> N/2 vwaddu & N/2+1 vsetvli 14 | // * vredmax -> vfirst vmsgtu 15 | // the SG2042 I've got access to seems to produce the wrong result for 16 | // vredmax, so we need to replace it 17 | 18 | 19 | .global utf8_to_utf16_rvv 20 | utf8_to_utf16_rvv: 21 | li a4, 3 22 | mv a3, a1 23 | bgeu a1, a4, .LBB0_2 24 | mv a1, a3 25 | tail utf8_to_utf16_scalar 26 | .LBB0_2: 27 | addi sp, sp, -96 28 | sd ra, 88(sp) 29 | sd s0, 80(sp) 30 | sd s1, 72(sp) 31 | sd s2, 64(sp) 32 | sd s3, 56(sp) 33 | sd s4, 48(sp) 34 | sd s5, 40(sp) 35 | sd s6, 32(sp) 36 | sd s7, 24(sp) 37 | mv s2, a2 38 | bne a3, a4, .LBB0_4 39 | mv s1, a3 40 | li a1, 3 41 | j .LBB0_9 42 | .LBB0_4: 43 | li a1, 3 44 | li a2, 128 45 | .LBB0_5: 46 | add a4, a0, a1 47 | lbu a4, 0(a4) 48 | andi a4, a4, 192 49 | bne a4, a2, .LBB0_8 50 | addi a1, a1, 1 51 | bne a3, a1, .LBB0_5 52 | mv s1, a3 53 | mv a1, a3 54 | li a2, 6 55 | bgeu a2, a3, .LBB0_9 56 | j .LBB0_24 57 | .LBB0_8: 58 | mv s1, a3 59 | li a2, 6 60 | bltu a2, a1, .LBB0_24 61 | .LBB0_9: 62 | addi a2, sp, 4 63 | mv s0, a0 64 | call utf8_to_utf16_scalar 65 | mv a1, a0 66 | beqz a0, .LBB0_24 67 | mv a0, s0 68 | addi a1, s1, -3 69 | vsetvli a2, zero, e8, m2, ta, ma 70 | beqz a1, .LBB0_32 71 | mv t5, s1 72 | li a2, 4 73 | vsetvli zero, a2, e32, m1, ta, ma 74 | la a2, utf8_to_utf16_rvv.err1m 75 | vle32.v v10, (a2) 76 | la a2, utf8_to_utf16_rvv.err2m 77 | vle32.v v11, (a2) 78 | la a2, utf8_to_utf16_rvv.err3m 79 | vle32.v v12, (a2) 80 | vsetvli a2, zero, e8, m2, ta, ma 81 | vid.v v8 82 | vand.vi v8, v8, 1 83 | vmseq.vi v13, v8, 0 84 | li s7, 127 85 | li s6, -33 86 | li a7, -17 87 | li s1, 63 88 | li a6, -65 89 | li s5, 64 90 | li t0, 10 91 | lui t1, 1 92 | li t2, 2 93 | li t3, 6 94 | lui t4, 16 95 | lui a2, 16368 96 | addi t6, a2, 1023 97 | addi s3, t4, -1 98 | lui a2, 901134 99 | addi s4, a2, -2048 100 | mv s0, s2 101 | j .LBB0_14 102 | .LBB0_12: 103 | vsetvli zero, zero, e8, m2, ta, ma 104 | vwaddu.vx v16, v8, x0 105 | vsetvli zero, zero, e16, m4, ta, ma 106 | vse16.v v16, (s0) 107 | mv a5, a3 108 | .LBB0_13: 109 | sub a1, a1, a3 110 | slli a5, a5, 1 111 | add s0, s0, a5 112 | beqz a1, .LBB0_25 113 | .LBB0_14: 114 | vsetvli a3, a1, e8, m2, ta, ma 115 | vle8.v v8, (a0) 116 | vmsgtu.vx v14, v8, s7 117 | vfirst.m a2, v14 118 | add a0, a0, a3 119 | bltz a2, .LBB0_12 120 | lbu a2, 0(a0) 121 | lbu a4, 1(a0) 122 | lbu a5, 2(a0) 123 | vslide1down.vx v16, v8, a2 124 | vslide1down.vx v18, v16, a4 125 | vslide1down.vx v20, v18, a5 126 | vsetvli a2, zero, e16, m2, ta, ma 127 | vsrl.vi v14, v18, 4 128 | vsrl.vi v22, v20, 4 129 | vsetvli zero, a3, e8, m2, ta, ma 130 | vand.vi v24, v18, 15 131 | vand.vi v14, v14, 15 132 | vand.vi v22, v22, 15 133 | vsetvli a2, zero, e8, m1, ta, ma 134 | vrgather.vv v26, v10, v14 135 | vrgather.vv v27, v10, v15 136 | vrgather.vv v14, v11, v24 137 | vrgather.vv v15, v11, v25 138 | vrgather.vv v24, v12, v22 139 | vrgather.vv v25, v12, v23 140 | vsetvli zero, a3, e8, m2, ta, ma 141 | vand.vv v14, v26, v14 142 | vand.vv v14, v14, v24 143 | vmsgtu.vx v23, v16, s6 144 | vmsgtu.vx v22, 
v8, a7 145 | vmor.mm v23, v23, v22 146 | vmsgtu.vx v24, v14, s7 147 | vmxor.mm v23, v23, v24 148 | vmsgt.vi v24, v14, 0 149 | vmor.mm v14, v24, v23 150 | vfirst.m a2, v14 151 | bgez a2, .LBB0_24 152 | vsrl.vi v14, v8, 6 153 | vmsne.vi v23, v14, 2 154 | vcpop.m a5, v23 155 | vcompress.vm v14, v8, v23 156 | vmsgtu.vx v24, v8, s6 157 | vfirst.m a2, v24 158 | vcompress.vm v8, v16, v23 159 | bltz a2, .LBB0_20 160 | vfirst.m a2, v22 161 | vcompress.vm v16, v18, v23 162 | bltz a2, .LBB0_21 163 | vcompress.vm v18, v20, v23 164 | vsetvli a4, a5, e8, m1, ta, ma 165 | vand.vx v8, v8, s1 166 | vand.vx v16, v16, s1 167 | vand.vx v18, v18, s1 168 | vsrl.vi v20, v14, 4 169 | vmseq.vi v0, v20, 12 170 | vssubu.vx v20, v20, t0 171 | vmerge.vim v20, v20, 3, v0 172 | vsll.vv v14, v14, v20 173 | vsrl.vv v14, v14, v20 174 | vwmulu.vx v22, v16, s5 175 | vwaddu.wv v22, v22, v18 176 | vwmulu.vx v24, v14, s5 177 | vwaddu.wv v24, v24, v8 178 | vsetvli zero, zero, e16, m2, ta, ma 179 | vwmulu.vx v28, v24, t1 180 | vwaddu.wv v28, v28, v22 181 | vsetvli zero, zero, e8, m1, ta, ma 182 | vssubu.vx v8, v20, t2 183 | vrsub.vi v8, v8, 3 184 | vmul.vx v8, v8, t3 185 | vsetvli zero, zero, e8, m1, ta, ma 186 | vwaddu.vx v0, v8, x0 187 | vsetvli zero, zero, e16, m2, ta, ma 188 | vwaddu.vx v20, v0, x0 189 | vsetvli zero, zero, e32, m4, ta, mu 190 | vsrl.vv v20, v28, v20 191 | vsub.vx v24, v20, t4 192 | vsll.vi v28, v24, 16 193 | vsrl.vi v24, v24, 10 194 | vor.vv v24, v28, v24 195 | vmsgtu.vx v0, v20, s3 196 | vand.vx v24, v24, t6 197 | vor.vx v20, v24, s4, v0.t 198 | slli a2, a4, 1 199 | vsetvli zero, a2, e16, m4, ta, ma 200 | vmsne.vi v8, v20, 0 201 | vmor.mm v8, v8, v13 202 | vcompress.vm v24, v20, v8 203 | vcpop.m a2, v8 204 | vsetvli zero, a2, e16, m4, ta, ma 205 | vse16.v v24, (s0) 206 | bne a5, a4, .LBB0_22 207 | mv a5, a2 208 | j .LBB0_13 209 | .LBB0_20: 210 | vsetvli zero, a5, e8, m2, ta, mu 211 | vmsgtu.vx v0, v14, a6 212 | vand.vx v8, v8, s1 213 | vand.vx v14, v14, s1, v0.t 214 | vmv.v.i v16, 1 215 | vmerge.vxm v16, v16, s5, v0 216 | vwmulu.vv v20, v14, v16 217 | vwaddu.wv v20, v20, v8, v0.t 218 | vsetvli zero, a5, e16, m4, ta, ma 219 | j .LBB0_23 220 | .LBB0_21: 221 | vsetvli zero, a5, e8, m2, ta, mu 222 | vand.vx v18, v8, s1 223 | vand.vx v16, v16, s1 224 | vmsgtu.vx v8, v14, a6 225 | vmsgtu.vx v9, v14, s6 226 | vmv.v.v v20, v14 227 | vsetvli zero, zero, e8, m1, ta, ma 228 | vmv.v.v v0, v8 229 | vsetvli zero, a5, e8, m2, ta, mu 230 | vand.vx v20, v14, s1, v0.t 231 | vsetvli zero, zero, e8, m1, ta, ma 232 | vmv.v.v v0, v9 233 | vsetvli zero, a5, e8, m2, ta, mu 234 | vand.vi v20, v14, 15, v0.t 235 | vmv.v.i v14, 1 236 | vsetvli zero, zero, e8, m1, ta, ma 237 | vmv.v.v v0, v8 238 | vsetvli zero, a5, e8, m2, ta, mu 239 | vmerge.vxm v14, v14, s5, v0 240 | vwmulu.vv v24, v20, v14 241 | vwaddu.wv v24, v24, v18, v0.t 242 | vsetvli zero, zero, e16, m4, ta, mu 243 | vsetvli zero, zero, e8, m1, ta, ma 244 | vmv.v.v v0, v9 245 | vsetvli zero, a5, e16, m4, ta, mu 246 | vmv.v.v v20, v24 247 | vsll.vi v20, v24, 6, v0.t 248 | vsetvli zero, zero, e8, m2, ta, mu 249 | vwaddu.wv v24, v20, v16, v0.t 250 | vsetvli zero, zero, e16, m4, ta, mu 251 | vse16.v v24, (s0) 252 | j .LBB0_13 253 | .LBB0_22: 254 | sub a5, a5, a4 255 | slli a2, a2, 1 256 | add s0, s0, a2 257 | vsetvli zero, a5, e8, m1, ta, ma 258 | vand.vx v8, v9, s1 259 | vand.vx v9, v17, s1 260 | vand.vx v14, v19, s1 261 | vsrl.vi v16, v15, 4 262 | vmseq.vi v0, v16, 12 263 | vssubu.vx v16, v16, t0 264 | vmerge.vim v16, v16, 3, v0 265 | vsll.vv v15, v15, v16 266 | vsrl.vv v15, v15, 
v16 267 | vwmulu.vx v18, v9, s5 268 | vwaddu.wv v18, v18, v14 269 | vwmulu.vx v20, v15, s5 270 | vwaddu.wv v20, v20, v8 271 | vsetvli zero, zero, e16, m2, ta, ma 272 | vwmulu.vx v24, v20, t1 273 | vwaddu.wv v24, v24, v18 274 | vsetvli zero, zero, e8, m1, ta, ma 275 | vssubu.vx v8, v16, t2 276 | vrsub.vi v8, v8, 3 277 | vmul.vx v8, v8, t3 278 | vsetvli zero, zero, e32, m4, ta, mu 279 | vsetvli zero, zero, e8, m1, ta, ma 280 | vwaddu.vx v0, v8, x0 281 | vsetvli zero, zero, e16, m2, ta, ma 282 | vwaddu.vx v16, v0, x0 283 | vsetvli zero, zero, e32, m4, ta, mu 284 | vsrl.vv v16, v24, v16 285 | vsub.vx v20, v16, t4 286 | vsll.vi v24, v20, 16 287 | vsrl.vi v20, v20, 10 288 | vor.vv v20, v24, v20 289 | vmsgtu.vx v0, v16, s3 290 | vand.vx v20, v20, t6 291 | vor.vx v16, v20, s4, v0.t 292 | slli a5, a5, 1 293 | vsetvli zero, a5, e16, m4, ta, ma 294 | vmsne.vi v8, v16, 0 295 | vmor.mm v8, v8, v13 296 | vcompress.vm v20, v16, v8 297 | vcpop.m a5, v8 298 | vsetvli zero, a5, e16, m4, ta, ma 299 | .LBB0_23: 300 | vse16.v v20, (s0) 301 | j .LBB0_13 302 | .LBB0_24: 303 | li a0, 0 304 | j .LBB0_34 305 | .LBB0_25: 306 | li a1, 3 307 | beq t5, a1, .LBB0_33 308 | lbu a1, 0(a0) 309 | andi a3, a1, 192 310 | addi a1, a3, -128 311 | snez a1, a1 312 | addi a1, a1, -1 313 | andi a1, a1, -2 314 | add s0, s0, a1 315 | li a2, 128 316 | li a1, 3 317 | bne a3, a2, .LBB0_30 318 | li a1, 3 319 | .LBB0_28: 320 | lbu a3, -1(a0) 321 | addi a0, a0, -1 322 | andi a3, a3, 192 323 | addi a1, a1, 1 324 | bne a3, a2, .LBB0_30 325 | bltu a1, t5, .LBB0_28 326 | .LBB0_30: 327 | lhu a2, -2(s0) 328 | srli a2, a2, 10 329 | li a3, 54 330 | bne a2, a3, .LBB0_33 331 | addi s0, s0, -2 332 | j .LBB0_33 333 | .LBB0_32: 334 | li a1, 3 335 | mv s0, s2 336 | .LBB0_33: 337 | mv a2, s0 338 | call utf8_to_utf16_scalar 339 | seqz a1, a0 340 | sub a2, s0, s2 341 | srai a2, a2, 1 342 | add a0, a0, a2 343 | addi a1, a1, -1 344 | and a0, a0, a1 345 | .LBB0_34: 346 | ld ra, 88(sp) 347 | ld s0, 80(sp) 348 | ld s1, 72(sp) 349 | ld s2, 64(sp) 350 | ld s3, 56(sp) 351 | ld s4, 48(sp) 352 | ld s5, 40(sp) 353 | ld s6, 32(sp) 354 | ld s7, 24(sp) 355 | addi sp, sp, 96 356 | ret 357 | 358 | 359 | .data 360 | utf8_to_utf16_rvv.err1m: 361 | .quad 144680345676153346 362 | .quad 5266116582681116800 363 | 364 | utf8_to_utf16_rvv.err2m: 365 | .quad -3761689263670582297 366 | .quad -3761671395393942581 367 | 368 | utf8_to_utf16_rvv.err3m: 369 | .quad 72340172838076673 370 | .quad 72340175954030310 371 | 372 | -------------------------------------------------------------------------------- /vector-utf/rvv-0.7.1/8to32.S: -------------------------------------------------------------------------------- 1 | #ifndef __riscv_v 2 | # include "../../thirdparty/rvv-rollback.S" 3 | #endif 4 | .text 5 | .balign 8 6 | 7 | // Changes from original codegen (clang-18): 8 | // * rvv-rollback for direct translation 9 | // * vsetivli -> vsetvl 10 | // * vle64 -> vle32 11 | // * vmvNr -> vmv.v.v & 2 vsetvli 12 | // * vzext.vN -> N/2 vwaddu.vx & N/2+1 vsetvli 13 | // * vredmax -> vfirst vmsgtu 14 | // the SG2042 I've got access to seems to produce the wrong result for 15 | // vredmax, so we need to replace it 16 | 17 | .global utf8_to_utf32_rvv 18 | utf8_to_utf32_rvv: 19 | li a4, 3 20 | mv a3, a1 21 | bgeu a1, a4, .LBB0_2 22 | mv a1, a3 23 | tail utf8_to_utf32_scalar 24 | .LBB0_2: 25 | addi sp, sp, -80 26 | sd ra, 72(sp) 27 | sd s0, 64(sp) 28 | sd s1, 56(sp) 29 | sd s2, 48(sp) 30 | sd s3, 40(sp) 31 | mv s2, a2 32 | bne a3, a4, .LBB0_4 33 | mv s1, a3 34 | li a1, 3 35 | j .LBB0_9 36 | .LBB0_4: 
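// .LBB0_4/.LBB0_5 extend the 3-byte prologue past any continuation bytes
// ((byte & 192) == 128, i.e. 0b10xxxxxx), mirroring the "validate first
// three bytes" loop at the top of 8toN_gather.c.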
37 | li a1, 3 38 | li a2, 128 39 | .LBB0_5: 40 | add a4, a0, a1 41 | lbu a4, 0(a4) 42 | andi a4, a4, 192 43 | bne a4, a2, .LBB0_8 44 | addi a1, a1, 1 45 | bne a3, a1, .LBB0_5 46 | mv s1, a3 47 | mv a1, a3 48 | li a2, 6 49 | bgeu a2, a3, .LBB0_9 50 | j .LBB0_22 51 | .LBB0_8: 52 | mv s1, a3 53 | li a2, 6 54 | bltu a2, a1, .LBB0_22 55 | .LBB0_9: 56 | mv a2, sp 57 | mv s0, a0 58 | call utf8_to_utf32_scalar 59 | mv a1, a0 60 | beqz a0, .LBB0_22 61 | mv a0, s0 62 | addi a1, s1, -3 63 | vsetvli a2, zero, e16, m2, ta, ma 64 | beqz a1, .LBB0_28 65 | mv t4, s1 66 | li a2, 4 67 | vsetvli zero, a2, e32, m1, ta, ma 68 | la a2, utf8_to_utf32_rvv.err1m 69 | vle32.v v10, (a2) 70 | la a2, utf8_to_utf32_rvv.err2m 71 | vle32.v v11, (a2) 72 | la a2, utf8_to_utf32_rvv.err3m 73 | vle32.v v12, (a2) 74 | li s3, 127 75 | li t6, -33 76 | li a7, -17 77 | li s1, 63 78 | li a6, -65 79 | li t5, 64 80 | li t0, 10 81 | lui t1, 1 82 | li t2, 2 83 | li t3, 6 84 | mv s0, s2 85 | j .LBB0_14 86 | nop # needed for alignment, I think :3 87 | .LBB0_12: 88 | vsetvli zero, zero, e8, m2, ta, ma 89 | vwaddu.vx v0, v8, x0 90 | vsetvli zero, zero, e16, m4, ta, ma 91 | vwaddu.vx v16, v0, x0 92 | vsetvli zero, zero, e32, m8, ta, ma 93 | vse32.v v16, (s0) 94 | mv a5, a2 95 | .LBB0_13: 96 | sub a1, a1, a2 97 | slli a5, a5, 2 98 | add s0, s0, a5 99 | beqz a1, .LBB0_23 100 | .LBB0_14: 101 | vsetvli a2, a1, e8, m2, ta, ma 102 | vle8.v v8, (a0) 103 | vmsgtu.vx v13, v8, s3 104 | vfirst.m a4, v13 105 | add a0, a0, a2 106 | bltz a4, .LBB0_12 107 | lbu a4, 0(a0) 108 | lbu a5, 1(a0) 109 | lbu a3, 2(a0) 110 | vslide1down.vx v20, v8, a4 111 | vslide1down.vx v18, v20, a5 112 | vslide1down.vx v16, v18, a3 113 | vsetvli a3, zero, e16, m2, ta, ma 114 | vsrl.vi v14, v18, 4 115 | vsrl.vi v22, v16, 4 116 | vsetvli zero, a2, e8, m2, ta, ma 117 | vand.vi v24, v18, 15 118 | vand.vi v14, v14, 15 119 | vand.vi v22, v22, 15 120 | vsetvli a3, zero, e8, m1, ta, ma 121 | vrgather.vv v26, v10, v14 122 | vrgather.vv v27, v10, v15 123 | vrgather.vv v14, v11, v24 124 | vrgather.vv v15, v11, v25 125 | vrgather.vv v24, v12, v22 126 | vrgather.vv v25, v12, v23 127 | vsetvli zero, a2, e8, m2, ta, ma 128 | vand.vv v14, v26, v14 129 | vand.vv v14, v14, v24 130 | vmsgtu.vx v22, v20, t6 131 | vmsgtu.vx v13, v8, a7 132 | vmor.mm v22, v22, v13 133 | vmsgtu.vx v23, v14, s3 134 | vmxor.mm v22, v22, v23 135 | vmsgt.vi v23, v14, 0 136 | vmor.mm v14, v23, v22 137 | vfirst.m a3, v14 138 | bgez a3, .LBB0_22 139 | vsrl.vi v14, v8, 6 140 | vmsne.vi v22, v14, 2 141 | vcpop.m a5, v22 142 | vcompress.vm v14, v8, v22 143 | vmsgtu.vx v23, v8, t6 144 | vfirst.m a3, v23 145 | vcompress.vm v8, v20, v22 146 | bltz a3, .LBB0_20 147 | vfirst.m a3, v13 148 | vcompress.vm v20, v18, v22 149 | bltz a3, .LBB0_21 150 | vcompress.vm v18, v16, v22 151 | vsetvli a4, a5, e8, m1, ta, ma 152 | vand.vx v8, v8, s1 153 | vand.vx v13, v20, s1 154 | vand.vx v16, v18, s1 155 | vsrl.vi v17, v14, 4 156 | vmseq.vi v0, v17, 12 157 | vssubu.vx v17, v17, t0 158 | vmerge.vim v17, v17, 3, v0 159 | vsll.vv v14, v14, v17 160 | vsrl.vv v14, v14, v17 161 | vwmulu.vx v22, v13, t5 162 | vwaddu.wv v22, v22, v16 163 | vwmulu.vx v24, v14, t5 164 | vwaddu.wv v24, v24, v8 165 | vsetvli zero, zero, e16, m2, ta, ma 166 | vwmulu.vx v28, v24, t1 167 | vwaddu.wv v28, v28, v22 168 | vsetvli zero, zero, e8, m1, ta, ma 169 | vssubu.vx v8, v17, t2 170 | vrsub.vi v8, v8, 3 171 | vmul.vx v8, v8, t3 172 | vsetvli zero, zero, e8, m1, ta, ma 173 | vwaddu.vx v0, v8, x0 174 | vsetvli zero, zero, e16, m2, ta, ma 175 | vwaddu.vx v24, v0, x0 176 | 
vsetvli zero, zero, e32, m4, ta, ma 177 | vsrl.vv v24, v28, v24 178 | vse32.v v24, (s0) 179 | beq a5, a4, .LBB0_13 180 | sub a5, a5, a4 181 | slli a4, a4, 2 182 | add s0, s0, a4 183 | vsetvli zero, a5, e8, m1, ta, ma 184 | vand.vx v8, v9, s1 185 | vand.vx v9, v21, s1 186 | vand.vx v13, v19, s1 187 | vsrl.vi v14, v15, 4 188 | vmseq.vi v0, v14, 12 189 | vssubu.vx v14, v14, t0 190 | vmerge.vim v14, v14, 3, v0 191 | vsll.vv v15, v15, v14 192 | vsrl.vv v15, v15, v14 193 | vwmulu.vx v16, v9, t5 194 | vwaddu.wv v16, v16, v13 195 | vwmulu.vx v18, v15, t5 196 | vwaddu.wv v18, v18, v8 197 | vsetvli zero, zero, e16, m2, ta, ma 198 | vwmulu.vx v20, v18, t1 199 | vwaddu.wv v20, v20, v16 200 | vsetvli zero, zero, e8, m1, ta, ma 201 | vssubu.vx v8, v14, t2 202 | vrsub.vi v8, v8, 3 203 | vmul.vx v8, v8, t3 204 | vsetvli zero, zero, e32, m4, ta, ma 205 | vsetvli zero, zero, e8, m1, ta, ma 206 | vwaddu.vx v0, v8, x0 207 | vsetvli zero, zero, e16, m2, ta, ma 208 | vwaddu.vx v16, v0, x0 209 | vsetvli zero, zero, e32, m4, ta, ma 210 | vsrl.vv v16, v20, v16 211 | vse32.v v16, (s0) 212 | j .LBB0_13 213 | .LBB0_20: 214 | vsetvli zero, a5, e8, m2, ta, mu 215 | vmsgtu.vx v0, v14, a6 216 | vand.vx v8, v8, s1 217 | vand.vx v14, v14, s1, v0.t 218 | vmv.v.i v16, 1 219 | vmerge.vxm v16, v16, t5, v0 220 | vwmulu.vv v20, v14, v16 221 | vwaddu.wv v20, v20, v8, v0.t 222 | vsetvli zero, zero, e16, m4, ta, ma 223 | vwaddu.vx v24, v20, x0 224 | vsetvli zero, zero, e32, m8, ta, ma 225 | vse32.v v24, (s0) 226 | j .LBB0_13 227 | .LBB0_21: 228 | vsetvli zero, a5, e8, m2, ta, ma 229 | vand.vx v22, v8, s1 230 | vand.vx v20, v20, s1 231 | vmsgtu.vx v8, v14, a6 232 | vmsgtu.vx v9, v14, t6 233 | vmv.v.v v24, v14 234 | vsetvli zero, zero, e8, m1, ta, ma 235 | vmv.v.v v0, v8 236 | vsetvli zero, a5, e8, m2, ta, mu 237 | vand.vx v24, v14, s1, v0.t 238 | vsetvli zero, zero, e8, m1, ta, ma 239 | vmv.v.v v0, v9 240 | vsetvli zero, a5, e8, m2, ta, mu 241 | vand.vi v24, v14, 15, v0.t 242 | vmv.v.i v14, 1 243 | vsetvli zero, zero, e8, m1, ta, ma 244 | vmv.v.v v0, v8 245 | vsetvli zero, a5, e8, m2, ta, mu 246 | vmerge.vxm v14, v14, t5, v0 247 | vwmulu.vv v16, v24, v14 248 | vwaddu.wv v16, v16, v22, v0.t 249 | vsetvli zero, zero, e8, m1, ta, ma 250 | vmv.v.v v0, v9 251 | vsetvli zero, a5, e16, m4, ta, mu 252 | vmv.v.v v24, v16 253 | vsll.vi v24, v16, 6, v0.t 254 | vsetvli zero, zero, e8, m2, ta, mu 255 | vwaddu.wv v16, v24, v20, v0.t 256 | vsetvli zero, zero, e16, m4, ta, ma 257 | vwaddu.vx v24, v16, x0 258 | vsetvli zero, zero, e32, m8, ta, ma 259 | vse32.v v24, (s0) 260 | j .LBB0_13 261 | .LBB0_22: 262 | li a0, 0 263 | j .LBB0_30 264 | .LBB0_23: 265 | li a1, 3 266 | beq t4, a1, .LBB0_29 267 | lbu a2, 0(a0) 268 | andi a3, a2, 192 269 | addi a2, a3, -128 270 | snez a2, a2 271 | slli a2, a2, 2 272 | add s0, s0, a2 273 | li a2, 128 274 | addi s0, s0, -4 275 | bne a3, a2, .LBB0_29 276 | li a1, 3 277 | .LBB0_26: 278 | lbu a3, -1(a0) 279 | addi a0, a0, -1 280 | andi a3, a3, 192 281 | addi a1, a1, 1 282 | bne a3, a2, .LBB0_29 283 | bltu a1, t4, .LBB0_26 284 | j .LBB0_29 285 | .LBB0_28: 286 | li a1, 3 287 | mv s0, s2 288 | .LBB0_29: 289 | mv a2, s0 290 | call utf8_to_utf32_scalar 291 | seqz a1, a0 292 | sub a2, s0, s2 293 | srai a2, a2, 2 294 | add a0, a0, a2 295 | addi a1, a1, -1 296 | and a0, a0, a1 297 | .LBB0_30: 298 | ld ra, 72(sp) 299 | ld s0, 64(sp) 300 | ld s1, 56(sp) 301 | ld s2, 48(sp) 302 | ld s3, 40(sp) 303 | addi sp, sp, 80 304 | ret 305 | 306 | .data 307 | utf8_to_utf32_rvv.err1m: 308 | .quad 144680345676153346 309 | .quad 
5266116582681116800 310 | utf8_to_utf32_rvv.err2m: 311 | .quad -3761689263670582297 312 | .quad -3761671395393942581 313 | utf8_to_utf32_rvv.err3m: 314 | .quad 72340172838076673 315 | .quad 72340175954030310 316 | 317 | -------------------------------------------------------------------------------- /vector-utf/scalar.h: -------------------------------------------------------------------------------- 1 | // code from https://github.com/simdutf/simdutf/tree/master/src/scalar 2 | 3 | // little endian 4 | size_t 5 | utf8_to_utf16_scalar(const char *buf, size_t len, uint16_t *utf16_output) 6 | { 7 | const uint8_t *data = (const uint8_t *)buf; 8 | size_t pos = 0; 9 | uint16_t *start = utf16_output; 10 | #if 1 11 | while (pos < len) { 12 | // try to convert the next block of 16 ASCII bytes 13 | if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii 14 | uint64_t v1; 15 | memcpy(&v1, data + pos, sizeof(uint64_t)); 16 | uint64_t v2; 17 | memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); 18 | uint64_t v = v1 | v2; 19 | if ((v & 0x8080808080808080) == 0) { 20 | size_t final_pos = pos + 16; 21 | while(pos < final_pos) { 22 | *utf16_output++ = (uint16_t)(buf[pos]); 23 | pos++; 24 | } 25 | continue; 26 | } 27 | } 28 | #else 29 | // only uses aligned load/stores 30 | size_t aligned = 8 - ((uintptr_t)data & 7); 31 | while (pos < len) { 32 | // try to convert the next block of 16 ASCII bytes 33 | if ((pos & 7) == aligned && pos + 16 <= len) { 34 | uintptr_t p = (uintptr_t)(data+pos) & ~7ull; // compiler hint 35 | uint64_t v1; 36 | memcpy(&v1, (const uint8_t*)p, sizeof(uint64_t)); 37 | uint64_t v2; 38 | memcpy(&v2, (const uint8_t*)p + sizeof(uint64_t), sizeof(uint64_t)); 39 | 40 | uint64_t v = v1 | v2; 41 | if ((v & 0x8080808080808080) == 0) { 42 | for (size_t i = 0; i < 16; ++i) 43 | *utf16_output++ = (uint16_t)buf[pos++]; 44 | continue; 45 | } 46 | } 47 | #endif 48 | 49 | 50 | uint8_t leading_byte = data[pos]; // leading byte 51 | if (leading_byte < 0b10000000) { 52 | // converting one ASCII byte !!! 53 | *utf16_output++ = (uint16_t)leading_byte; 54 | pos++; 55 | } else if ((leading_byte & 0b11100000) == 0b11000000) { 56 | // We have a two-byte UTF-8, it should become 57 | // a single UTF-16 word. 58 | if(pos + 1 >= len) { return 0; } // minimal bound checking 59 | if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } 60 | // range check 61 | uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); 62 | if (code_point < 0x80 || 0x7ff < code_point) { return 0; } 63 | *utf16_output++ = (uint16_t)code_point; 64 | pos += 2; 65 | } else if ((leading_byte & 0b11110000) == 0b11100000) { 66 | // We have a three-byte UTF-8, it should become 67 | // a single UTF-16 word. 68 | if(pos + 2 >= len) { return 0; } // minimal bound checking 69 | 70 | if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } 71 | if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } 72 | // range check 73 | uint32_t code_point = (leading_byte & 0b00001111) << 12 | 74 | (data[pos + 1] & 0b00111111) << 6 | 75 | (data[pos + 2] & 0b00111111); 76 | if (code_point < 0x800 || 0xffff < code_point || 77 | (0xd7ff < code_point && code_point < 0xe000)) { 78 | return 0; 79 | } 80 | *utf16_output++ = (uint16_t)code_point; 81 | pos += 3; 82 | } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 83 | // we have a 4-byte UTF-8 word. 
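			// a code point above 0xFFFF does not fit in a single UTF-16 unit;
			// it is split into a surrogate pair below: code_point -= 0x10000,
			// then high = 0xD800 + (code_point >> 10) and
			// low = 0xDC00 + (code_point & 0x3FF)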
84 | if(pos + 3 >= len) { return 0; } // minimal bound checking 85 | if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } 86 | if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } 87 | if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; } 88 | 89 | // range check 90 | uint32_t code_point = 91 | (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | 92 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); 93 | if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; } 94 | code_point -= 0x10000; 95 | uint16_t high_surrogate = (uint16_t)(0xD800 + (code_point >> 10)); 96 | uint16_t low_surrogate = (uint16_t)(0xDC00 + (code_point & 0x3FF)); 97 | *utf16_output++ = (uint16_t)(high_surrogate); 98 | *utf16_output++ = (uint16_t)(low_surrogate); 99 | pos += 4; 100 | } else { 101 | return 0; 102 | } 103 | } 104 | return utf16_output - start; 105 | } 106 | 107 | size_t 108 | utf8_to_utf32_scalar(const char *buf, size_t len, uint32_t *utf32_output) 109 | { 110 | const uint8_t *data = (const uint8_t *)buf; 111 | size_t pos = 0; 112 | uint32_t* start = utf32_output; 113 | while (pos < len) { 114 | // try to convert the next block of 16 ASCII bytes 115 | if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii 116 | uint64_t v1; 117 | memcpy(&v1, data + pos, sizeof(uint64_t)); 118 | uint64_t v2; 119 | memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); 120 | uint64_t v = v1 | v2; 121 | if ((v & 0x8080808080808080) == 0) { 122 | size_t final_pos = pos + 16; 123 | while(pos < final_pos) { 124 | *utf32_output++ = (uint32_t)buf[pos]; 125 | pos++; 126 | } 127 | continue; 128 | } 129 | } 130 | uint8_t leading_byte = data[pos]; // leading byte 131 | if (leading_byte < 0b10000000) { 132 | // converting one ASCII byte !!! 133 | *utf32_output++ = (uint32_t)leading_byte; 134 | pos++; 135 | } else if ((leading_byte & 0b11100000) == 0b11000000) { 136 | // We have a two-byte UTF-8 137 | if(pos + 1 >= len) { return 0; } // minimal bound checking 138 | if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } 139 | // range check 140 | uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); 141 | if (code_point < 0x80 || 0x7ff < code_point) { return 0; } 142 | *utf32_output++ = (uint32_t)code_point; 143 | pos += 2; 144 | } else if ((leading_byte & 0b11110000) == 0b11100000) { 145 | // We have a three-byte UTF-8 146 | if (pos + 2 >= len) { return 0; } // minimal bound checking 147 | 148 | if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } 149 | if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } 150 | // range check 151 | uint32_t code_point = (leading_byte & 0b00001111) << 12 | 152 | (data[pos + 1] & 0b00111111) << 6 | 153 | (data[pos + 2] & 0b00111111); 154 | if (code_point < 0x800 || 0xffff < code_point || 155 | (0xd7ff < code_point && code_point < 0xe000)) { 156 | return 0; 157 | } 158 | *utf32_output++ = (uint32_t)code_point; 159 | pos += 3; 160 | } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 161 | // we have a 4-byte UTF-8 word. 
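			// unlike the UTF-16 path above, no surrogate split is needed here:
			// the decoded code point (up to 0x10FFFF) is stored directly as a
			// single 32-bit unit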
162 | if(pos + 3 >= len) { return 0; } // minimal bound checking 163 | if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } 164 | if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } 165 | if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; } 166 | 167 | // range check 168 | uint32_t code_point = 169 | (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | 170 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); 171 | if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; } 172 | *utf32_output++ = (uint32_t)code_point; 173 | pos += 4; 174 | } else { 175 | return 0; 176 | } 177 | } 178 | return utf32_output - start; 179 | } 180 | 181 | 182 | // little endian 183 | size_t 184 | utf16_to_utf8_scalar(const uint16_t *buf, size_t len, char *utf8_output) 185 | { 186 | const uint16_t *data = (const uint16_t *)buf; 187 | size_t pos = 0; 188 | char *start = utf8_output; 189 | while (pos < len) { 190 | // try to convert the next block of 8 bytes 191 | if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii 192 | uint64_t v; 193 | memcpy(&v, data + pos, sizeof(uint64_t)); 194 | if ((v & 0xFF80FF80FF80FF80) == 0) { 195 | size_t final_pos = pos + 4; 196 | while(pos < final_pos) { 197 | *utf8_output++ = (char)(buf[pos]); 198 | pos++; 199 | } 200 | continue; 201 | } 202 | } 203 | uint16_t word = data[pos]; 204 | if((word & 0xFF80)==0) { 205 | // will generate one UTF-8 bytes 206 | *utf8_output++ = (char)(word); 207 | pos++; 208 | } else if((word & 0xF800)==0) { 209 | // will generate two UTF-8 bytes 210 | // we have 0b110XXXXX 0b10XXXXXX 211 | *utf8_output++ = (char)((word>>6) | 0b11000000); 212 | *utf8_output++ = (char)((word & 0b111111) | 0b10000000); 213 | pos++; 214 | } else if((word &0xF800 ) != 0xD800) { 215 | // will generate three UTF-8 bytes 216 | // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX 217 | *utf8_output++ = (char)((word>>12) | 0b11100000); 218 | *utf8_output++ = (char)(((word>>6) & 0b111111) | 0b10000000); 219 | *utf8_output++ = (char)((word & 0b111111) | 0b10000000); 220 | pos++; 221 | } else { 222 | // must be a surrogate pair 223 | if(pos + 1 >= len) { return 0; } 224 | uint16_t diff = (uint16_t)(word - 0xD800); 225 | if(diff > 0x3FF) { return 0; } 226 | uint16_t next_word = data[pos + 1]; 227 | uint16_t diff2 = (uint16_t)(next_word - 0xDC00); 228 | if(diff2 > 0x3FF) { return 0; } 229 | uint32_t value = (diff << 10) + diff2 + 0x10000; 230 | // will generate four UTF-8 bytes 231 | // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX 232 | *utf8_output++ = (char)((value>>18) | 0b11110000); 233 | *utf8_output++ = (char)(((value>>12) & 0b111111) | 0b10000000); 234 | *utf8_output++ = (char)(((value>>6) & 0b111111) | 0b10000000); 235 | *utf8_output++ = (char)((value & 0b111111) | 0b10000000); 236 | pos += 2; 237 | } 238 | } 239 | return utf8_output - start; 240 | } 241 | 242 | 243 | size_t 244 | utf32_to_utf8_scalar(const uint32_t *buf, size_t len, char *utf8_output) 245 | { 246 | const uint32_t *data = (const uint32_t *)buf; 247 | size_t pos = 0; 248 | char *start = utf8_output; 249 | while (pos < len) { 250 | // try to convert the next block of 2 ASCII characters 251 | if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii 252 | uint64_t v; 253 | memcpy(&v, data + pos, sizeof(uint64_t)); 254 | if ((v & 0xFFFFFF80FFFFFF80) == 0) { 255 | *utf8_output++ = (char)(buf[pos]); 256 | *utf8_output++ = (char)(buf[pos+1]); 257 | pos += 2; 258 | continue; 259 | } 260 | } 
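		// no ASCII fast path hit: fall through to the general case, which picks
		// the UTF-8 length from the code point range (1 byte below 0x80, 2 bytes
		// below 0x800, 3 bytes below 0x10000 excluding surrogates, 4 bytes up to
		// 0x10FFFF)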
261 | uint32_t word = data[pos]; 262 | if((word & 0xFFFFFF80)==0) { 263 | // will generate one UTF-8 bytes 264 | *utf8_output++ = (char)(word); 265 | pos++; 266 | } else if((word & 0xFFFFF800)==0) { 267 | // will generate two UTF-8 bytes 268 | // we have 0b110XXXXX 0b10XXXXXX 269 | *utf8_output++ = (char)((word>>6) | 0b11000000); 270 | *utf8_output++ = (char)((word & 0b111111) | 0b10000000); 271 | pos++; 272 | } else if((word & 0xFFFF0000)==0) { 273 | // will generate three UTF-8 bytes 274 | // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX 275 | if (word >= 0xD800 && word <= 0xDFFF) { return 0; } 276 | *utf8_output++ = (char)((word>>12) | 0b11100000); 277 | *utf8_output++ = (char)(((word>>6) & 0b111111) | 0b10000000); 278 | *utf8_output++ = (char)((word & 0b111111) | 0b10000000); 279 | pos++; 280 | } else { 281 | // will generate four UTF-8 bytes 282 | // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX 283 | if (word > 0x10FFFF) { return 0; } 284 | *utf8_output++ = (char)((word>>18) | 0b11110000); 285 | *utf8_output++ = (char)(((word>>12) & 0b111111) | 0b10000000); 286 | *utf8_output++ = (char)(((word>>6) & 0b111111) | 0b10000000); 287 | *utf8_output++ = (char)((word & 0b111111) | 0b10000000); 288 | pos ++; 289 | } 290 | } 291 | return utf8_output - start; 292 | } 293 | 294 | // little endian 295 | size_t 296 | utf32_to_utf16_scalar(const uint32_t *buf, size_t len, uint16_t *utf16_output) 297 | { 298 | const uint32_t *data = (const uint32_t*)buf; 299 | size_t pos = 0; 300 | uint16_t *start = utf16_output; 301 | while (pos < len) { 302 | uint32_t word = data[pos]; 303 | if((word & 0xFFFF0000)==0) { 304 | if (word >= 0xD800 && word <= 0xDFFF) { return 0; } 305 | // will not generate a surrogate pair 306 | *utf16_output++ = word; 307 | } else { 308 | // will generate a surrogate pair 309 | if (word > 0x10FFFF) { return 0; } 310 | word -= 0x10000; 311 | uint16_t high_surrogate = 0xD800 + (word >> 10); 312 | uint16_t low_surrogate = 0xDC00 + (word & 0x3FF); 313 | *utf16_output++ = high_surrogate; 314 | *utf16_output++ = low_surrogate; 315 | } 316 | pos++; 317 | } 318 | return utf16_output - start; 319 | } 320 | -------------------------------------------------------------------------------- /vector-utf/simdutf.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | extern "C" { 6 | 7 | size_t 8 | utf8_to_utf16_rvv(char const *src, size_t count, uint16_t *dest) 9 | { 10 | return simdutf::convert_utf8_to_utf16le(src, count, (char16_t*)dest); 11 | } 12 | 13 | size_t 14 | utf8_to_utf32_rvv(char const *src, size_t count, uint32_t *dest) 15 | { 16 | return simdutf::convert_utf8_to_utf32(src, count, (char32_t*)dest); 17 | } 18 | 19 | size_t 20 | utf16_to_utf8_rvv(uint16_t const *src, size_t count, char *dest) 21 | { 22 | return simdutf::convert_utf16le_to_utf8((char16_t*)src, count, dest); 23 | } 24 | 25 | } 26 | 27 | -------------------------------------------------------------------------------- /vector-utf/tests/16to8.c: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | size_t utf16_to_utf8_rvv(const uint16_t *src, size_t count, char *dest); 4 | 5 | #define MAX_UTF32_CHARS (1024*16) 6 | static uint32_t utf32[MAX_UTF32_CHARS]; 7 | static uint8_t out[MAX_UTF32_CHARS*4]; 8 | static uint8_t golden[MAX_UTF32_CHARS*4]; 9 | static uint16_t in[MAX_UTF32_CHARS*2]; 10 | 11 | static void 12 | test(size_t length, size_t bitFlipCount) 13 | { 14 | size_t len32 = randu64() % 
length, origLen; 15 | origLen = len32; 16 | for (size_t i = 0; i < len32; ++i) { 17 | do utf32[i] = randu64() >> (64 - randu64() % 22); 18 | while (utf32[i] > 0x10FFFF || (utf32[i] >= 0xD800 && utf32[i] <= 0xDFFF)); 19 | } 20 | size_t lenIn = utf32_to_utf16_scalar(utf32, len32, in); 21 | 22 | if (lenIn) 23 | for (size_t i = 0; i < bitFlipCount; ++i) 24 | in[randu64() % lenIn] ^= 1 << (randu64() & (sizeof *in - 1)); 25 | 26 | size_t lenGolden = utf16_to_utf8_scalar(in, lenIn, (char*)golden); 27 | size_t lenOut = utf16_to_utf8_rvv(in, lenIn, (char*)out); 28 | 29 | if (lenGolden != lenOut) { 30 | print("ERROR: length mismatch, expected ")(u,lenGolden)(" got ")(u,lenOut)(" from ")(u,origLen); 31 | print("\nin: "); 32 | for (size_t i = 0; i < lenIn; ++i) 33 | print(b16,in[i])(" "); 34 | print("\nout: "); 35 | for (size_t i = 0; i < lenOut; ++i) 36 | print(b8,out[i])(" "); 37 | print("\ntar: "); 38 | for (size_t i = 0; i < lenGolden; ++i) 39 | print(b8,golden[i])(" "); 40 | print_flush();exit(0); 41 | return; 42 | } 43 | for (size_t i = 0; i < lenGolden; ++i) { 44 | if (golden[i] != out[i]) { 45 | print("ERROR: at ")(u,i)("/")(u,lenGolden)(" expected ")(u,golden[i])(" got ")(u,out[i])("\n"); 46 | print("\nin: "); 47 | for (size_t i = 0; i < lenIn; ++i) 48 | print(b16,in[i])(" "); 49 | print("\nout: "); 50 | for (size_t i = 0; i < lenOut; ++i) 51 | print(b8,out[i])(" "); 52 | print("\ntar: "); 53 | for (size_t i = 0; i < lenGolden; ++i) 54 | print(b8,golden[i])(" "); 55 | print_flush();exit(0); 56 | } 57 | } 58 | } 59 | 60 | int 61 | main(void) 62 | { 63 | randState.x ^= rv_cycles(); 64 | for (size_t i = 0; i < 10000000; ++i) { 65 | test(10, 2); 66 | test(10, 10); 67 | test(10, 100); 68 | test(100, 2); 69 | test(100, 10); 70 | test(100, 100); 71 | test(400, 2); 72 | test(400, 10); 73 | test(2000, 2); 74 | test(2000, 100); 75 | if ((i & 127) == 0) 76 | print("\r")(u,i)(" ")(flush,); 77 | } 78 | return 0; 79 | } 80 | 81 | -------------------------------------------------------------------------------- /vector-utf/tests/8to16.c: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | size_t utf8_to_utf16_rvv(char const *src, size_t n, uint16_t *dest); 4 | 5 | #define MAX_UTF32_CHARS (1024*16) 6 | static uint32_t utf32[MAX_UTF32_CHARS]; 7 | static uint16_t out[MAX_UTF32_CHARS*2]; 8 | static uint16_t golden[MAX_UTF32_CHARS*2]; 9 | static uint8_t in[MAX_UTF32_CHARS*4]; 10 | 11 | static void 12 | test(size_t length, size_t bitFlipCount) 13 | { 14 | size_t len32 = randu64() % length, origLen; 15 | origLen = len32; 16 | for (size_t i = 0; i < len32; ++i) { 17 | do utf32[i] = randu64() >> (64 - randu64() % 22); 18 | while (utf32[i] > 0x10FFFF || (utf32[i] >= 0xD800 && utf32[i] <= 0xDFFF)); 19 | } 20 | size_t lenIn = utf32_to_utf8_scalar(utf32, len32, (char*)in); 21 | 22 | if (lenIn) 23 | for (size_t i = 0; i < bitFlipCount; ++i) 24 | in[randu64() % lenIn] ^= 1 << (randu64() & (sizeof *in - 1)); 25 | 26 | size_t lenGolden = utf8_to_utf16_scalar((char*)in, lenIn, golden); 27 | size_t lenOut = utf8_to_utf16_rvv((char*)in, lenIn, out); 28 | 29 | if (lenGolden != lenOut) { 30 | print("ERROR: length mismatch, expected ")(u,lenGolden)(" got ")(u,lenOut)(" from ")(u,origLen)("\n"); 31 | for (size_t i = 0; i < lenIn; ++i) 32 | print(b8,in[i])(" "); 33 | print("\nin: "); 34 | for (size_t i = 0; i < lenIn; ++i) 35 | print(b8,in[i])(" "); 36 | print("\nout: "); 37 | for (size_t i = 0; i < lenOut; ++i) 38 | print(b16,out[i])(" "); 39 | print("\ntar: "); 40 | for 
(size_t i = 0; i < lenGolden; ++i) 41 | print(b16,golden[i])(" "); 42 | print_flush();exit(0); 43 | return; 44 | } 45 | for (size_t i = 0; i < lenGolden; ++i) { 46 | if (golden[i] != out[i]) { 47 | print("ERROR: at ")(u,i)("/")(u,lenGolden)(" expected ")(u,golden[i])(" got ")(u,out[i])("\n"); 48 | return; 49 | } 50 | } 51 | } 52 | 53 | int 54 | main(void) 55 | { 56 | randState.x ^= rv_cycles(); 57 | for (size_t i = 0; i < 10000000; ++i) { 58 | test(10, 2); 59 | test(10, 10); 60 | test(10, 100); 61 | test(100, 2); 62 | test(100, 10); 63 | test(100, 100); 64 | test(400, 2); 65 | test(400, 10); 66 | test(2000, 2); 67 | test(2000, 100); 68 | if ((i & 127) == 0) 69 | print("\r")(u,i)(" ")(flush,); 70 | } 71 | return 0; 72 | } 73 | 74 | -------------------------------------------------------------------------------- /vector-utf/tests/8to32.c: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | size_t utf8_to_utf32_rvv(char const *src, size_t n, uint32_t *dest); 4 | 5 | #define MAX_UTF32_CHARS (1024*16) 6 | static uint32_t utf32[MAX_UTF32_CHARS]; 7 | static uint32_t out[MAX_UTF32_CHARS]; 8 | static uint8_t in[MAX_UTF32_CHARS*4]; 9 | 10 | static void 11 | test(size_t maxlen32, size_t bitFlipCount) 12 | { 13 | size_t len32 = randu64() % maxlen32; 14 | for (size_t i = 0; i < len32; ++i) { 15 | do utf32[i] = randu64() >> (64 - randu64() % 22); 16 | while (utf32[i] > 0x10FFFF || (utf32[i] >= 0xD800 && utf32[i] <= 0xDFFF)); 17 | } 18 | size_t lenIn = utf32_to_utf8_scalar(utf32, len32, (char*)in); 19 | 20 | if (lenIn) 21 | for (size_t i = 0; i < bitFlipCount; ++i) 22 | in[randu64() % lenIn] ^= 1 << (sizeof *in - 1); 23 | 24 | if (bitFlipCount) 25 | len32 = utf8_to_utf32_scalar((char*)in, lenIn, utf32); 26 | size_t lenOut = utf8_to_utf32_rvv((char*)in, lenIn, out); 27 | 28 | if (len32 != lenOut) { 29 | print("ERROR: length mismatch, expected ")(u,len32)(" got ")(u,lenOut)("\n"); 30 | print("\nin: "); 31 | for (size_t i = 0; i < lenIn; ++i) 32 | print(b8,in[i])(" "); 33 | print("\nout: "); 34 | for (size_t i = 0; i < lenOut; ++i) 35 | print(b32,out[i])(" "); 36 | print("\ntar: "); 37 | for (size_t i = 0; i < len32; ++i) 38 | print(b32,utf32[i])(" "); 39 | print_flush();exit(0); 40 | return; 41 | } 42 | for (size_t i = 0; i < len32; ++i) { 43 | if (utf32[i] != out[i]) { 44 | print("ERROR: at ")(u,i)("/")(u,len32)(" expected ")(u,utf32[i])(" got ")(u,out[i])("\n"); 45 | return; 46 | } 47 | } 48 | } 49 | 50 | int 51 | main(void) 52 | { 53 | randState.x ^= rv_cycles(); 54 | for (size_t i = 0; i < 10000000; ++i) { 55 | test(10, 2); 56 | test(10, 10); 57 | test(10, 100); 58 | test(100, 2); 59 | test(100, 10); 60 | test(100, 100); 61 | test(400, 2); 62 | test(400, 10); 63 | test(2000, 2); 64 | test(2000, 100); 65 | if ((i & 127) == 0) 66 | print("\r")(u,i)(" ")(flush,); 67 | } 68 | return 0; 69 | 70 | } 71 | 72 | -------------------------------------------------------------------------------- /vector-utf/tests/Makefile: -------------------------------------------------------------------------------- 1 | .POSIX: 2 | 3 | include ../../config.mk 4 | 5 | TESTS=8to32 8to16 16to8 6 | 7 | all: ${TESTS} 8 | 9 | 8to16: 8to16.c common.h ../8toN_gather.c 10 | ${CC} ${CFLAGS} -DTO_16=1 -o 8to16 8to16.c ../8toN_gather.c 11 | 12 | 8to32: 8to32.c common.h ../8toN_gather.c 13 | ${CC} ${CFLAGS} -o 8to32 8to32.c ../8toN_gather.c 14 | 15 | 16to8: 16to8.c common.h ../16to8_gather.c 16 | ${CC} ${CFLAGS} -o 16to8 16to8.c ../16to8_gather.c 17 | 18 | clean: 19 | rm -f 
${TESTS} 20 | 21 | 22 | -------------------------------------------------------------------------------- /vector-utf/tests/common.h: -------------------------------------------------------------------------------- 1 | #define NOLIBC_MAIN 2 | #include "../../nolibc.h" 3 | #include "../scalar.h" 4 | 5 | static URand randState = { 123, 456, 789 }; 6 | 7 | static uint64_t randu64(void) { return urand(&randState); } 8 | 9 | static void 10 | print_b8(uint8_t val) 11 | { 12 | if (printEnd - printIt < 8) print_flush(); 13 | size_t n = 8; 14 | while (n--) *printIt++ = (val >> 7) + '0', val <<= 1; 15 | } 16 | 17 | static void 18 | print_b16(uint16_t val) 19 | { 20 | if (printEnd - printIt < 16) print_flush(); 21 | size_t n = 16; 22 | while (n--) *printIt++ = (val >> 15) + '0', val <<= 1; 23 | } 24 | 25 | static void 26 | print_b32(uint32_t val) 27 | { 28 | if (printEnd - printIt < 32) print_flush(); 29 | size_t n = 32; 30 | while (n--) *printIt++ = (val >> 31) + '0', val <<= 1; 31 | } 32 | 33 | --------------------------------------------------------------------------------
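A final note on the surrogate arithmetic used above: the scalar fallback in 16to8_gather.c folds the +0x10000 code-point offset into the high half by computing ((diff + 0x40) << 10) + diff2, which is equivalent to the (diff << 10) + diff2 + 0x10000 form used in scalar.h, since 0x40 << 10 == 0x10000. A minimal standalone check of that identity (illustrative only, not part of the repository):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* diff  = high surrogate - 0xD800, diff2 = low surrogate - 0xDC00,
	 * both in [0, 0x3FF] after the range checks in the converters */
	for (uint32_t diff = 0; diff <= 0x3FF; ++diff)
		for (uint32_t diff2 = 0; diff2 <= 0x3FF; ++diff2)
			assert(((diff + 0x40) << 10) + diff2 ==
			       (diff << 10) + diff2 + 0x10000);
	return 0;
}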