├── .github └── workflows │ ├── bench-config.h │ ├── rv32-config.mk │ ├── rv32-run.sh │ ├── rv64-config-hosted.mk │ ├── rv64-config.mk │ ├── rv64-run.sh │ └── validate-bench.yml ├── .gitmodules ├── LICENSE ├── README.md ├── bench ├── LUT4.S ├── LUT4.c ├── LUT6.S ├── LUT6.c ├── Makefile ├── ascii_to_utf16.S ├── ascii_to_utf16.c ├── ascii_to_utf32.S ├── ascii_to_utf32.c ├── base64_encode.S ├── base64_encode.c ├── bench.h ├── byteswap.S ├── byteswap.c ├── chacha20.S ├── chacha20.c ├── config.h ├── hist.S ├── hist.c ├── mandelbrot.S ├── mandelbrot.c ├── memcpy.S ├── memcpy.c ├── memset.S ├── memset.c ├── mergelines.S ├── mergelines.c ├── poly1305.S ├── poly1305.c ├── strlen.S ├── strlen.c ├── template.S ├── utf8_count.S └── utf8_count.c ├── config.mk ├── instructions ├── rvv │ ├── Makefile │ ├── config.h │ ├── gen.S │ └── main.c ├── scalar │ ├── Makefile │ ├── config.h │ ├── main.S │ └── main.c └── xtheadvector │ ├── Makefile │ ├── config.h │ ├── main.S │ └── main.c ├── nolibc.h ├── run.sh ├── single ├── Makefile └── veclibm.c ├── thirdparty ├── rvv-chacha-poly │ ├── CONTRIBUTING.md │ ├── LICENSE │ ├── README.md │ ├── boring.c │ ├── boring.h │ ├── main.c │ ├── test.sh │ ├── vchacha.s │ └── vpoly.s └── rvv-rollback.S └── vector-utf ├── 16to8_gather.c ├── 8toN_gather.c ├── Makefile ├── bench.c ├── rvv-0.7.1 ├── 8to16.S └── 8to32.S ├── scalar.h ├── simdutf.cpp └── tests ├── 16to8.c ├── 8to16.c ├── 8to32.c ├── Makefile └── common.h /.github/workflows/bench-config.h: -------------------------------------------------------------------------------- 1 | #define MAX_MEM (4096*8) 2 | #define NEXT(c) (c + c/3 + 3) 3 | #define VALIDATE 1 4 | #define MIN_REPEATS 2 5 | #define MAX_REPEATS 2 6 | 7 | #define STOP_CYCLES (1024*1024*500) 8 | #define SCALE_mandelbrot(N) ((N)/10) 9 | #define SCALE_mergelines(N) ((N)/10) 10 | #define mandelbrot_ITER 100 11 | -------------------------------------------------------------------------------- /.github/workflows/rv32-config.mk: -------------------------------------------------------------------------------- 1 | WARN=-Wall -Wextra -Wno-unused-function -Wno-unused-parameter 2 | CC=clang-17 3 | CFLAGS=--target=riscv32 -march=rv32gc_zve32f_zfh_zba_zbb_zbs -O3 ${WARN} -nostdlib -fno-builtin -nodefaultlibs -ffreestanding 4 | -------------------------------------------------------------------------------- /.github/workflows/rv32-run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # zfh=true,zvfh=true disabled for now 4 | qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ && \ 5 | qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=256,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ && \ 6 | qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=512,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ && \ 7 | qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=1024,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ &&\ 8 | qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=off,rvv_ma_all_1s=off $@ && \ 9 | qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=1024,rvv_ta_all_1s=off,rvv_ma_all_1s=off $@ 10 | 11 | -------------------------------------------------------------------------------- /.github/workflows/rv64-config-hosted.mk: -------------------------------------------------------------------------------- 1 | WARN=-Wall -Wextra -Wno-unused-function -Wno-unused-parameter 2 | CC=riscv64-linux-gnu-gcc 3 | CFLAGS=-march=rv64gcv_zfh_zba_zbb_zbs -O3 
${WARN} 4 | -------------------------------------------------------------------------------- /.github/workflows/rv64-config.mk: -------------------------------------------------------------------------------- 1 | WARN=-Wall -Wextra -Wno-unused-function -Wno-unused-parameter 2 | CC=clang-17 3 | CFLAGS=--target=riscv64 -march=rv64gcv_zfh_zba_zbb_zbs -O3 ${WARN} -nostdlib -fno-builtin -nodefaultlibs -ffreestanding 4 | -------------------------------------------------------------------------------- /.github/workflows/rv64-run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # zfh=true,zvfh=true disabled for now 4 | qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ && \ 5 | qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=256,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ && \ 6 | qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=512,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ && \ 7 | qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=1024,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ && \ 8 | qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=off,rvv_ma_all_1s=off $@ && \ 9 | qemu-riscv64-static -cpu rv64,v=on,vext_spec=v1.0,vlen=1024,rvv_ta_all_1s=off,rvv_ma_all_1s=off $@ 10 | -------------------------------------------------------------------------------- /.github/workflows/validate-bench.yml: -------------------------------------------------------------------------------- 1 | name: Validate bench 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | Tests: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Install packages 15 | run: | 16 | git submodule update --init --recursive 17 | sudo apt-get update -y 18 | sudo apt-get install -y make qemu-user-static clang-17 lld-17 gcc-riscv64-linux-gnu 19 | sed 's/zfh_zvfh//g' -i ./config.mk 20 | - name: Validate RV64 21 | run: | 22 | cp .github/workflows/rv64-config.mk ./config.mk 23 | cp .github/workflows/rv64-run.sh ./run.sh 24 | cp .github/workflows/bench-config.h ./bench/config.h 25 | make -C bench run -j$(nproc) 26 | make -C bench clean 27 | - name: Validate RV32 28 | run: | 29 | cp .github/workflows/rv32-config.mk ./config.mk 30 | cp .github/workflows/rv32-run.sh ./run.sh 31 | cp .github/workflows/bench-config.h ./bench/config.h 32 | make -C bench run -j$(nproc) 33 | make -C bench clean 34 | - name: Build freestanding 64-bit 35 | run: | 36 | cp .github/workflows/rv64-config.mk ./config.mk 37 | make -C bench -j$(nproc) 38 | make -C bench clean 39 | make -C instructions/rvv 40 | make -C instructions/rvv clean 41 | make -C instructions/scalar 42 | make -C instructions/scalar clean 43 | - name: Build freestanding 32-bit 44 | run: | 45 | cp .github/workflows/rv32-config.mk ./config.mk 46 | make -C bench -j$(nproc) 47 | make -C bench clean 48 | make -C instructions/rvv 49 | make -C instructions/rvv clean 50 | make -C instructions/scalar 51 | make -C instructions/scalar clean 52 | - name: Build hosted 64-bit 53 | run: | 54 | cp .github/workflows/rv64-config-hosted.mk ./config.mk 55 | make -C bench -j$(nproc) 56 | make -C bench clean 57 | make -C instructions/rvv 58 | make -C instructions/rvv clean 59 | make -C instructions/scalar 60 | make -C instructions/scalar clean 61 | sed '/CFLAGS/s/$/ -DUSE_PERF_EVENT/' -i ./config.mk 62 | make -C bench -j$(nproc) 63 | make -C bench clean 64 | make -C instructions/rvv 65 | make -C instructions/rvv clean 66 | 
make -C instructions/scalar 67 | make -C instructions/scalar clean 68 | sed 's/-DUSE_PERF_EVENT/-DUSE_PERF_EVENT_SLOW/' -i ./config.mk 69 | make -C bench -j$(nproc) 70 | make -C bench clean 71 | make -C instructions/rvv 72 | make -C instructions/rvv clean 73 | make -C instructions/scalar 74 | make -C instructions/scalar clean 75 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "thirdparty/veclibm"] 2 | path = thirdparty/veclibm 3 | url = https://github.com/rivosinc/veclibm 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Olaf Bernstein 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RISC-V Vector benchmark 2 | 3 | A collection of RISC-V Vector (RVV) benchmarks to help developers write portably performant RVV code. 4 | 5 | Benchmark results can be found at: https://camel-cdr.github.io/rvv-bench-results 6 | 7 | ## Benchmarks ([./bench/](./bench/)) 8 | 9 | Contains a set of benchmarks comparing different implementations of certain algorithms. 10 | 11 | ## Instruction cycle count ([./instructions/](./instructions/)) 12 | 13 | Measures the cycle count of RVV instructions by unrolling and looping over the given instruction repeatedly. 14 | 15 | ## Getting started 16 | 17 | Start by configuring [./config.mk](./config.mk), such that `make` works, and optionally [./run.sh](./run.sh), which allows you to compile and run using `make run`. 18 | 19 | The default configuration should work with all recent clang builds and doesn't require a full cross-compilation toolchain, because it builds in freestanding mode. 20 | This means it will only work on Linux, or a Linux-syscall-compatible OS. 21 | 22 | On recent Linux versions, the performance counters aren't exposed to user space by default; you may have to execute `echo 2 >/proc/sys/kernel/perf_user_access` and append `-DUSE_PERF_EVENT` to the `CFLAGS=...` line in [./config.mk](./config.mk) (if that doesn't work, try `-DUSE_PERF_EVENT_SLOW` instead). 
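For example, on a typical Linux setup the two steps might look like this (a sketch: it assumes root access via `sudo`, and reuses the `sed` invocation from the CI workflow in [.github/workflows/validate-bench.yml](.github/workflows/validate-bench.yml)):

```sh
# expose the performance counters to user space (needs root, resets on reboot)
echo 2 | sudo tee /proc/sys/kernel/perf_user_access

# append the define to the CFLAGS=... line in ./config.mk
sed '/CFLAGS/s/$/ -DUSE_PERF_EVENT/' -i ./config.mk   # or -DUSE_PERF_EVENT_SLOW
```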
23 | 24 | You can configure [./config.mk](./config.mk) to produce a hosted build, or configure it for your own toolchain: add the `-DCUSTOM_HOST` define and implement the unimplemented functions under `#ifdef CUSTOM_HOST` in [./nolibc.h](./nolibc.h). 25 | 26 | XTheadVector isn't supported anymore. 27 | 28 | ### Running benchmarks ([./bench/](./bench/)) 29 | 30 | To run the benchmarks, first look through ([./bench/config.h](./bench/config.h)) and adjust it to your processor (e.g. set `HAS_E64`). If it takes too long to execute, try lowering `MAX_MEM`, which is used to scale the benchmark, and play around with the other constants until it executes in a reasonable amount of time and gives a relatively smooth graph. 31 | 32 | Now you can run the benchmarks using `make run` in the ([./bench/](./bench/)) directory, or `make` to just build the executables. 33 | 34 | ### Measuring cycle count ([./instructions/](./instructions/)) 35 | 36 | To run the cycle count measurement, first configure [instructions/rvv/config.h](instructions/rvv/config.h) for your processor. 37 | 38 | Now you can run the measurement using `make run` in the ([./instructions/rvv/](./instructions/rvv/)) directory, or `make` to just build the executables. 39 | 40 | For XTheadVector, use the ([./instructions/xtheadvector/](./instructions/xtheadvector/)) directory instead. (This isn't maintained anymore.) 41 | 42 | ## Contributing 43 | 44 | Here are some suggestions for things that still need to be done. 45 | 46 | * contribute a measurement of a new CPU to: https://github.com/camel-cdr/rvv-bench-results \ 47 | You can just create an issue with a single JSON file that contains all concatenated [./bench/](./bench/) results (after proper setup, `make run > out.json` should do the trick). \ 48 | * implement non-memory-bound benchmarks 49 | * implement more benchmarks 50 | * better cycle count measurements: throughput vs latency (also: can we figure out the execution port configuration?) 51 | * cycle count for load/stores 52 | * cycle count for vsetvl 53 | 54 | ## License 55 | 56 | This repository is licensed under the MIT [LICENSE](LICENSE). 
57 | 58 | -------------------------------------------------------------------------------- /bench/LUT4.S: -------------------------------------------------------------------------------- 1 | #ifdef MX 2 | 3 | .global MX(LUT4_rvv_vloxei8_) 4 | MX(LUT4_rvv_vloxei8_): 5 | 1: 6 | vsetvli a3, a2, e8, MX(), ta, ma 7 | vle8.v v8, (a1) 8 | vand.vi v8, v8, 15 9 | vloxei8.v v8, (a0), v8 10 | vse8.v v8, (a1) 11 | sub a2, a2, a3 12 | add a1, a1, a3 13 | bnez a2, 1b 14 | ret 15 | 16 | .global MX(LUT4_rvv_vluxei8_) 17 | MX(LUT4_rvv_vluxei8_): 18 | 1: 19 | vsetvli a3, a2, e8, MX(), ta, ma 20 | vle8.v v8, (a1) 21 | vand.vi v8, v8, 15 22 | vluxei8.v v8, (a0), v8 23 | vse8.v v8, (a1) 24 | sub a2, a2, a3 25 | add a1, a1, a3 26 | bnez a2, 1b 27 | ret 28 | 29 | # a0 = lut, a1 = ptr, a2 = len 30 | .global MX(LUT4_rvv_gather_) 31 | MX(LUT4_rvv_gather_): 32 | li t0, 16 33 | vsetvli zero, t0, e8, m1, ta, ma 34 | vle8.v v0, (a0) 35 | 1: 36 | vsetvli a0, a2, e8, MX(), ta, ma 37 | vle8.v v8, (a1) 38 | vand.vi v8, v8, 15 39 | vrgather.vv v16, v0, v8 40 | vse8.v v16, (a1) 41 | sub a2, a2, a0 42 | add a1, a1, a0 43 | bnez a2, 1b 44 | ret 45 | #endif 46 | 47 | #if MX_N == 2 48 | 49 | .macro LUT4_rvv_m1_gathers n 50 | .global LUT4_rvv_m1_gathers_m\n 51 | LUT4_rvv_m1_gathers_m\n: 52 | li t0, 16 53 | vsetvli zero, t0, e8, m1, ta, ma 54 | vle8.v v0, (a0) 55 | 1: 56 | vsetvli a0, a2, e8, m\n, ta, ma 57 | vle8.v v8, (a1) 58 | vand.vi v8, v8, 15 59 | vsetvli t1, x0, e8, m1, ta, ma 60 | vrgather.vv v16, v0, v8 61 | .ifge \n-2 62 | vrgather.vv v17, v0, v9 63 | .ifge \n-4 64 | vrgather.vv v18, v0, v10 65 | vrgather.vv v19, v0, v11 66 | .ifge \n-8 67 | vrgather.vv v20, v0, v12 68 | vrgather.vv v21, v0, v13 69 | vrgather.vv v22, v0, v14 70 | vrgather.vv v23, v0, v15 71 | .endif 72 | .endif 73 | .endif 74 | vsetvli x0, a0, e8, m\n, ta, ma 75 | vse8.v v16, (a1) 76 | sub a2, a2, a0 77 | add a1, a1, a0 78 | bnez a2, 1b 79 | ret 80 | .endm 81 | 82 | LUT4_rvv_m1_gathers 2 83 | #endif 84 | #if MX_N == 4 85 | LUT4_rvv_m1_gathers 4 86 | #endif 87 | #if MX_N == 8 88 | LUT4_rvv_m1_gathers 8 89 | #endif 90 | 91 | 92 | -------------------------------------------------------------------------------- /bench/LUT4.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void 4 | LUT4_scalar(uint8_t lut[16], uint8_t *ptr, size_t n) 5 | { 6 | for (; n--; ++ptr) 7 | *ptr = lut[*ptr & 0xF], BENCH_CLOBBER(); 8 | } 9 | 10 | void 11 | LUT4_scalar_autovec(uint8_t lut[restrict 16], uint8_t *restrict ptr, size_t n) 12 | { 13 | for (; n--; ++ptr) 14 | *ptr = lut[*ptr & 0xF]; 15 | } 16 | 17 | 18 | #define IMPLS(f) \ 19 | f(scalar) \ 20 | f(scalar_autovec) \ 21 | MX(f, rvv_gather) \ 22 | f(rvv_m1_gathers_m2) \ 23 | f(rvv_m1_gathers_m4) \ 24 | f(rvv_m1_gathers_m8) \ 25 | MX(f, rvv_vluxei8) \ 26 | MX(f, rvv_vloxei8) \ 27 | 28 | typedef void Func(uint8_t lut[16], uint8_t *ptr, size_t n); 29 | 30 | #define DECLARE(f) extern Func LUT4_##f; 31 | IMPLS(DECLARE) 32 | 33 | #define EXTRACT(f) { #f, &LUT4_##f }, 34 | Impl impls[] = { IMPLS(EXTRACT) }; 35 | 36 | uint8_t *ptr; 37 | 38 | void init(void) { ptr = (uint8_t*)mem; } 39 | 40 | ux checksum(size_t n) { 41 | ux sum = 0; 42 | for (size_t i = 0; i < n; ++i) 43 | sum = uhash(sum) + ptr[i]; 44 | return sum; 45 | } 46 | 47 | BENCH_BEG(base) { 48 | static uint32_t lut[4] = { 0x4564907f, 0xb8ce2de0, 0xc0f7adf8, 0xa048aa9f }; 49 | bench_memrand(ptr, n * sizeof *ptr); 50 | TIME f((uint8_t*)lut, ptr, n); 51 | } BENCH_END 52 | 53 | Bench benches[] = { 54 | BENCH( impls, 
MAX_MEM, "LUT4", bench_base ) 55 | }; BENCH_MAIN(benches) 56 | 57 | -------------------------------------------------------------------------------- /bench/LUT6.S: -------------------------------------------------------------------------------- 1 | #if MX_N == 4 2 | 3 | .global LUT6_rvv_vloxei8_m4 4 | LUT6_rvv_vloxei8_m4: 5 | vsetvli t0, x0, e8, m4, ta, ma 6 | li t0, 63 7 | vmv.v.x v24, t0 8 | 1: 9 | vsetvli a3, a2, e8, m4, ta, ma 10 | vle8.v v8, (a1) 11 | vand.vv v8, v8, v24 12 | vloxei8.v v8, (a0), v8 13 | vse8.v v8, (a1) 14 | sub a2, a2, a3 15 | add a1, a1, a3 16 | bnez a2, 1b 17 | ret 18 | 19 | .global LUT6_rvv_vluxei8_m4 20 | LUT6_rvv_vluxei8_m4: 21 | vsetvli t0, x0, e8, m4, ta, ma 22 | li t0, 63 23 | vmv.v.x v24, t0 24 | 1: 25 | vsetvli a3, a2, e8, m4, ta, ma 26 | vle8.v v8, (a1) 27 | vand.vv v8, v8, v24 28 | vluxei8.v v8, (a0), v8 29 | vse8.v v8, (a1) 30 | sub a2, a2, a3 31 | add a1, a1, a3 32 | bnez a2, 1b 33 | ret 34 | 35 | # a0 = lut, a1 = ptr, a2 = len 36 | .global LUT6_rvv_gather_m4 37 | LUT6_rvv_gather_m4: 38 | li t0, 64 39 | vsetvli zero, t0, e8, m4, ta, ma 40 | vle8.v v0, (a0) 41 | 42 | vsetvli t0, x0, e8, m4, ta, ma 43 | li t0, 63 44 | vmv.v.x v24, t0 45 | 1: 46 | vsetvli a0, a2, e8, m4, ta, ma 47 | vle8.v v8, (a1) 48 | vand.vv v8, v8, v24 49 | vrgather.vv v16, v0, v8 50 | vse8.v v16, (a1) 51 | sub a2, a2, a0 52 | add a1, a1, a0 53 | bnez a2, 1b 54 | ret 55 | 56 | .global LUT6_rvv_m1m2m4_gathers_m4 57 | LUT6_rvv_m1m2m4_gathers_m4: 58 | li t0, 64 59 | vsetvli zero, t0, e8, m4, ta, ma 60 | vle8.v v0, (a0) 61 | 62 | vsetvli t0, x0, e8, m4, ta, ma 63 | li t0, 63 64 | vmv.v.x v24, t0 65 | 66 | csrr t0, vlenb 67 | srl t0, t0, 4 68 | sltiu t1, t0, 4 69 | sltiu t0, t0, 2 70 | j 0f 71 | 1: 72 | vsetvli t1, x0, e8, m1, ta, ma 73 | vrgather.vv v16, v0, v8 74 | vrgather.vv v17, v0, v9 75 | vrgather.vv v18, v0, v10 76 | vrgather.vv v19, v0, v11 77 | 8: 78 | vsetvli x0, a0, e8, m4, ta, ma 79 | vse8.v v16, (a1) 80 | sub a2, a2, a0 81 | add a1, a1, a0 82 | beqz a2, 9f 83 | 0: 84 | vsetvli a0, a2, e8, m4, ta, ma 85 | vle8.v v8, (a1) 86 | vand.vv v8, v8, v24 87 | beqz t1, 1b 88 | beqz t0, 2f 89 | vrgather.vv v16, v0, v8 90 | j 8b 91 | 2: 92 | vsetvli t1, x0, e8, m2, ta, ma 93 | vrgather.vv v16, v0, v8 94 | vrgather.vv v18, v0, v10 95 | j 8b 96 | 9: 97 | ret 98 | 99 | #endif 100 | 101 | 102 | -------------------------------------------------------------------------------- /bench/LUT6.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void 4 | LUT6_scalar(uint8_t lut[64], uint8_t *ptr, size_t n) 5 | { 6 | for (; n--; ++ptr) 7 | *ptr = lut[*ptr & 63], BENCH_CLOBBER(); 8 | } 9 | 10 | void 11 | LUT6_scalar_autovec(uint8_t lut[restrict 64], uint8_t *restrict ptr, size_t n) 12 | { 13 | for (; n--; ++ptr) 14 | *ptr = lut[*ptr & 63]; 15 | } 16 | 17 | 18 | #define IMPLS(f) \ 19 | f(scalar) \ 20 | f(scalar_autovec) \ 21 | f(rvv_gather_m4) \ 22 | f(rvv_m1m2m4_gathers_m4) \ 23 | f(rvv_vluxei8_m4) \ 24 | f(rvv_vloxei8_m4) \ 25 | 26 | typedef void Func(uint8_t lut[64], uint8_t *ptr, size_t n); 27 | 28 | #define DECLARE(f) extern Func LUT6_##f; 29 | IMPLS(DECLARE) 30 | 31 | #define EXTRACT(f) { #f, &LUT6_##f }, 32 | Impl impls[] = { IMPLS(EXTRACT) }; 33 | 34 | uint8_t *ptr; 35 | 36 | void init(void) { ptr = (uint8_t*)mem; } 37 | 38 | ux checksum(size_t n) { 39 | ux sum = 0; 40 | for (size_t i = 0; i < n; ++i) 41 | sum = uhash(sum) + ptr[i]; 42 | return sum; 43 | } 44 | 45 | BENCH_BEG(base) { 46 | static uint8_t lut[] = 47 | 
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" 48 | "abcdefghijklmnopqrstuvwxyz" 49 | "0123456789" 50 | "+/"; 51 | bench_memrand(ptr, n * sizeof *ptr); 52 | TIME f(lut, ptr, n); 53 | } BENCH_END 54 | 55 | Bench benches[] = { 56 | BENCH( impls, MAX_MEM, "LUT6", bench_base ) 57 | }; BENCH_MAIN(benches) 58 | 59 | -------------------------------------------------------------------------------- /bench/Makefile: -------------------------------------------------------------------------------- 1 | .POSIX: 2 | 3 | include ../config.mk 4 | 5 | EXECS=memcpy memset utf8_count strlen mergelines mandelbrot chacha20 poly1305 ascii_to_utf16 ascii_to_utf32 byteswap LUT4 LUT6 hist base64_encode 6 | 7 | all: ${EXECS} 8 | 9 | .c: $@.S template.S config.h bench.h 10 | ${CC} ${CFLAGS} -o $@ $< -DINC=$@.S template.S 11 | 12 | clean: 13 | rm -f ${EXECS} 14 | 15 | run: all 16 | for i in ${EXECS}; do ../run.sh ./$$i || { printf "\n\n\033[0;31mFAILED\033[0m\n\n"; exit 1; } ; done 17 | 18 | -------------------------------------------------------------------------------- /bench/ascii_to_utf16.S: -------------------------------------------------------------------------------- 1 | #ifdef MX 2 | 3 | #if MX_N == 4 || MX_N == 2 || MX_N == 1 4 | 5 | .global MX(ascii_to_utf16_rvv_vsseg_) 6 | MX(ascii_to_utf16_rvv_vsseg_): 7 | vsetvli t0, x0, e8, MX2(), ta, ma 8 | vmv.v.i v0, 0 9 | 1: 10 | vsetvli t0, a2, e8, MX(), ta, ma 11 | vle8.v v0, (a1) 12 | vsseg2e8.v v0, (a0) 13 | add a1, a1, t0 14 | sub a2, a2, t0 15 | slli t0, t0, 1 16 | add a0, a0, t0 17 | bnez a2, 1b 18 | ret 19 | 20 | 21 | 22 | .global MX(ascii_to_utf16_rvv_ext_) 23 | MX(ascii_to_utf16_rvv_ext_): 24 | 1: 25 | vsetvli t0, a2, e8, MX(), ta, ma 26 | vle8.v v0, (a1) 27 | vsetvli x0, x0, e16, MX2(), ta, ma 28 | vzext.vf2 v8, v0 29 | vse16.v v8, (a0) 30 | add a1, a1, t0 31 | sub a2, a2, t0 32 | slli t0, t0, 1 33 | add a0, a0, t0 34 | bnez a2, 1b 35 | ret 36 | 37 | 38 | .global MX(ascii_to_utf16_rvv_vss_) 39 | MX(ascii_to_utf16_rvv_vss_): 40 | vsetvli t0, x0, e8, MX2(), ta, ma 41 | vmv.v.i v0, 0 42 | li a3, 2 43 | 1: 44 | vsetvli t0, a2, e16, MX2(), ta, ma 45 | vse16.v v0, (a0) 46 | 47 | vsetvli t0, a2, e8, MX(), ta, ma 48 | vle8.v v8, (a1) 49 | vsse8.v v8, (a0), a3 50 | 51 | add a1, a1, t0 52 | sub a2, a2, t0 53 | slli t0, t0, 1 54 | add a0, a0, t0 55 | bnez a2, 1b 56 | ret 57 | 58 | #endif 59 | #endif 60 | 61 | -------------------------------------------------------------------------------- /bench/ascii_to_utf16.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void 4 | ascii_to_utf16_scalar(uint16_t *restrict dest, uint8_t const *restrict src, size_t len) 5 | { 6 | while (len--) BENCH_CLOBBER(), *dest++ = *src++; 7 | } 8 | 9 | void 10 | ascii_to_utf16_scalar_autovec(uint16_t *restrict dest, uint8_t const *restrict src, size_t len) 11 | { 12 | while (len--) *dest++ = *src++; 13 | } 14 | 15 | #define IMPLS(f) \ 16 | f(scalar) f(scalar_autovec) \ 17 | f(rvv_ext_m1) f(rvv_ext_m2) f(rvv_ext_m4) \ 18 | f(rvv_vsseg_m1) f(rvv_vsseg_m2) f(rvv_vsseg_m4) \ 19 | f(rvv_vss_m1) f(rvv_vss_m2) f(rvv_vss_m4) \ 20 | 21 | typedef void Func(uint16_t *restrict dest, uint8_t const *restrict src, size_t len); 22 | 23 | #define DECLARE(f) extern Func ascii_to_utf16_##f; 24 | IMPLS(DECLARE) 25 | 26 | #define EXTRACT(f) { #f, &ascii_to_utf16_##f }, 27 | Impl impls[] = { IMPLS(EXTRACT) }; 28 | 29 | uint16_t *dest; 30 | uint8_t *src; 31 | 32 | void init(void) { } 33 | 34 | ux checksum(size_t n) { 35 | ux sum = 0; 36 | for (size_t i = 0; i < n+9; 
++i) 37 | sum = uhash(sum) + dest[i]; 38 | return sum; 39 | } 40 | 41 | void common(size_t n, size_t dOff, size_t sOff) { 42 | dest = (uint16_t*)mem + dOff/2; 43 | src = (uint8_t*)(dest + 9 + MAX_MEM/3) + sOff; 44 | bench_memrand(src, n+9); 45 | for (size_t i = 0; i < n+9; ++i) src[i] |= 0x7F; 46 | memset(dest, 1, (n+9)*2); 47 | } 48 | 49 | BENCH_BEG(base) { 50 | common(n, bench_urand() & 255, bench_urand() & 255); 51 | TIME f(dest, src, n); 52 | } BENCH_END 53 | 54 | BENCH_BEG(aligned) { 55 | common(n, 0, 0); 56 | TIME f(dest, src, n); 57 | } BENCH_END 58 | 59 | Bench benches[] = { 60 | BENCH( impls, MAX_MEM/3 - 512-9*2, "ascii to utf16", bench_base ), 61 | BENCH( impls, MAX_MEM/3 - 512-9*2, "ascii to utf16 aligned", bench_aligned ), 62 | }; BENCH_MAIN(benches) 63 | 64 | -------------------------------------------------------------------------------- /bench/ascii_to_utf32.S: -------------------------------------------------------------------------------- 1 | #ifdef MX 2 | 3 | #if MX_N == 2 || MX_N == 1 4 | 5 | .global MX(ascii_to_utf32_rvv_vsseg_) 6 | MX(ascii_to_utf32_rvv_vsseg_): 7 | vsetvli t0, x0, e8, MX4(), ta, ma 8 | vmv.v.i v0, 0 9 | 1: 10 | vsetvli t0, a2, e8, MX(), ta, ma 11 | vle8.v v0, (a1) 12 | vsseg4e8.v v0, (a0) 13 | add a1, a1, t0 14 | sub a2, a2, t0 15 | slli t0, t0, 2 16 | add a0, a0, t0 17 | bnez a2, 1b 18 | ret 19 | 20 | 21 | .global MX(ascii_to_utf32_rvv_ext_) 22 | MX(ascii_to_utf32_rvv_ext_): 23 | 1: 24 | vsetvli t0, a2, e8, MX(), ta, ma 25 | vle8.v v0, (a1) 26 | vsetvli x0, x0, e32, MX4(), ta, ma 27 | vzext.vf4 v8, v0 28 | vse32.v v8, (a0) 29 | add a1, a1, t0 30 | sub a2, a2, t0 31 | slli t0, t0, 2 32 | add a0, a0, t0 33 | bnez a2, 1b 34 | ret 35 | 36 | 37 | .global MX(ascii_to_utf32_rvv_vss_) 38 | MX(ascii_to_utf32_rvv_vss_): 39 | vsetvli t0, x0, e8, MX4(), ta, ma 40 | vmv.v.i v0, 0 41 | li a3, 4 42 | 1: 43 | vsetvli t0, a2, e32, MX4(), ta, ma 44 | vse32.v v0, (a0) 45 | 46 | vsetvli t0, a2, e8, MX(), ta, ma 47 | vle8.v v8, (a1) 48 | vsse8.v v8, (a0), a3 49 | 50 | add a1, a1, t0 51 | sub a2, a2, t0 52 | slli t0, t0, 2 53 | add a0, a0, t0 54 | bnez a2, 1b 55 | ret 56 | 57 | #endif 58 | #endif 59 | 60 | -------------------------------------------------------------------------------- /bench/ascii_to_utf32.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void 4 | ascii_to_utf32_scalar(uint32_t *restrict dest, uint8_t const *restrict src, size_t len) 5 | { 6 | while (len--) BENCH_CLOBBER(), *dest++ = *src++; 7 | } 8 | 9 | void 10 | ascii_to_utf32_scalar_autovec(uint32_t *restrict dest, uint8_t const *restrict src, size_t len) 11 | { 12 | while (len--) *dest++ = *src++; 13 | } 14 | 15 | #define IMPLS(f) \ 16 | f(scalar) f(scalar_autovec) \ 17 | f(rvv_ext_m1) f(rvv_ext_m2) \ 18 | f(rvv_vsseg_m1) f(rvv_vsseg_m2) \ 19 | f(rvv_vss_m1) f(rvv_vss_m2) \ 20 | 21 | typedef void Func(uint32_t *restrict dest, uint8_t const *restrict src, size_t len); 22 | 23 | #define DECLARE(f) extern Func ascii_to_utf32_##f; 24 | IMPLS(DECLARE) 25 | 26 | #define EXTRACT(f) { #f, &ascii_to_utf32_##f }, 27 | Impl impls[] = { IMPLS(EXTRACT) }; 28 | 29 | uint32_t *dest; 30 | uint8_t *src; 31 | 32 | void init(void) { } 33 | 34 | ux checksum(size_t n) { 35 | ux sum = 0; 36 | for (size_t i = 0; i < n+9; ++i) 37 | sum = uhash(sum) + dest[i]; 38 | return sum; 39 | } 40 | 41 | void common(size_t n, size_t dOff, size_t sOff) { 42 | dest = (uint32_t*)mem + dOff/4; 43 | src = (uint8_t*)(dest + 9 + MAX_MEM/5) + sOff; 44 | bench_memrand(src, n+9); 
45 | for (size_t i = 0; i < n+9; ++i) src[i] |= 0x7F; 46 | memset(dest, 1, (n+9)*4); 47 | } 48 | 49 | BENCH_BEG(base) { 50 | common(n, bench_urand() & 255, bench_urand() & 255); 51 | TIME f(dest, src, n); 52 | } BENCH_END 53 | 54 | BENCH_BEG(aligned) { 55 | common(n, 0, 0); 56 | TIME f(dest, src, n); 57 | } BENCH_END 58 | 59 | Bench benches[] = { 60 | BENCH( impls, MAX_MEM/5 - 512-9*2, "ascii to utf32", bench_base ), 61 | BENCH( impls, MAX_MEM/5 - 512-9*2, "ascii to utf32 aligned", bench_aligned ), 62 | }; BENCH_MAIN(benches) 63 | 64 | -------------------------------------------------------------------------------- /bench/base64_encode.S: -------------------------------------------------------------------------------- 1 | // Code generated using clang-20 from: 2 | // https://github.com/camel-cdr/rvv-playground/blob/main/base64-encode.c 3 | // which was slighly modified to remove a GPR spill: 4 | // https://godbolt.org/z/vqYMv4r9c 5 | 6 | #if MX_N == 4 7 | 8 | .global b64_encode_rvv_LUT64 9 | b64_encode_rvv_LUT64: 10 | mv a4, a2 11 | mv a2, a1 12 | vsetvli a6, zero, e8, m1, ta, ma 13 | slli a5, a6, 2 14 | bgeu a4, a5, .LBB0_3 15 | mv a1, a0 16 | .LBB0_2: 17 | sub a0, a1, a0 18 | mv a3, a4 19 | tail b64_encode_scalar_tail 20 | .LBB0_3: 21 | vid.v v24 22 | li t2, 3 23 | lui t3, 4128 24 | lui a7, 96 25 | lui t0, 128 26 | vsetvli zero, a5, e8, m4, ta, ma 27 | vid.v v20 28 | li a1, 64 29 | vsetvli zero, a1, e8, m4, ta, ma 30 | vle8.v v8, (a3) 31 | srli t1, a6, 2 32 | addi a7, a7, 10 33 | addi t0, t0, 4 34 | slli a1, t1, 1 35 | vsetvli a3, zero, e32, m4, ta, ma 36 | vmv.v.x v12, a7 37 | slli a3, t1, 3 38 | vmv.v.x v16, t0 39 | add a7, a1, t1 40 | sub t0, a3, a1 41 | add t1, t1, a3 42 | li a1, 63 43 | vsetvli zero, zero, e8, m1, ta, ma 44 | vsrl.vi v24, v24, 2 45 | addi t3, t3, 1 46 | vsetvli zero, a5, e8, m4, ta, ma 47 | vand.vi v20, v20, 1 48 | vmsne.vi v0, v20, 0 49 | vmv.v.x v20, a1 50 | vsetvli a3, zero, e8, m1, ta, ma 51 | vmul.vx v24, v24, t2 52 | vsetvli zero, a6, e32, m1, ta, ma 53 | vadd.vx v7, v24, t3 54 | slli a3, a6, 1 55 | add t3, a3, a6 56 | bgeu a1, a6, .LBB0_9 57 | vsetvli a1, zero, e16, m2, ta, ma 58 | vid.v v10 59 | lui a1, 32769 60 | vsrl.vi v10, v10, 2 61 | slli a1, a1, 21 62 | vmul.vx v10, v10, t2 63 | addi a1, a1, 1 64 | vsetvli zero, a6, e64, m2, ta, ma 65 | vadd.vx v10, v10, a1 66 | li t2, 257 67 | mv a1, a0 68 | j .LBB0_7 69 | .LBB0_5: 70 | vrgather.vv v24, v9, v7 71 | vrgather.vv v25, v26, v7 72 | vrgather.vv v26, v27, v7 73 | vrgather.vv v27, v28, v7 74 | .LBB0_6: 75 | vsetvli zero, a5, e16, m4, ta, ma 76 | vsrl.vv v28, v24, v12 77 | vsll.vv v24, v24, v16 78 | sub a4, a4, t3 79 | add a2, a2, t3 80 | vsetvli zero, a5, e8, m4, ta, ma 81 | vmerge.vvm v24, v28, v24, v0 82 | vand.vv v24, v24, v20 83 | vsetvli a3, zero, e8, m1, ta, ma 84 | vrgather.vv v28, v8, v24 85 | vrgather.vv v29, v8, v25 86 | vrgather.vv v30, v8, v26 87 | vrgather.vv v31, v8, v27 88 | vsetvli zero, a5, e8, m4, ta, ma 89 | vse8.v v28, (a1) 90 | add a1, a1, a5 91 | bltu a4, a5, .LBB0_2 92 | .LBB0_7: 93 | vsetvli a3, zero, e8, m1, ta, ma 94 | vle8.v v9, (a2) 95 | add a3, a2, a7 96 | vle8.v v26, (a3) 97 | add a3, a2, t0 98 | vle8.v v27, (a3) 99 | add a3, a2, t1 100 | vle8.v v28, (a3) 101 | bltu a6, t2, .LBB0_5 102 | vrgatherei16.vv v24, v9, v10 103 | vrgatherei16.vv v25, v26, v10 104 | vrgatherei16.vv v26, v27, v10 105 | vrgatherei16.vv v27, v28, v10 106 | j .LBB0_6 107 | .LBB0_9: 108 | li t2, 31 109 | vsetvli a1, zero, e8, m2, ta, ma 110 | mv a1, a0 111 | j .LBB0_12 112 | .LBB0_10: 113 | vsetvli a3, zero, 
e8, m2, ta, ma 114 | vrgather.vv v24, v8, v28 115 | vrgather.vv v26, v8, v30 116 | .LBB0_11: 117 | vsetvli zero, a5, e8, m4, ta, ma 118 | vse8.v v24, (a1) 119 | sub a4, a4, t3 120 | add a2, a2, t3 121 | add a1, a1, a5 122 | bltu a4, a5, .LBB0_2 123 | .LBB0_12: 124 | vsetvli a3, zero, e8, m1, ta, ma 125 | vle8.v v25, (a2) 126 | add a3, a2, a7 127 | vle8.v v26, (a3) 128 | add a3, a2, t0 129 | vle8.v v27, (a3) 130 | add a3, a2, t1 131 | vle8.v v28, (a3) 132 | vrgather.vv v24, v25, v7 133 | vrgather.vv v25, v26, v7 134 | vrgather.vv v26, v27, v7 135 | vrgather.vv v27, v28, v7 136 | vsetvli zero, a5, e16, m4, ta, ma 137 | vsrl.vv v28, v24, v12 138 | vsll.vv v24, v24, v16 139 | vsetvli zero, a5, e8, m4, ta, ma 140 | vmerge.vvm v24, v28, v24, v0 141 | vand.vv v28, v24, v20 142 | bltu t2, a6, .LBB0_10 143 | vrgather.vv v24, v8, v28 144 | j .LBB0_11 145 | 146 | .global b64_encode_rvv_seg_LUT64 147 | b64_encode_rvv_seg_LUT64: 148 | mv a4, a2 149 | mv a2, a1 150 | vsetvli a7, zero, e8, m1, ta, ma 151 | slli a5, a7, 2 152 | bgeu a4, a5, .LBB1_2 153 | mv a1, a0 154 | sub a0, a0, a0 155 | mv a3, a4 156 | tail b64_encode_scalar_tail 157 | .LBB1_2: 158 | li a6, 64 159 | vsetvli zero, a6, e8, m4, ta, ma 160 | vle8.v v8, (a3) 161 | li a3, 63 162 | vsetvli a1, zero, e8, m1, ta, ma 163 | vmv.v.x v12, a3 164 | slli a3, a7, 1 165 | add t0, a3, a7 166 | bltu a7, a6, .LBB1_6 167 | li a6, 4 168 | li a7, 16 169 | mv a1, a0 170 | .LBB1_4: 171 | vlseg3e8.v v9, (a2) 172 | sub a4, a4, t0 173 | add a2, a2, t0 174 | vand.vv v13, v11, v12 175 | vsrl.vi v11, v11, 6 176 | vsrl.vi v14, v10, 4 177 | vsrl.vi v15, v9, 2 178 | vmacc.vx v11, a6, v10 179 | vmacc.vx v14, a7, v9 180 | vrgather.vv v16, v8, v15 181 | vand.vv v9, v11, v12 182 | vand.vv v10, v14, v12 183 | vrgather.vv v17, v8, v10 184 | vrgather.vv v18, v8, v9 185 | vrgather.vv v19, v8, v13 186 | vsseg4e8.v v16, (a1) 187 | add a1, a1, a5 188 | bgeu a4, a5, .LBB1_4 189 | .LBB1_5: 190 | sub a0, a1, a0 191 | mv a3, a4 192 | tail b64_encode_scalar_tail 193 | .LBB1_6: 194 | li a1, 31 195 | bgeu a1, a7, .LBB1_9 196 | vsetvli a1, zero, e8, m2, ta, ma 197 | li a6, 4 198 | li a7, 16 199 | mv a1, a0 200 | .LBB1_8: 201 | vsetvli a3, zero, e8, m1, ta, ma 202 | vlseg3e8.v v13, (a2) 203 | sub a4, a4, t0 204 | add a2, a2, t0 205 | vand.vv v11, v15, v12 206 | vsrl.vi v10, v15, 6 207 | vsrl.vi v15, v14, 4 208 | vmacc.vx v10, a6, v14 209 | vmacc.vx v15, a7, v13 210 | vand.vv v10, v10, v12 211 | vand.vv v17, v15, v12 212 | vsrl.vi v16, v13, 2 213 | vsetvli a3, zero, e8, m2, ta, ma 214 | vrgather.vv v20, v8, v16 215 | vrgather.vv v22, v8, v10 216 | vsetvli a3, zero, e8, m1, ta, ma 217 | vsseg4e8.v v20, (a1) 218 | add a1, a1, a5 219 | bgeu a4, a5, .LBB1_8 220 | j .LBB1_5 221 | .LBB1_9: 222 | li a6, 4 223 | li a7, 16 224 | mv a1, a0 225 | .LBB1_10: 226 | vlseg3e8.v v13, (a2) 227 | sub a4, a4, t0 228 | add a2, a2, t0 229 | vand.vv v19, v15, v12 230 | vsrl.vi v15, v15, 6 231 | vmacc.vx v15, a6, v14 232 | vand.vv v18, v15, v12 233 | vsrl.vi v14, v14, 4 234 | vmacc.vx v14, a7, v13 235 | vand.vv v17, v14, v12 236 | vsrl.vi v16, v13, 2 237 | vsetvli zero, a5, e8, m4, ta, ma 238 | vrgather.vv v20, v8, v16 239 | vsetvli a3, zero, e8, m1, ta, ma 240 | vsseg4e8.v v20, (a1) 241 | add a1, a1, a5 242 | bgeu a4, a5, .LBB1_10 243 | j .LBB1_5 244 | 245 | .global b64_encode_rvv_LUT16 246 | b64_encode_rvv_LUT16: 247 | mv a4, a2 248 | mv a2, a1 249 | vsetvli t3, zero, e8, m1, ta, ma 250 | slli a5, t3, 2 251 | bgeu a4, a5, .LBB2_2 252 | mv a1, a0 253 | sub a0, a0, a0 254 | mv a3, a4 255 | tail 
b64_encode_scalar_tail 256 | .LBB2_2: 257 | li t2, 3 258 | lui a6, 96 259 | lui a7, 128 260 | vsetvli zero, a5, e8, m4, ta, ma 261 | vid.v v8 262 | li t0, 63 263 | addi a3, a3, 64 264 | srli a1, t3, 2 265 | addi a6, a6, 10 266 | addi a7, a7, 4 267 | vand.vi v20, v8, 1 268 | vmv.v.x v12, t0 269 | vsetivli zero, 16, e8, m1, ta, ma 270 | vle8.v v9, (a3) 271 | slli t1, a1, 1 272 | slli t0, a1, 3 273 | vsetvli a3, zero, e32, m4, ta, ma 274 | vmv.v.x v16, a6 275 | vsetvli zero, a5, e8, m4, ta, ma 276 | vmsne.vi v8, v20, 0 277 | vsetvli a3, zero, e32, m4, ta, ma 278 | vmv.v.x v20, a7 279 | add a6, t1, a1 280 | sub a7, t0, t1 281 | add t0, t0, a1 282 | slli a3, t3, 1 283 | li a1, 257 284 | add t4, a3, t3 285 | bgeu t3, a1, .LBB2_6 286 | vsetvli zero, zero, e8, m1, ta, ma 287 | vid.v v10 288 | lui a1, 4128 289 | li t1, 51 290 | vsrl.vi v10, v10, 2 291 | addi a1, a1, 1 292 | vmul.vx v10, v10, t2 293 | vsetvli zero, t3, e32, m1, ta, ma 294 | vadd.vx v10, v10, a1 295 | li t2, 26 296 | mv a1, a0 297 | .LBB2_4: 298 | vsetvli a3, zero, e8, m1, ta, ma 299 | vmv1r.v v0, v8 300 | vle8.v v11, (a2) 301 | add a3, a2, a6 302 | vle8.v v26, (a3) 303 | add a3, a2, a7 304 | vle8.v v27, (a3) 305 | add a3, a2, t0 306 | sub a4, a4, t4 307 | add a2, a2, t4 308 | vle8.v v28, (a3) 309 | vrgather.vv v24, v11, v10 310 | vrgather.vv v25, v26, v10 311 | vrgather.vv v26, v27, v10 312 | vrgather.vv v27, v28, v10 313 | vsetvli zero, a5, e16, m4, ta, ma 314 | vsrl.vv v28, v24, v16 315 | vsll.vv v24, v24, v20 316 | vsetvli zero, a5, e8, m4, ta, ma 317 | vmerge.vvm v24, v28, v24, v0 318 | vand.vv v24, v24, v12 319 | vmsltu.vx v0, v24, t2 320 | vssubu.vx v28, v24, t1 321 | vmerge.vim v28, v28, 13, v0 322 | vsetvli a3, zero, e8, m1, ta, ma 323 | vrgather.vv v4, v9, v28 324 | vrgather.vv v5, v9, v29 325 | vrgather.vv v6, v9, v30 326 | vrgather.vv v7, v9, v31 327 | vsetvli zero, a5, e8, m4, ta, ma 328 | vadd.vv v24, v24, v4 329 | vse8.v v24, (a1) 330 | add a1, a1, a5 331 | bgeu a4, a5, .LBB2_4 332 | .LBB2_5: 333 | sub a0, a1, a0 334 | mv a3, a4 335 | tail b64_encode_scalar_tail 336 | .LBB2_6: 337 | vsetvli zero, zero, e16, m2, ta, ma 338 | vid.v v10 339 | lui a1, 32769 340 | li t1, 51 341 | vsrl.vi v10, v10, 2 342 | slli a1, a1, 21 343 | vmul.vx v10, v10, t2 344 | addi a1, a1, 1 345 | vsetvli zero, t3, e64, m2, ta, ma 346 | vadd.vx v10, v10, a1 347 | li t2, 26 348 | mv a1, a0 349 | .LBB2_7: 350 | vsetvli a3, zero, e8, m1, ta, ma 351 | vmv1r.v v0, v8 352 | vle8.v v25, (a2) 353 | add a3, a2, a6 354 | vle8.v v26, (a3) 355 | add a3, a2, a7 356 | vle8.v v27, (a3) 357 | add a3, a2, t0 358 | sub a4, a4, t4 359 | add a2, a2, t4 360 | vle8.v v28, (a3) 361 | vrgatherei16.vv v24, v25, v10 362 | vrgatherei16.vv v25, v26, v10 363 | vrgatherei16.vv v26, v27, v10 364 | vrgatherei16.vv v27, v28, v10 365 | vsetvli zero, a5, e16, m4, ta, ma 366 | vsrl.vv v28, v24, v16 367 | vsll.vv v24, v24, v20 368 | vsetvli zero, a5, e8, m4, ta, ma 369 | vmerge.vvm v24, v28, v24, v0 370 | vand.vv v24, v24, v12 371 | vmsltu.vx v0, v24, t2 372 | vssubu.vx v28, v24, t1 373 | vmerge.vim v28, v28, 13, v0 374 | vsetvli a3, zero, e8, m1, ta, ma 375 | vrgather.vv v4, v9, v28 376 | vrgather.vv v5, v9, v29 377 | vrgather.vv v6, v9, v30 378 | vrgather.vv v7, v9, v31 379 | vsetvli zero, a5, e8, m4, ta, ma 380 | vadd.vv v24, v24, v4 381 | vse8.v v24, (a1) 382 | add a1, a1, a5 383 | bgeu a4, a5, .LBB2_7 384 | j .LBB2_5 385 | 386 | .global b64_encode_rvv_seg_LUT16 387 | b64_encode_rvv_seg_LUT16: 388 | mv a4, a2 389 | mv a2, a1 390 | vsetvli a1, zero, e8, m1, ta, ma 391 | slli a5, 
a1, 2 392 | bgeu a4, a5, .LBB3_2 393 | mv a1, a0 394 | sub a0, a0, a0 395 | mv a3, a4 396 | tail b64_encode_scalar_tail 397 | .LBB3_2: 398 | li a6, 63 399 | addi a7, a3, 64 400 | slli a3, a1, 1 401 | vmv.v.x v8, a6 402 | vsetivli zero, 16, e8, m1, ta, ma 403 | vle8.v v9, (a7) 404 | add t2, a3, a1 405 | li a6, 4 406 | li a7, 16 407 | li t0, 51 408 | li t1, 26 409 | mv a1, a0 410 | .LBB3_3: 411 | vsetvli a3, zero, e8, m1, ta, ma 412 | vlseg3e8.v v10, (a2) 413 | sub a4, a4, t2 414 | add a2, a2, t2 415 | vand.vv v15, v12, v8 416 | vsrl.vi v12, v12, 6 417 | vsrl.vi v13, v11, 4 418 | vmacc.vx v12, a6, v11 419 | vmacc.vx v13, a7, v10 420 | vand.vv v14, v12, v8 421 | vand.vv v13, v13, v8 422 | vsrl.vi v12, v10, 2 423 | vsetvli zero, a5, e8, m4, ta, ma 424 | vmsltu.vx v0, v12, t1 425 | vssubu.vx v16, v12, t0 426 | vmerge.vim v16, v16, 13, v0 427 | vsetvli a3, zero, e8, m1, ta, ma 428 | vrgather.vv v20, v9, v16 429 | vrgather.vv v21, v9, v17 430 | vrgather.vv v22, v9, v18 431 | vrgather.vv v23, v9, v19 432 | vsetvli zero, a5, e8, m4, ta, ma 433 | vadd.vv v12, v12, v20 434 | vsetvli a3, zero, e8, m1, ta, ma 435 | vsseg4e8.v v12, (a1) 436 | add a1, a1, a5 437 | bgeu a4, a5, .LBB3_3 438 | sub a0, a1, a0 439 | mv a3, a4 440 | tail b64_encode_scalar_tail 441 | 442 | #endif 443 | 444 | -------------------------------------------------------------------------------- /bench/base64_encode.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | size_t 4 | b64_encode_scalar(uint8_t *dst, const uint8_t *src, size_t length, const uint8_t LUTs[64+16]) 5 | { 6 | uint8_t *dstBeg = dst; 7 | for (; length >= 3; length -= 3, src += 3, dst += 4) { 8 | uint32_t u32 = src[0] << 16 | src[1] << 8 | src[2]; 9 | dst[0] = LUTs[(u32 >> 18) & 63]; 10 | dst[1] = LUTs[(u32 >> 12) & 63]; 11 | dst[2] = LUTs[(u32 >> 6) & 63]; 12 | dst[3] = LUTs[(u32 >> 0) & 63]; 13 | } 14 | if (length > 0) { 15 | uint32_t u32 = src[0] << 8 | (length > 1 ? src[1] : 0); 16 | *dst++ = LUTs[(u32 >> 10) & 63]; 17 | *dst++ = LUTs[(u32 >> 4) & 63]; 18 | *dst++ = length > 1 ? 
LUTs[(u32 << 2) & 63] : '='; 19 | *dst++ = '='; 20 | } 21 | return dst - dstBeg; 22 | } 23 | 24 | static uint8_t base64LUTs[64 + 16] = 25 | "ABCDEFGHIJKLMNOPQRSTUVWXYZ" 26 | "abcdefghijklmnopqrstuvwxyz" 27 | "0123456789" 28 | "+/" 29 | "\x47\xfc\xfc\xfc\xfc\xfc\xfc\xfc\xfc\xfc\xfc\xed\xf0\x41" 30 | // 'a'-26, 10x '0' - 52, '+' - 62, '/' - 63, 'A' 31 | ; 32 | 33 | /* used to prevent GPR spill in vectorized implementations */ 34 | size_t 35 | b64_encode_scalar_tail(size_t prefix, uint8_t *dst, const uint8_t *src, size_t length) 36 | { 37 | return prefix + b64_encode_scalar(dst, src, length, base64LUTs); 38 | } 39 | 40 | 41 | #define IMPLS(f) \ 42 | f(scalar) \ 43 | f(rvv_LUT64) f(rvv_LUT16) \ 44 | f(rvv_seg_LUT64) f(rvv_seg_LUT16) 45 | 46 | typedef size_t Func(uint8_t *dst, const uint8_t *src, size_t length, const uint8_t LUTs[64+16]); 47 | 48 | #define DECLARE(f) extern Func b64_encode_##f; 49 | IMPLS(DECLARE) 50 | 51 | #define EXTRACT(f) { #f, &b64_encode_##f }, 52 | Impl impls[] = { IMPLS(EXTRACT) }; 53 | 54 | uint8_t *dest, *src; 55 | size_t last; 56 | 57 | void init(void) { } 58 | 59 | ux checksum(size_t n) { 60 | ux sum = last; 61 | for (size_t i = 0; i < last+9; ++i) 62 | sum = uhash(sum) + dest[i]; 63 | return sum; 64 | } 65 | 66 | BENCH_BEG(base) { 67 | src = mem; 68 | dest = mem + MAX_MEM/3; 69 | memset(dest, 0, n*2+9); 70 | TIME last = f(dest, src, n, base64LUTs); 71 | } BENCH_END 72 | 73 | Bench benches[] = { 74 | BENCH( impls, MAX_MEM/3, "base64 encode", bench_base ), 75 | }; BENCH_MAIN(benches) 76 | 77 | -------------------------------------------------------------------------------- /bench/bench.h: -------------------------------------------------------------------------------- 1 | #include "config.h" 2 | #include "../nolibc.h" 3 | 4 | #ifndef BENCH_NEXT 5 | # define BENCH_NEXT NEXT 6 | #endif 7 | 8 | #define MX(f,F) f(F##_m1) f(F##_m2) f(F##_m4) f(F##_m8) 9 | #define STR(x) STR_(x) 10 | #define STR_(x) #x 11 | 12 | #if defined(__clang__) || defined(__GNUC__) || defined(__INTEL_COMPILER) 13 | 14 | # define BENCH_CLOBBER() ({__asm__ volatile("":::"memory");}) 15 | # define BENCH_VOLATILE(x) ({__asm__ volatile("" : "+g"(x) : "g"(x) : "memory");}) 16 | # define BENCH_VOLATILE_REG(x) ({__asm__ volatile("" : "+r"(x) : "r"(x) : "memory");}) 17 | # define BENCH_VOLATILE_MEM(x) ({__asm__ volatile("" : "+m"(x) : "m"(x) : "memory");}) 18 | 19 | #define BENCH_MAY_ALIAS __attribute__((__may_alias__)) 20 | 21 | #else 22 | 23 | # define BENCH_CLOBBER() 24 | # define BENCH_CLOBBER_WITH(x) (bench__use_ptr(&(x)), BENCH_CLOBBER()) 25 | # define BENCH_CLOBBER_WITH_REG(x) (bench__use_ptr(&(x)), BENCH_CLOBBER()) 26 | # define BENCH_CLOBBER_WITH_MEM(x) (bench__use_ptr(&(x)), BENCH_CLOBBER()) 27 | static void bench_use_ptr(char const volatile *x) {} 28 | 29 | #define BENCH_MAY_ALIAS 30 | 31 | #endif 32 | 33 | 34 | static int 35 | compare_ux(void const *a, void const *b) 36 | { 37 | ux A = *(ux*)a, B = *(ux*)b; 38 | return A < B ? -1 : A > B ? 
1 : 0; 39 | } 40 | 41 | static URand randState = { 123, 456, 789 }; 42 | static ux bench_urand(void) { return urand(&randState); } 43 | static float bench_urandf(void) { return urandf(&randState); } 44 | static void bench_memrand(void *ptr, size_t n) { return memrand(&randState, ptr, n); } 45 | 46 | typedef struct { 47 | char const *name; void *func; 48 | } Impl; 49 | typedef struct { 50 | Impl *impls; 51 | size_t nImpls; 52 | size_t N; 53 | char const *name; 54 | ux (*func)(void *, size_t); 55 | } Bench; 56 | 57 | static unsigned char *mem = 0; 58 | 59 | void bench_main(void); 60 | ux checksum(size_t n); 61 | void init(void); 62 | 63 | #if __STDC_HOSTED__ && !defined(CUSTOM_HOST) 64 | # include 65 | #else 66 | static ux heap[1 + MAX_MEM / sizeof(ux)]; 67 | #endif 68 | 69 | 70 | int 71 | main(void) 72 | { 73 | 74 | #if __STDC_HOSTED__ && !defined(CUSTOM_HOST) 75 | mem = malloc(MAX_MEM); 76 | #else 77 | mem = (unsigned char*)heap; 78 | #endif 79 | 80 | size_t x; 81 | randState.x ^= rv_cycles()*7; 82 | randState.y += rv_cycles() ^ ((uintptr_t)&x + 666*(uintptr_t)mem); 83 | 84 | /* initialize memory */ 85 | bench_memrand(mem, MAX_MEM); 86 | 87 | init(); 88 | bench_main(); 89 | #if __STDC_HOSTED__ && !defined(CUSTOM_HOST) 90 | free(mem); 91 | #endif 92 | return 0; 93 | } 94 | 95 | static fx 96 | bench_time(size_t n, Impl impl, Bench bench) 97 | { 98 | static ux arr[MAX_REPEATS]; 99 | size_t total = 0, repeats = 0; 100 | for (; repeats < MAX_REPEATS; ++repeats) { 101 | total += arr[repeats] = bench.func(impl.func, n); 102 | if (repeats > MIN_REPEATS && total > STOP_CYCLES) 103 | break; 104 | } 105 | #if MAX_REPEATS > 4 106 | qsort(arr, repeats, sizeof *arr, compare_ux); 107 | ux sum = 0, count = 0; 108 | for (size_t i = repeats * 0.2f; i < repeats * 0.8f; ++i, ++count) 109 | sum += arr[i]; 110 | #else 111 | ux sum = 0, count = repeats; 112 | for (size_t i = 0; i < repeats; ++i) 113 | sum += arr[i]; 114 | #endif 115 | return n / ((fx)sum / count); 116 | } 117 | 118 | static void 119 | bench_run(Bench *benches, size_t nBenches) 120 | { 121 | for (Bench *b = benches; b != benches + nBenches; ++b) { 122 | print("{\ntitle: \"")(s,b->name)("\",\n"); 123 | print("labels: [\"0\","); 124 | for (size_t i = 0; i < b->nImpls; ++i) 125 | print("\"")(s,b->impls[i].name)("\","); 126 | print("],\n"); 127 | 128 | size_t N = b->N; 129 | print("data: [\n["); 130 | for (size_t n = 1; n < N; n = BENCH_NEXT(n)) 131 | print(u,n)(","); 132 | print("],\n")(flush,); 133 | 134 | for (Impl *i = b->impls; i != b->impls + b->nImpls; ++i) { 135 | print("["); 136 | for (size_t n = 1; n < N; n = BENCH_NEXT(n)) { 137 | #if VALIDATE 138 | ux si = 0, s0 = 0; 139 | if (i != b->impls) { 140 | URand seed = randState; 141 | (void)b->func(i->func, n); 142 | si = checksum(n); 143 | 144 | randState = seed; 145 | (void)b->func(b->impls[0].func, n); 146 | s0 = checksum(n); 147 | } 148 | 149 | if (si != s0) { 150 | print("ERROR: ")(s,i->name)(" in ")(s,b->name)(" at ")(u,n)(flush,); 151 | exit(EXIT_FAILURE); 152 | } 153 | #endif 154 | 155 | print(f,bench_time(n, *i, *b))(",")(flush,); 156 | } 157 | print("],\n")(flush,); 158 | } 159 | print("]\n},\n"); 160 | } 161 | } 162 | 163 | #define TIME \ 164 | for (ux beg = rv_cycles(), _once = 1; _once; \ 165 | rv_fencei(), \ 166 | _cycles += rv_cycles() - beg, _once = 0) 167 | 168 | #define BENCH_BEG(name) \ 169 | ux bench_##name(void *_func, size_t n) { \ 170 | Func *f = _func; ux _cycles = 0; 171 | #define BENCH_END return _cycles; } 172 | 173 | #define BENCH(impls, ...) 
{ impls, ARR_LEN(impls), __VA_ARGS__ } 174 | 175 | #define BENCH_MAIN(benches) \ 176 | void bench_main(void) { \ 177 | bench_run(benches, ARR_LEN(benches)); \ 178 | } 179 | 180 | -------------------------------------------------------------------------------- /bench/byteswap.S: -------------------------------------------------------------------------------- 1 | 2 | #if defined(MX) && __riscv_zvbb 3 | .global MX(byteswap32_rvv_vrev8_) 4 | MX(byteswap32_rvv_vrev8_): 5 | 1: 6 | vsetvli t0, a1, e32, MX(), ta, ma 7 | vle32.v v0, (a0) 8 | vrev8.v v8, v0 9 | vse32.v v8, (a0) 10 | sub a1, a1, t0 11 | slli t1, t0, 2 12 | add a0, a0, t1 13 | bnez a1, 1b 14 | ret 15 | #endif 16 | 17 | #if MX_N == 4 || MX_N == 2 || MX_N == 1 18 | 19 | # a0 = ptr, a1 = len 20 | .global MX(byteswap32_rvv_gatherei16_) 21 | MX(byteswap32_rvv_gatherei16_): 22 | vsetvli t0, x0, e16, MX2(), ta, ma 23 | vid.v v0 24 | vand.vi v8, v0, 3 25 | vrsub.vi v8, v8, 3 26 | vsrl.vi v0, v0, 2 27 | vsll.vi v0, v0, 2 28 | vadd.vv v0, v0, v8 # i/8*8 + (7-1%8) 29 | 1: 30 | vsetvli t0, a1, e32, MX(), ta, ma 31 | vle32.v v8, (a0) 32 | slli t1, t0, 2 33 | vsetvli x0, t1, e8, MX(), ta, ma 34 | vrgatherei16.vv v16, v8, v0 35 | vsetvli x0, t0, e32, MX(), ta, ma 36 | vse32.v v16, (a0) 37 | sub a1, a1, t0 38 | add a0, a0, t1 39 | bnez a1, 1b 40 | ret 41 | #endif 42 | 43 | #if MX_N == 2 44 | 45 | .macro byteswap32_rvv_m1_gatherei16s n 46 | .global byteswap32_rvv_m1_gatherei16s_m\n 47 | byteswap32_rvv_m1_gatherei16s_m\n: 48 | vsetvli t0, x0, e16, MX(), ta, ma 49 | vid.v v0 50 | vand.vi v8, v0, 3 51 | vrsub.vi v8, v8, 3 52 | vsrl.vi v0, v0, 2 53 | vsll.vi v0, v0, 2 54 | vadd.vv v0, v0, v8 # i/8*8 + (7-1%8) 55 | 1: 56 | vsetvli t0, a1, e32, m\n, ta, ma 57 | vle32.v v8, (a0) 58 | vsetvli t1, x0, e8, m1, ta, ma 59 | vrgatherei16.vv v16, v8, v0 60 | .ifge \n-2 61 | vrgatherei16.vv v17, v9, v0 62 | .ifge \n-4 63 | vrgatherei16.vv v18, v10, v0 64 | vrgatherei16.vv v19, v11, v0 65 | .ifge \n-8 66 | vrgatherei16.vv v20, v12, v0 67 | vrgatherei16.vv v21, v13, v0 68 | vrgatherei16.vv v22, v14, v0 69 | vrgatherei16.vv v23, v15, v0 70 | .endif 71 | .endif 72 | .endif 73 | vsetvli x0, t0, e32, m\n, ta, ma 74 | vse32.v v16, (a0) 75 | sub a1, a1, t0 76 | slli t0, t0, 2 77 | add a0, a0, t0 78 | bnez a1, 1b 79 | ret 80 | .endm 81 | 82 | byteswap32_rvv_m1_gatherei16s 2 83 | #endif 84 | #if MX_N == 4 85 | byteswap32_rvv_m1_gatherei16s 4 86 | #endif 87 | #if MX_N == 8 88 | byteswap32_rvv_m1_gatherei16s 8 89 | #endif 90 | 91 | -------------------------------------------------------------------------------- /bench/byteswap.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void 4 | byteswap32_scalar(uint32_t *ptr, size_t n) 5 | { 6 | for (uint8_t *p = (uint8_t*)ptr; n--; p += 4) { 7 | uint8_t p0 = p[0], p1 = p[1], p2 = p[2], p3 = p[3]; 8 | p[3] = p0; BENCH_CLOBBER(); 9 | p[2] = p1; BENCH_CLOBBER(); 10 | p[1] = p2; BENCH_CLOBBER(); 11 | p[0] = p3; BENCH_CLOBBER(); 12 | } 13 | } 14 | 15 | void 16 | byteswap32_scalar_autovec(uint32_t *ptr, size_t n) 17 | { 18 | for (uint8_t *p = (uint8_t*)ptr; n--; p += 4) { 19 | uint8_t p0 = p[0], p1 = p[1], p2 = p[2], p3 = p[3]; 20 | p[3] = p0; 21 | p[2] = p1; 22 | p[1] = p2; 23 | p[0] = p3; 24 | } 25 | } 26 | 27 | #if __riscv_zbb 28 | void 29 | byteswap32_SWAR_rev8(uint32_t *ptr, size_t n) 30 | { 31 | while (n--) { 32 | *ptr = __builtin_bswap32(*ptr); 33 | ++ptr; 34 | BENCH_CLOBBER(); 35 | } 36 | } 37 | #define REV8(f) f(SWAR_rev8) 38 | #else 39 | #define REV8(f) 40 | #endif 41 
| 42 | 43 | /* we don't support these on XTheadVector */ 44 | #ifndef __riscv_vector 45 | #define IMPLS_RVV(f) 46 | #else 47 | #define IMPLS_RVV(f) \ 48 | f(rvv_gatherei16_m1) \ 49 | f(rvv_gatherei16_m2) \ 50 | f(rvv_gatherei16_m4) \ 51 | f(rvv_m1_gatherei16s_m2) \ 52 | f(rvv_m1_gatherei16s_m4) \ 53 | f(rvv_m1_gatherei16s_m8) 54 | #endif 55 | 56 | #if __riscv_zvbb 57 | #define IMPLS_ZVBB(f) MX(f,rvv_vrev8) 58 | #else 59 | #define IMPLS_ZVBB(f) 60 | #endif 61 | 62 | 63 | #define IMPLS(f) \ 64 | f(scalar) \ 65 | f(scalar_autovec) \ 66 | REV8(f) \ 67 | IMPLS_ZVBB(f) \ 68 | IMPLS_RVV(f) 69 | 70 | typedef void Func(uint32_t *ptr, size_t n); 71 | 72 | #define DECLARE(f) extern Func byteswap32_##f; 73 | IMPLS(DECLARE) 74 | 75 | #define EXTRACT(f) { #f, &byteswap32_##f }, 76 | Impl impls[] = { IMPLS(EXTRACT) }; 77 | 78 | uint32_t *ptr; 79 | 80 | void init(void) { ptr = (uint32_t*)mem; } 81 | 82 | ux checksum(size_t n) { 83 | ux sum = 0; 84 | for (size_t i = 0; i < n; ++i) 85 | sum = uhash(sum) + ptr[i]; 86 | return sum; 87 | } 88 | 89 | BENCH_BEG(base) { 90 | bench_memrand(ptr, n * sizeof *ptr); 91 | TIME f(ptr, n); 92 | } BENCH_END 93 | 94 | Bench benches[] = { 95 | BENCH( impls, MAX_MEM/4, "byteswap32", bench_base ) 96 | }; BENCH_MAIN(benches) 97 | 98 | -------------------------------------------------------------------------------- /bench/chacha20.S: -------------------------------------------------------------------------------- 1 | #ifndef MX 2 | #if __riscv_xlen != 32 3 | #include "../thirdparty/rvv-chacha-poly/vchacha.s" 4 | #endif 5 | #endif 6 | -------------------------------------------------------------------------------- /bench/chacha20.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | #if __riscv_xlen != 32 3 | #include "../thirdparty/rvv-chacha-poly/boring.h" 4 | 5 | uint8_t *dest, *src; 6 | uint8_t key[32], nonce[12]; 7 | uint32_t counter; 8 | 9 | 10 | extern void vector_chacha20( 11 | uint8_t *out, const uint8_t *in, 12 | size_t in_len, const uint8_t key[32], 13 | const uint8_t nonce[12], uint32_t counter); 14 | 15 | static void 16 | chacha20_boring(void *restrict dest, void const *restrict src, size_t n) { 17 | boring_chacha20(dest, src, n, key, nonce, counter); 18 | } 19 | 20 | static void 21 | chacha20_rvv(void *restrict dest, void const *restrict src, size_t n) { 22 | vector_chacha20(dest, src, n, key, nonce, counter); 23 | } 24 | 25 | typedef void *Func(void *restrict dest, void const *restrict src, size_t n); 26 | 27 | Impl impls[] = { 28 | { "boring", &chacha20_boring }, 29 | { "rvv", &chacha20_rvv }, 30 | }; 31 | 32 | void init(void) { 33 | bench_memrand(key, sizeof key); 34 | bench_memrand(nonce, sizeof nonce); 35 | counter = 0; 36 | } 37 | 38 | ux checksum(size_t n) { 39 | ux sum = 0; 40 | for (size_t i = 0; i < n+16; ++i) 41 | sum = uhash(sum) + mem[i]; 42 | return sum; 43 | } 44 | 45 | BENCH_BEG(aligned) { 46 | memset(mem, 0, n+16); 47 | TIME f(mem, mem + MAX_MEM/2 + 16, n); 48 | } BENCH_END 49 | 50 | Bench benches[] = { 51 | BENCH( impls, MAX_MEM/2 - 16, "chacha20 aligned", bench_aligned ) 52 | }; BENCH_MAIN(benches) 53 | 54 | 55 | #include "../thirdparty/rvv-chacha-poly/boring.c" 56 | #else 57 | void init(void) {} 58 | Impl impls[] = {}; 59 | Bench benches[] = {}; 60 | BENCH_MAIN(benches) 61 | #endif 62 | -------------------------------------------------------------------------------- /bench/config.h: -------------------------------------------------------------------------------- 1 | /* the maximum number 
of bytes to allocate, minimum of 4096 */ 2 | #define MAX_MEM (1024*1024*32) 3 | /* the byte count for the next run */ 4 | #define NEXT(c) (c + c/7 + 3) 5 | 6 | /* minimum number of repeats, to sample median from */ 7 | #define MIN_REPEATS 10 8 | /* maxium number of repeats, executed until more than STOP_TIME has elapsed */ 9 | #define MAX_REPEATS 64 10 | 11 | /* stop repeats early afer this many cycles have elapsed */ 12 | #define STOP_CYCLES (1024*1024*500) 13 | 14 | /* validate against reference implementation on the first repetition */ 15 | #define VALIDATE 1 16 | 17 | /* custom scaling factors for benchmarks, these are used to make sure each 18 | * benchmark approximately takes the same amount of time. */ 19 | 20 | #define SCALE_mandelbrot(N) ((N)/10) 21 | #define SCALE_mergelines(N) ((N)/10) 22 | 23 | /* benchmark specific configurations */ 24 | #define mandelbrot_ITER 100 25 | -------------------------------------------------------------------------------- /bench/hist.S: -------------------------------------------------------------------------------- 1 | #if 0 2 | 3 | void 4 | hist_rvv_assume_no_conflict(uint32_t *hist, float *x, float *y, size_t n) 5 | { 6 | for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { 7 | vl = __riscv_vsetvl_e32m8(n); 8 | vfloat32m8_t vx = __riscv_vle32_v_f32m8(x, vl); 9 | vfloat32m8_t vy = __riscv_vle32_v_f32m8(y, vl); 10 | vfloat32m8_t vsq = __riscv_vfmacc(__riscv_vfmul(vx, vx, vl), vy, vy, vl); 11 | vfloat32m8_t v = __riscv_vfsqrt(vsq, vl); 12 | vuint32m8_t vidx = __riscv_vminu(__riscv_vfcvt_rtz_xu(v, vl), 100, vl); 13 | vidx = __riscv_vsll(vidx, 2, vl); 14 | vuint32m8_t vcnt =__riscv_vluxei32(hist, vidx, vl); 15 | vcnt = __riscv_vadd(vcnt, 1, vl); 16 | __riscv_vsuxei32(hist, vidx, vcnt, vl); 17 | } 18 | } 19 | 20 | void 21 | hist_rvv_slidedown(uint32_t *hist, float *x, float *y, size_t n) 22 | { 23 | for (size_t vl; n > 0; n -= vl, x += vl, y += vl) { 24 | vl = __riscv_vsetvl_e32m8(n); 25 | vfloat32m8_t vx = __riscv_vle32_v_f32m8(x, vl); 26 | vfloat32m8_t vy = __riscv_vle32_v_f32m8(y, vl); 27 | vfloat32m8_t vsq = __riscv_vfmacc(__riscv_vfmul(vx, vx, vl), vy, vy, vl); 28 | vfloat32m8_t v = __riscv_vfsqrt(vsq, vl); 29 | vuint16m4_t vidx = __riscv_vminu(__riscv_vfncvt_rtz_xu(v, vl), 100, vl); 30 | 31 | for (size_t i = 0; i < vl; ++i) { 32 | size_t idx = __riscv_vmv_x(__riscv_vslidedown(vidx, i, 1)); 33 | ++hist[idx]; 34 | } 35 | } 36 | } 37 | #endif 38 | 39 | #ifdef MX 40 | 41 | .global MX(LUT4_rvv_vloxei8_) 42 | MX(LUT4_rvv_vloxei8_): 43 | 1: 44 | vsetvli a3, a2, e8, MX(), ta, ma 45 | vle8.v v8, (a1) 46 | vand.vi v8, v8, 15 47 | vloxei8.v v8, (a0), v8 48 | vse8.v v8, (a1) 49 | sub a2, a2, a3 50 | add a1, a1, a3 51 | bnez a2, 1b 52 | ret 53 | 54 | /* assumes no conflicts, which causes the wrong result */ 55 | .global MX(hist_rvv_assume_no_conflict_) 56 | MX(hist_rvv_assume_no_conflict_): 57 | li a4, 100 58 | 1: 59 | vsetvli a5, a3, e32, m8, ta, ma 60 | vle32.v v8, (a1) 61 | vle32.v v16, (a2) 62 | vfmul.vv v8, v8, v8 63 | vfmacc.vv v8, v16, v16 64 | vfsqrt.v v8, v8 65 | vfcvt.rtz.xu.f.v v8, v8 66 | vminu.vx v8, v8, a4 67 | vsll.vi v8, v8, 2 68 | vluxei32.v v16, (a0), v8 69 | vadd.vi v16, v16, 1 70 | vsuxei32.v v16, (a0), v8 71 | sub a3, a3, a5 72 | slli a5, a5, 2 73 | add a1, a1, a5 74 | add a2, a2, a5 75 | bnez a3, 1b 76 | ret 77 | 78 | 79 | 80 | .global MX(hist_rvv_slidedown_) 81 | MX(hist_rvv_slidedown_): 82 | li a6, 100 83 | j 2f 84 | 1: 85 | sub a3, a3, a7 86 | slli a5, a7, 2 87 | add a1, a1, a5 88 | add a2, a2, a5 89 | beqz a3, 4f 90 | 2: 91 | vsetvli 
a7, a3, e32, MX(), ta, ma 92 | beqz a7, 1b 93 | vle32.v v8, (a1) 94 | vle32.v v16, (a2) 95 | li a4, 0 96 | vfmul.vv v8, v8, v8 97 | vfmacc.vv v8, v16, v16 98 | vfsqrt.v v8, v8 99 | vsetvli zero, zero, e16, MXf2(), ta, ma 100 | vfncvt.rtz.xu.f.w v16, v8 101 | vminu.vx v8, v16, a6 102 | vsll.vi v8, v8, 2 103 | vsetivli zero, 1, e16, MXf2(), ta, ma 104 | 3: 105 | vslidedown.vx v12, v8, a4 106 | vmv.x.s a5, v12 107 | add t0, a0, a5 108 | lw a5, 0(t0) 109 | addi a5, a5, 1 110 | addi a4, a4, 1 111 | sw a5, 0(t0) 112 | bne a7, a4, 3b 113 | j 1b 114 | 4: 115 | ret 116 | 117 | 118 | #endif 119 | 120 | 121 | #if MX_N == 1 122 | 123 | .global MX(hist_rvv_dup_entries_) 124 | MX(hist_rvv_dup_entries_): 125 | vsetvli a6, zero, e32, m1, ta, ma 126 | beqz a3, 2f 127 | slli a5, a6, 2 128 | vmv.v.x v8, a5 129 | vid.v v9 130 | addi a5, a6, -1 131 | vand.vx v9, v9, a5 132 | vsll.vi v9, v9, 2 133 | li a5, 100 134 | 1: 135 | vsetvli a4, a3, e32, m1, ta, ma 136 | vle32.v v10, (a1) 137 | vle32.v v11, (a2) 138 | vfmul.vv v10, v10, v10 139 | vfmacc.vv v10, v11, v11 140 | vfsqrt.v v10, v10 141 | vfcvt.rtz.xu.f.v v10, v10 142 | vminu.vx v10, v10, a5 143 | vmadd.vv v10, v8, v9 144 | vluxei32.v v11, (a0), v10 145 | vadd.vi v11, v11, 1 146 | vsuxei32.v v11, (a0), v10 147 | sub a3, a3, a4 148 | slli a4, a4, 2 149 | add a1, a1, a4 150 | add a2, a2, a4 151 | bnez a3, 1b 152 | 2: 153 | vsetvli a1, zero, e32, m1, ta, ma 154 | vmv.v.i v8, 0 155 | slli a4, a6, 2 156 | addi a1, a0, 400 157 | mv a2, a0 158 | vsetvli a3, zero, e32, m1, ta, ma 159 | 3: 160 | vle32.v v9, (a0) 161 | vredsum.vs v9, v9, v8 162 | vmv.x.s t0, v9 163 | sw t0, (a2) 164 | addi a2, a2, 4 165 | add a0, a0, a4 166 | bne a2, a1, 3b 167 | ret 168 | 169 | #endif 170 | 171 | #if MX_N == 2 172 | 173 | .global MX(hist_rvv_dup_entries_) 174 | MX(hist_rvv_dup_entries_): 175 | vsetvli a6, zero, e32, m1, ta, ma 176 | beqz a3, 2f 177 | slli a5, a6, 2 178 | slli a4, a6, 1 179 | vsetvli zero, a4, e32, m2, ta, ma 180 | vmv.v.x v8, a5 181 | vid.v v10 182 | addi a4, a6, -1 183 | vand.vx v10, v10, a4 184 | vsll.vi v10, v10, 2 185 | li a7, 100 186 | 1: 187 | vsetvli a4, a3, e32, m2, ta, ma 188 | vle32.v v12, (a1) 189 | vle32.v v14, (a2) 190 | vfmul.vv v12, v12, v12 191 | vfmacc.vv v12, v14, v14 192 | vfsqrt.v v12, v12 193 | vfcvt.rtz.xu.f.v v12, v12 194 | vminu.vx v12, v12, a7 195 | vmadd.vv v12, v8, v10 196 | vsetvli a5, a4, e32, m1, ta, ma 197 | vluxei32.v v14, (a0), v12 198 | sub a5, a4, a5 199 | vadd.vi v14, v14, 1 200 | vsuxei32.v v14, (a0), v12 201 | vsetvli zero, a5, e32, m1, ta, ma 202 | vluxei32.v v12, (a0), v13 203 | vadd.vi v12, v12, 1 204 | vsuxei32.v v12, (a0), v13 205 | sub a3, a3, a4 206 | slli a4, a4, 2 207 | add a1, a1, a4 208 | add a2, a2, a4 209 | bnez a3, 1b 210 | 2: 211 | vsetvli a1, zero, e32, m1, ta, ma 212 | vmv.v.i v8, 0 213 | slli a4, a6, 2 214 | addi a1, a0, 400 215 | mv a2, a0 216 | vsetvli a3, zero, e32, m1, ta, ma 217 | 3: 218 | vle32.v v9, (a0) 219 | vredsum.vs v9, v9, v8 220 | vmv.x.s t0, v9 221 | sw t0, (a2) 222 | addi a2, a2, 4 223 | add a0, a0, a4 224 | bne a2, a1, 3b 225 | ret 226 | 227 | #endif 228 | 229 | #if MX_N == 4 230 | 231 | .global MX(hist_rvv_dup_entries_) 232 | MX(hist_rvv_dup_entries_): 233 | vsetvli a5, zero, e32, m1, ta, ma 234 | slli a7, a5, 2 235 | beqz a3, 2f 236 | vsetvli zero, a7, e32, m4, ta, ma 237 | vmv.v.x v8, a7 238 | vid.v v12 239 | addi a5, a5, -1 240 | vand.vx v12, v12, a5 241 | vsll.vi v12, v12, 2 242 | li a6, 100 243 | 1: 244 | vsetvli a5, a3, e32, m4, ta, ma 245 | vle32.v v16, (a1) 246 | vle32.v v20, (a2) 247 | 
vfmul.vv v16, v16, v16 248 | vfmacc.vv v16, v20, v20 249 | vfsqrt.v v16, v16 250 | vfcvt.rtz.xu.f.v v16, v16 251 | vminu.vx v16, v16, a6 252 | vmadd.vv v16, v8, v12 253 | vsetvli a4, a5, e32, m1, ta, ma 254 | vluxei32.v v20, (a0), v16 255 | sub a4, a5, a4 256 | vadd.vi v20, v20, 1 257 | vsuxei32.v v20, (a0), v16 258 | vsetvli t0, a4, e32, m1, ta, ma 259 | vluxei32.v v16, (a0), v17 260 | sub a4, a4, t0 261 | vadd.vi v16, v16, 1 262 | vsuxei32.v v16, (a0), v17 263 | vsetvli t0, a4, e32, m1, ta, ma 264 | vluxei32.v v16, (a0), v18 265 | sub a4, a4, t0 266 | vadd.vi v16, v16, 1 267 | vsuxei32.v v16, (a0), v18 268 | vsetvli zero, a4, e32, m1, ta, ma 269 | vluxei32.v v16, (a0), v19 270 | vadd.vi v16, v16, 1 271 | vsuxei32.v v16, (a0), v19 272 | sub a3, a3, a5 273 | slli a5, a5, 2 274 | add a1, a1, a5 275 | add a2, a2, a5 276 | bnez a3, 1b 277 | 2: 278 | vsetvli a1, zero, e32, m1, ta, ma 279 | vmv.v.i v8, 0 280 | addi a1, a0, 400 281 | mv a2, a0 282 | 3: 283 | vsetvli a3, zero, e32, m1, ta, ma 284 | vle32.v v9, (a0) 285 | vredsum.vs v9, v9, v8 286 | vsetivli zero, 1, e32, m1, ta, ma 287 | vse32.v v9, (a2) 288 | addi a2, a2, 4 289 | add a0, a0, a7 290 | bne a2, a1, 3b 291 | ret 292 | 293 | #endif 294 | -------------------------------------------------------------------------------- /bench/hist.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void 4 | hist_scalar(uint32_t *hist, float *x, float *y, size_t n) 5 | { 6 | for (size_t i = 0; i < n; ++i) { 7 | float dist = x[i]*x[i] + y[i]*y[i]; 8 | __asm__ ("fsqrt.s %0, %0\n" : "+f"(dist)); 9 | size_t idx = dist; 10 | idx = idx > 100 ? 100 : dist; 11 | ++hist[idx]; 12 | } 13 | } 14 | 15 | #define IMPLS(f) \ 16 | f(scalar) \ 17 | MX(f, rvv_slidedown) \ 18 | MX(f, rvv_assume_no_conflict) \ 19 | f(rvv_dup_entries_m1) \ 20 | f(rvv_dup_entries_m2) \ 21 | f(rvv_dup_entries_m4) \ 22 | 23 | typedef void Func(uint32_t *hist, float *x, float *y, size_t n); 24 | 25 | #define DECLARE(f) extern Func hist_##f; 26 | IMPLS(DECLARE) 27 | 28 | #define EXTRACT(f) { #f, &hist_##f }, 29 | Impl impls[] = { IMPLS(EXTRACT) }; 30 | 31 | static uint32_t hist[100 * (1<<16>>4)]; 32 | float *inx, *iny; 33 | 34 | void init(void) { 35 | inx = (float*)mem; 36 | iny = (float*)(mem + MAX_MEM/2); 37 | } 38 | 39 | ux checksum(size_t n) { 40 | size_t sum = 0; 41 | for (size_t i = 0; i < 100; ++i) 42 | sum = hist[i]; 43 | return sum <= n; // sanity check for no_conflict 44 | } 45 | 46 | BENCH_BEG(base) { 47 | n /= sizeof(float); 48 | memset(hist, 0, sizeof hist); 49 | float max = 70.71; // approx. 
sqrtf(100*100/2); 50 | for (size_t i = 0; i < n; ++i) { 51 | inx[i] = bench_urandf() * 2 * max - max; 52 | iny[i] = bench_urandf() * 2 * max - max; 53 | } 54 | TIME f(hist, inx, iny, n); 55 | } BENCH_END 56 | 57 | Bench benches[] = { 58 | BENCH( impls, MAX_MEM/2, "hist", bench_base) 59 | }; BENCH_MAIN(benches) 60 | 61 | -------------------------------------------------------------------------------- /bench/mandelbrot.S: -------------------------------------------------------------------------------- 1 | #if 0 2 | 3 | void 4 | mandelbrot_rvv(size_t width, size_t maxIter, uint32_t *res) 5 | { 6 | size_t VL2 = __riscv_vsetvlmax_e32m2(); 7 | vfloat32m2_t v4 = __riscv_vfmv_v_f_f32m2(4, VL2); 8 | vuint32m2_t vid = __riscv_vid_v_u32m2(VL2); 9 | vfloat32m2_t cx, cy, zx, zy, zx2, zy2; 10 | 11 | for (size_t y = 0; y < width; ++y) { 12 | cy = __riscv_vfmv_v_f_f32m2(y, VL2); 13 | cy = __riscv_vfadd(__riscv_vfmul(cy, 2.0f / width, VL2), -1, VL2); 14 | 15 | for (size_t vl, x = 0, n = width; n > 0; n -= vl, res += vl, x += vl) { 16 | vl = __riscv_vsetvl_e32m2(n); 17 | 18 | cx = __riscv_vfcvt_f(__riscv_vadd(vid, x, vl), vl); 19 | cx = __riscv_vfadd(__riscv_vfmul(cx, 2.0f / width, vl), -1.5f, vl); 20 | 21 | size_t iter = 0; 22 | vuint32m2_t viter = __riscv_vmv_v_x_u32m2(0, vl); 23 | vbool16_t mask = __riscv_vmset_m_b16(vl); 24 | zx = zy = zx2 = zy2 = __riscv_vfmv_v_f_f32m2(0, vl); 25 | do { 26 | mask = __riscv_vmflt(__riscv_vfadd(zx2, zy2, vl), v4, vl); 27 | viter = __riscv_vadc(viter, 0, mask, vl); 28 | zy = __riscv_vfmacc(cy, __riscv_vfadd(zx, zx, vl), zy, vl); 29 | zx = __riscv_vfadd(__riscv_vfsub(zx2, zy2, vl), cx, vl); 30 | zx2 = __riscv_vfmul(zx, zx, vl); 31 | zy2 = __riscv_vfmul(zy, zy, vl); 32 | ++iter; 33 | } while (iter < maxIter && __riscv_vfirst(mask, vl) >= 0); 34 | __riscv_vse32(res, viter, vl); 35 | } 36 | } 37 | } 38 | #endif 39 | 40 | #if MX_N > 0 && MX_N <= 2 41 | 42 | #if IF_VF16(1)+0 43 | .global MX(mandelbrot_rvv_f16_) # generated by clang 44 | .balign 2 45 | MX(rvv_f16_m1p5): 46 | .half 0xbe00 # half -1.5 47 | MX(rvv_f16_m1): 48 | .half 0xbc00 # half -1 49 | MX(rvv_f16_p4): 50 | .half 0x4400 # half 4 51 | MX(mandelbrot_rvv_f16_): 52 | beqz a0, 9f 53 | li a6, 0 54 | vsetvli a3, zero, e16, m2, ta, ma 55 | la a3, MX(rvv_f16_p4) 56 | fcvt.s.wu fa5, a0 57 | flh fa4, (a3) 58 | lui a3, 262144 59 | fmv.w.x fa3, a3 60 | la a3, MX(rvv_f16_m1) 61 | fdiv.s fa3, fa3, fa5 62 | flh fa5, (a3) 63 | la a3, MX(rvv_f16_m1p5) 64 | vfmv.v.f v12, fa4 65 | flh fa4, (a3) 66 | addi a3, a1, -1 67 | sltu a1, a1, a3 68 | addi a1, a1, -1 69 | and a1, a1, a3 70 | vid.v v14 71 | fcvt.h.s fa3, fa3 72 | addi a7, a1, 1 73 | j 2f 74 | 1: 75 | addi a6, a6, 1 76 | beq a6, a0, 9f 77 | 2: 78 | li a4, 0 79 | fcvt.h.wu fa2, a6 80 | vsetvli a1, zero, e16, m2, ta, ma 81 | vfmv.v.f v8, fa2 82 | vfmul.vf v8, v8, fa3 83 | vfadd.vf v16, v8, fa5 84 | mv a5, a0 85 | j 4f 86 | 3: 87 | vsetvli zero, zero, e32, m4, ta, ma 88 | vse32.v v8, (a2) 89 | sub a5, a5, t0 90 | slli a1, t0, 2 91 | add a2, a2, a1 92 | add a4, a4, t0 93 | beqz a5, 1b 94 | 4: 95 | vsetvli t0, a5, e16, m2, ta, ma 96 | vadd.vx v8, v14, a4 97 | vfcvt.f.xu.v v8, v8 98 | vfmul.vf v8, v8, fa3 99 | vfadd.vf v18, v8, fa4 100 | vmv.v.i v20, 0 101 | vmv.v.i v22, 0 102 | vmv.v.i v24, 0 103 | vmv.v.i v26, 0 104 | vsetvli zero, zero, e32, m4, ta, ma 105 | vmv.v.i v8, 0 106 | mv a1, a7 107 | 5: 108 | vsetvli zero, zero, e16, m2, ta, ma 109 | vfadd.vv v28, v24, v20 110 | vmflt.vv v0, v28, v12 111 | addi a1, a1, -1 112 | vsetvli zero, zero, e32, m4, ta, ma 113 | vadc.vim v8, v8, 0, 
v0 114 | beqz a1, 3b 115 | vsetvli zero, zero, e16, m2, ta, ma 116 | vfadd.vv v26, v26, v26 117 | vfsub.vv v20, v24, v20 118 | vfmadd.vv v22, v26, v16 119 | vfadd.vv v26, v20, v18 120 | vfmul.vv v20, v22, v22 121 | vfirst.m a3, v0 122 | vfmul.vv v24, v26, v26 123 | bgez a3, 5b 124 | j 3b 125 | 9: 126 | ret 127 | #endif 128 | 129 | .global MX(mandelbrot_rvv_f32_) # generated by clang 130 | MX(mandelbrot_rvv_f32_): 131 | beqz a0, 9f 132 | li a6, 0 133 | vsetvli a3, zero, e32, MX(), ta, ma 134 | lui a3, 264192 135 | fcvt.s.wu fa5, a0 136 | vmv.v.x v8, a3 137 | lui a3, 262144 138 | fmv.w.x fa4, a3 139 | fdiv.s fa5, fa4, fa5 140 | addi a3, a1, -1 141 | sltu a1, a1, a3 142 | addi a1, a1, -1 143 | and a1, a1, a3 144 | lui a3, 784384 145 | fmv.w.x fa4, a3 146 | lui a3, 785408 147 | fmv.w.x fa3, a3 148 | vid.v v10 149 | addi a7, a1, 1 150 | j 2f 151 | 1: 152 | addi a6, a6, 1 153 | beq a6, a0, 9f 154 | 2: 155 | li a4, 0 156 | fcvt.s.wu fa2, a6 157 | vsetvli a1, zero, e32, MX(), ta, ma 158 | vfmv.v.f v12, fa2 159 | vfmul.vf v12, v12, fa5 160 | vfadd.vf v12, v12, fa4 161 | mv a5, a0 162 | j 4f 163 | 3: 164 | vse32.v v14, (a2) 165 | sub a5, a5, t0 166 | slli a1, t0, 2 167 | add a2, a2, a1 168 | add a4, a4, t0 169 | beqz a5, 1b 170 | 4: 171 | vsetvli t0, a5, e32, MX(), ta, ma 172 | vadd.vx v14, v10, a4 173 | vmv.v.i v18, 0 174 | vfcvt.f.xu.v v14, v14 175 | vfmul.vf v14, v14, fa5 176 | vfadd.vf v16, v14, fa3 177 | vmv.v.i v14, 0 178 | mv a1, a7 179 | vmv.v.i v22, 0 180 | vmv.v.i v20, 0 181 | vmv.v.i v24, 0 182 | 5: 183 | vfadd.vv v26, v22, v18 184 | vmflt.vv v0, v26, v8 185 | addi a1, a1, -1 186 | vadc.vim v14, v14, 0, v0 187 | beqz a1, 3b 188 | vfadd.vv v24, v24, v24 189 | vfsub.vv v18, v22, v18 190 | vfmadd.vv v20, v24, v12 191 | vfadd.vv v24, v18, v16 192 | vfmul.vv v18, v20, v20 193 | vfirst.m a3, v0 194 | vfmul.vv v22, v24, v24 195 | bgez a3, 5b 196 | j 3b 197 | 9: 198 | ret 199 | 200 | #if IF_VF64(1)+0 201 | .balign 8 202 | .global MX(mandelbrot_rvv_f64_) # generated by clang 203 | MX(rvv_f64_m1p5): 204 | .quad 0xbff8000000000000 # double -1.5 205 | MX(rvv_f64_m1): 206 | .quad 0xbff0000000000000 # double -1 207 | MX(rvv_f64_p4): 208 | .quad 0x4010000000000000 # double 4 209 | MX(mandelbrot_rvv_f64_): 210 | beqz a0, 9f 211 | li a6, 0 212 | vsetvli a3, zero, e64, m2, ta, ma 213 | la a3, MX(rvv_f64_p4) 214 | fcvt.s.wu fa5, a0 215 | fld fa4, (a3) 216 | lui a3, 262144 217 | fmv.w.x fa3, a3 218 | la a3, MX(rvv_f64_m1) 219 | fdiv.s fa3, fa3, fa5 220 | fld fa5, (a3) 221 | la a3, MX(rvv_f64_m1p5) 222 | vfmv.v.f v8, fa4 223 | fld fa4, (a3) 224 | addi a3, a1, -1 225 | sltu a1, a1, a3 226 | addi a1, a1, -1 227 | and a1, a1, a3 228 | vid.v v10 229 | fcvt.d.s fa3, fa3 230 | addi a7, a1, 1 231 | j 2f 232 | 1: 233 | addi a6, a6, 1 234 | beq a6, a0, 9f 235 | 2: 236 | li a4, 0 237 | fcvt.d.wu fa2, a6 238 | vsetvli a1, zero, e64, m2, ta, ma 239 | vfmv.v.f v12, fa2 240 | vfmul.vf v12, v12, fa3 241 | vfadd.vf v12, v12, fa5 242 | mv a5, a0 243 | j 4f 244 | 3: 245 | vsetvli zero, zero, e32, m1, ta, ma 246 | vse32.v v24, (a2) 247 | sub a5, a5, t0 248 | slli a1, t0, 2 249 | add a2, a2, a1 250 | add a4, a4, t0 251 | beqz a5, 1b 252 | 4: 253 | vsetvli t0, a5, e64, m2, ta, ma 254 | vadd.vx v14, v10, a4 255 | vfcvt.f.xu.v v14, v14 256 | vfmul.vf v14, v14, fa3 257 | vfadd.vf v14, v14, fa4 258 | vmv.v.i v16, 0 259 | vmv.v.i v18, 0 260 | vmv.v.i v20, 0 261 | vmv.v.i v22, 0 262 | vsetvli zero, zero, e32, m1, ta, ma 263 | vmv.v.i v24, 0 264 | mv a1, a7 265 | 5: 266 | vsetvli zero, zero, e64, m2, ta, ma 267 | vfadd.vv v26, v20, v16 
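# inner Mandelbrot iteration, same scheme as the C reference at the top of this file:
# v26 = zx^2 + zy^2 was just computed, v0 flags the lanes still below 4.0 (in v8),
# and vadc.vim increments the 32-bit iteration counts in v24 only for those lanes.
# The remaining ops update zy = 2*zx*zy + cy (cy in v12) and zx = zx^2 - zy^2 + cx
# (cx in v14), then recompute the squares; the loop exits once no lane is active
# (vfirst.m returns -1) or the iteration budget in a1 reaches zero.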
268 | vmflt.vv v0, v26, v8 269 | addi a1, a1, -1 270 | vsetvli zero, zero, e32, m1, ta, ma 271 | vadc.vim v24, v24, 0, v0 272 | beqz a1, 3b 273 | vsetvli zero, zero, e64, m2, ta, ma 274 | vfadd.vv v22, v22, v22 275 | vfsub.vv v16, v20, v16 276 | vfmadd.vv v18, v22, v12 277 | vfadd.vv v22, v16, v14 278 | vfmul.vv v16, v18, v18 279 | vfirst.m a3, v0 280 | vfmul.vv v20, v22, v22 281 | bgez a3, 5b 282 | j 3b 283 | 9: 284 | ret 285 | #endif 286 | 287 | #endif 288 | 289 | 290 | -------------------------------------------------------------------------------- /bench/mandelbrot.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void 4 | mandelbrot_scalar_f32(size_t width, size_t maxIter, uint32_t *res) 5 | { 6 | for (size_t y = 0; y < width; ++y) 7 | for (size_t x = 0; x < width; ++x) { 8 | float cx = x * 2.0f / width - 1.5; 9 | float cy = y * 2.0f / width - 1; 10 | size_t iter = 0; 11 | float zx = 0, zy = 0, zxS = 0, zyS = 0; 12 | 13 | BENCH_VOLATILE_REG(cy); 14 | while (zxS + zyS <= 4 && iter < maxIter) { 15 | zxS = zxS - zyS + cx; 16 | zy = 2 * zx * zy + cy; 17 | zx = zxS; 18 | zxS = zx*zx; 19 | zyS = zy*zy; 20 | ++iter; 21 | } 22 | *res++ = iter; 23 | } 24 | } 25 | 26 | #if __riscv_flen == 64 27 | void 28 | mandelbrot_scalar_f64(size_t width, size_t maxIter, uint32_t *res) 29 | { 30 | for (size_t y = 0; y < width; ++y) 31 | for (size_t x = 0; x < width; ++x) { 32 | double cx = x * 2.0 / width - 1.5; 33 | double cy = y * 2.0 / width - 1; 34 | size_t iter = 0; 35 | double zx = 0, zy = 0, zxS = 0, zyS = 0; 36 | 37 | BENCH_VOLATILE_REG(cy); 38 | while (zxS + zyS <= 4 && iter < maxIter) { 39 | zxS = zxS - zyS + cx; 40 | zy = 2 * zx * zy + cy; 41 | zx = zxS; 42 | zxS = zx*zx; 43 | zyS = zy*zy; 44 | ++iter; 45 | } 46 | *res++ = iter; 47 | } 48 | } 49 | #endif 50 | 51 | #define IMPLS(f) \ 52 | f(scalar_f32) \ 53 | IF_F64(f(scalar_f64)) \ 54 | IF_VF16(f(rvv_f16_m1)) \ 55 | IF_VF16(f(rvv_f16_m2)) \ 56 | f(rvv_f32_m1) \ 57 | f(rvv_f32_m2) \ 58 | IF_VF64(f(rvv_f64_m1)) \ 59 | IF_VF64(f(rvv_f64_m2)) \ 60 | 61 | typedef void Func(size_t width, size_t maxIter, uint32_t *res); 62 | 63 | #define DECLARE(f) extern Func mandelbrot_##f; 64 | IMPLS(DECLARE) 65 | 66 | #define EXTRACT(f) { #f, &mandelbrot_##f }, 67 | Impl impls[] = { IMPLS(EXTRACT) }; 68 | 69 | void init(void) { } 70 | 71 | /* disabled, because of rounding errors, please independently verify */ 72 | ux checksum(size_t n) { 73 | #if 0 74 | double sum = 0; 75 | uint32_t *ptr = (uint32_t*)mem; 76 | n = usqrt(n); 77 | for (size_t i = 0; i < n*n; ++i) 78 | sum += *ptr++; 79 | print("<")(f,sum/(n*n+1))(">"); 80 | #endif 81 | return 0; 82 | } 83 | 84 | BENCH_BEG(base) { 85 | n = usqrt(n); 86 | TIME f(n, mandelbrot_ITER, (uint32_t*)mem); 87 | } BENCH_END 88 | 89 | Bench benches[] = { 90 | BENCH( 91 | impls, 92 | SCALE_mandelbrot(MAX_MEM / 4), 93 | "mandelbrot "STR(mandelbrot_ITER), 94 | bench_base 95 | ) 96 | }; BENCH_MAIN(benches) 97 | 98 | -------------------------------------------------------------------------------- /bench/memcpy.S: -------------------------------------------------------------------------------- 1 | #if 0 2 | void *memcpy_rvv(void *restrict dest, void const *restrict src, size_t n) { 3 | unsigned char *d = dest; 4 | unsigned char const *s = src; 5 | for (size_t vl; n > 0; n -= vl, s += vl, d += vl) { 6 | vl = __riscv_vsetvl_e8m8(n); 7 | vuint8m8_t vec_src = __riscv_vle8_v_u8m8(s, vl); 8 | __riscv_vse8_v_u8m8(d, vec_src, vl); 9 | } 10 | return dest; 11 | } 12 | #endif 13 | 
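#if 0
/* Added sketch, not part of the original file: an intrinsics rendering of the
 * memcpy_rvv_align_dest_ routine below, assuming e8/m8 and a power-of-two VLMAX
 * so that (-dest) & (vlmax-1) gives the number of head bytes needed to align the
 * destination.  The function and variable names here are illustrative only. */
void *memcpy_rvv_align_dest_sketch(void *restrict dest, void const *restrict src, size_t n) {
	unsigned char *d = dest;
	unsigned char const *s = src;
	size_t vlmax = __riscv_vsetvlmax_e8m8();
	if (n >= vlmax) {
		/* copy the unaligned head so the stores in the main loop are vlmax-aligned */
		size_t head = -(uintptr_t)d & (vlmax - 1);
		vuint8m8_t v = __riscv_vle8_v_u8m8(s, head);
		__riscv_vse8_v_u8m8(d, v, head);
		d += head; s += head; n -= head;
	}
	for (size_t vl; n > 0; n -= vl, s += vl, d += vl) {
		/* aligned main loop, with the tail handled by the final short vl */
		vl = __riscv_vsetvl_e8m8(n);
		vuint8m8_t v = __riscv_vle8_v_u8m8(s, vl);
		__riscv_vse8_v_u8m8(d, v, vl);
	}
	return dest;
}
#endif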
14 | 15 | #ifdef MX 16 | 17 | # a0 = dest, a1 = src, a2 = len 18 | .global MX(memcpy_rvv_) 19 | MX(memcpy_rvv_): 20 | mv a3, a0 21 | 1: 22 | vsetvli t0, a2, e8, MX(), ta, ma 23 | vle8.v v0, (a1) 24 | add a1, a1, t0 25 | sub a2, a2, t0 26 | vse8.v v0, (a3) 27 | add a3, a3, t0 28 | bnez a2, 1b 29 | ret 30 | 31 | .global MX(memcpy_rvv_align_dest_) 32 | MX(memcpy_rvv_align_dest_): 33 | mv a3, a0 34 | vsetvli t0, zero, e8, MX(), ta, ma # vlenb 35 | bltu a2, t0, 2f # len < vlenb 36 | # align dest to vlenb 37 | sub t1, zero, a0 38 | addi t2, t0, -1 39 | and t1, t1, t2 #align = (-dest) & (vlenb-1) 40 | vsetvli t0, t1, e8, MX(), ta, ma 41 | 1: 42 | vle8.v v0, (a1) 43 | add a1, a1, t0 44 | sub a2, a2, t0 45 | vse8.v v0, (a3) 46 | add a3, a3, t0 47 | 2: 48 | vsetvli t0, a2, e8, MX(), ta, ma 49 | bnez a2, 1b 50 | ret 51 | 52 | .global MX(memcpy_rvv_align_src_) 53 | MX(memcpy_rvv_align_src_): 54 | mv a3, a0 55 | vsetvli t0, zero, e8, MX(), ta, ma # vlen 56 | bltu a2, t0, 2f # len < vlen 57 | # align src to vlen 58 | sub t1, zero, a1 59 | addi t2, t0, -1 60 | and t1, t1, t2 # align = (-src) & (vlen-1) 61 | vsetvli t0, t1, e8, MX(), ta, ma 62 | 1: 63 | vle8.v v0, (a1) 64 | add a1, a1, t0 65 | sub a2, a2, t0 66 | vse8.v v0, (a3) 67 | add a3, a3, t0 68 | 2: 69 | vsetvli t0, a2, e8, MX(), ta, ma 70 | bnez a2, 1b 71 | ret 72 | 73 | # combination of memcpy_rvv_align_dest and memcpy_rvv 74 | .global MX(memcpy_rvv_align_dest_hybrid_) 75 | MX(memcpy_rvv_align_dest_hybrid_): 76 | mv a3, a0 77 | vsetvli t0, zero, e8, MX(), ta, ma # vlen 78 | slli t1, t0, 8 # skip costly division for more values 79 | bltu a2, t1, 2f # len < vlen 80 | sub t1, zero, a0 81 | addi t2, t0, -1 82 | and t1, t1, t2 # align = (-dest) & (vlen-1) 83 | vsetvli t0, t1, e8, MX(), ta, ma # align dest to vlen 84 | 1: 85 | vle8.v v0, (a1) 86 | add a1, a1, t0 87 | sub a2, a2, t0 88 | vse8.v v0, (a3) 89 | add a3, a3, t0 90 | 2: 91 | vsetvli t0, a2, e8, MX(), ta, ma 92 | bnez a2, 1b 93 | ret 94 | 95 | 96 | .global MX(memcpy_rvv_tail_) 97 | MX(memcpy_rvv_tail_): 98 | vsetvli t0, a2, e8, MX(), ta, ma 99 | remu a3, a2, t0 # tail = n % vlenb 100 | sub a2, a2, a3 # n -= tail 101 | add a4, a0, a2 # end = dest + n 102 | mv a2, a0 # n = dest 103 | 1: 104 | vle8.v v8, (a1) 105 | add a1, a1, t0 # src += vlenb 106 | vse8.v v8, (a2) 107 | add a2, a2, t0 # dest += vlenb 108 | bltu a2, a4, 1b # dest < end 109 | # copy tail 110 | vsetvli zero, a3, e8, MX(), ta, ma 111 | vle8.v v8, (a1) 112 | vse8.v v8, (a2) 113 | ret 114 | 115 | # this is supposed to test how well the implementation handles 116 | # operations with an vl smaller than VLMAX 117 | .global MX(memcpy_rvv_128_) 118 | MX(memcpy_rvv_128_): 119 | li t0, 128/8 120 | bgt a2, t0, 1f 121 | mv t0, a2 122 | 1: 123 | vsetvli t0, t0, e8, MX(), ta, ma 124 | remu a3, a2, t0 # tail = n % vlenb 125 | sub a2, a2, a3 # n -= tail 126 | add a4, a0, a2 # end = dest + n 127 | mv a2, a0 # n = dest 128 | 1: 129 | vle8.v v8, (a1) 130 | add a1, a1, t0 # src += vlenb 131 | vse8.v v8, (a2) 132 | add a2, a2, t0 # dest += vlenb 133 | bltu a2, a4, 1b # dest < end 134 | # copy tail 135 | vsetvli zero, a3, e8, MX(), ta, ma 136 | vle8.v v8, (a1) 137 | vse8.v v8, (a2) 138 | ret 139 | 140 | #endif 141 | 142 | -------------------------------------------------------------------------------- /bench/memcpy.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void * 4 | memcpy_scalar(void *restrict dest, void const *restrict src, size_t n) 5 | { 6 | unsigned char *d = dest; 7 | 
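	/* note (added): BENCH_CLOBBER() below is presumably an optimization barrier from
	 * bench.h that keeps the compiler from autovectorizing this byte loop, in contrast
	 * to the _autovec variant that follows. */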
unsigned char const *s = src; 8 | while (n--) *d++ = *s++, BENCH_CLOBBER(); 9 | return dest; 10 | } 11 | 12 | void * 13 | memcpy_scalar_autovec(void *restrict dest, void const *restrict src, size_t n) 14 | { 15 | unsigned char *d = dest; 16 | unsigned char const *s = src; 17 | while (n--) *d++ = *s++; 18 | return dest; 19 | } 20 | 21 | /* https://git.musl-libc.org/cgit/musl/tree/src/string/memcpy.c */ 22 | void * 23 | memcpy_musl(void *restrict dest, void const *restrict src, size_t n) 24 | { 25 | unsigned char *d = dest; 26 | unsigned char const *s = src; 27 | 28 | #ifdef __GNUC__ 29 | 30 | #if __BYTE_ORDER == __LITTLE_ENDIAN 31 | #define LS >> 32 | #define RS << 33 | #else 34 | #define LS << 35 | #define RS >> 36 | #endif 37 | 38 | typedef uint32_t __attribute__((__may_alias__)) u32; 39 | uint32_t w, x; 40 | 41 | for (; (uintptr_t)s % 4 && n; n--) *d++ = *s++; 42 | 43 | if ((uintptr_t)d % 4 == 0) { 44 | for (; n>=16; s+=16, d+=16, n-=16) { 45 | *(u32 *)(d+0) = *(u32 *)(s+0); 46 | *(u32 *)(d+4) = *(u32 *)(s+4); 47 | *(u32 *)(d+8) = *(u32 *)(s+8); 48 | *(u32 *)(d+12) = *(u32 *)(s+12); 49 | } 50 | if (n&8) { 51 | *(u32 *)(d+0) = *(u32 *)(s+0); 52 | *(u32 *)(d+4) = *(u32 *)(s+4); 53 | d += 8; s += 8; 54 | } 55 | if (n&4) { 56 | *(u32 *)(d+0) = *(u32 *)(s+0); 57 | d += 4; s += 4; 58 | } 59 | if (n&2) { 60 | *d++ = *s++; *d++ = *s++; 61 | } 62 | if (n&1) { 63 | *d = *s; 64 | } 65 | return dest; 66 | } 67 | 68 | if (n >= 32) switch ((uintptr_t)d % 4) { 69 | case 1: 70 | w = *(u32 *)s; 71 | *d++ = *s++; 72 | *d++ = *s++; 73 | *d++ = *s++; 74 | n -= 3; 75 | for (; n>=17; s+=16, d+=16, n-=16) { 76 | x = *(u32 *)(s+1); 77 | *(u32 *)(d+0) = (w LS 24) | (x RS 8); 78 | w = *(u32 *)(s+5); 79 | *(u32 *)(d+4) = (x LS 24) | (w RS 8); 80 | x = *(u32 *)(s+9); 81 | *(u32 *)(d+8) = (w LS 24) | (x RS 8); 82 | w = *(u32 *)(s+13); 83 | *(u32 *)(d+12) = (x LS 24) | (w RS 8); 84 | } 85 | break; 86 | case 2: 87 | w = *(u32 *)s; 88 | *d++ = *s++; 89 | *d++ = *s++; 90 | n -= 2; 91 | for (; n>=18; s+=16, d+=16, n-=16) { 92 | x = *(u32 *)(s+2); 93 | *(u32 *)(d+0) = (w LS 16) | (x RS 16); 94 | w = *(u32 *)(s+6); 95 | *(u32 *)(d+4) = (x LS 16) | (w RS 16); 96 | x = *(u32 *)(s+10); 97 | *(u32 *)(d+8) = (w LS 16) | (x RS 16); 98 | w = *(u32 *)(s+14); 99 | *(u32 *)(d+12) = (x LS 16) | (w RS 16); 100 | } 101 | break; 102 | case 3: 103 | w = *(u32 *)s; 104 | *d++ = *s++; 105 | n -= 1; 106 | for (; n>=19; s+=16, d+=16, n-=16) { 107 | x = *(u32 *)(s+3); 108 | *(u32 *)(d+0) = (w LS 8) | (x RS 24); 109 | w = *(u32 *)(s+7); 110 | *(u32 *)(d+4) = (x LS 8) | (w RS 24); 111 | x = *(u32 *)(s+11); 112 | *(u32 *)(d+8) = (w LS 8) | (x RS 24); 113 | w = *(u32 *)(s+15); 114 | *(u32 *)(d+12) = (x LS 8) | (w RS 24); 115 | } 116 | break; 117 | } 118 | if (n&16) { 119 | *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; 120 | *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; 121 | *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; 122 | *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; 123 | } 124 | if (n&8) { 125 | *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; 126 | *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; 127 | } 128 | if (n&4) { 129 | *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++; 130 | } 131 | if (n&2) { 132 | *d++ = *s++; *d++ = *s++; 133 | } 134 | if (n&1) { 135 | *d = *s; 136 | } 137 | return dest; 138 | #endif 139 | 140 | while (n--) { *d++ = *s++; BENCH_CLOBBER(); } 141 | return dest; 142 | } 143 | 144 | #define memcpy_libc memcpy 145 | 146 | #define IMPLS(f) \ 147 | IFHOSTED(f(libc)) \ 148 | f(musl) \ 149 
| f(scalar) \ 150 | f(scalar_autovec) \ 151 | MX(f, rvv) \ 152 | MX(f, rvv_align_dest) \ 153 | MX(f, rvv_align_src) \ 154 | MX(f, rvv_align_dest_hybrid) \ 155 | MX(f, rvv_tail) \ 156 | MX(f, rvv_128) \ 157 | 158 | typedef void *Func(void *restrict dest, void const *restrict src, size_t n); 159 | 160 | #define DECLARE(f) extern Func memcpy_##f; 161 | IMPLS(DECLARE) 162 | 163 | #define EXTRACT(f) { #f, &memcpy_##f }, 164 | Impl impls[] = { IMPLS(EXTRACT) }; 165 | 166 | uint8_t *dest, *src; 167 | ux last; 168 | 169 | void init(void) { } 170 | 171 | ux checksum(size_t n) { 172 | ux sum = last; 173 | for (size_t i = 0; i < n+9; ++i) 174 | sum = uhash(sum) + dest[i]; 175 | return sum; 176 | } 177 | 178 | void common(size_t n, size_t dOff, size_t sOff) { 179 | dest = mem + dOff; src = dest + MAX_MEM/2 + sOff + 9; 180 | memset(dest, 0, n+9); 181 | } 182 | 183 | BENCH_BEG(base) { 184 | common(n, bench_urand() & 255, bench_urand() & 255); 185 | TIME last = (uintptr_t)f(dest, src, n); 186 | } BENCH_END 187 | 188 | BENCH_BEG(aligned) { 189 | common(n, 0, 0); 190 | TIME last = (uintptr_t)f(dest, src, n); 191 | } BENCH_END 192 | 193 | Bench benches[] = { 194 | BENCH( impls, MAX_MEM/2 - 521, "memcpy", bench_base ), 195 | BENCH( impls, MAX_MEM/2 - 521, "memcpy aligned", bench_aligned ) 196 | }; BENCH_MAIN(benches) 197 | 198 | -------------------------------------------------------------------------------- /bench/memset.S: -------------------------------------------------------------------------------- 1 | #if 0 2 | void *memset(void *dst, int n, size_t len) { 3 | unsigned char *d = dst; 4 | vuint8m8_t v = __riscv_vmv_v_x_u8m8((uint8_t)n, __riscv_vsetvlmax_e8m8()); 5 | for (size_t vl; len > 0; len -= vl, d += vl) { 6 | vl = __riscv_vsetvl_e8m8(len); 7 | __riscv_vse8_v_u8m8(d, v, vl); 8 | } 9 | return dst; 10 | } 11 | #endif 12 | 13 | #ifdef MX 14 | 15 | .global MX(memset_rvv_) 16 | MX(memset_rvv_): 17 | vsetvli a3, zero, e8, MX(), ta, ma 18 | vmv.v.x v8, a1 19 | mv a1, a0 20 | 1: 21 | vsetvli a3, a2, e8, MX(), ta, ma 22 | vse8.v v8, (a1) 23 | sub a2, a2, a3 24 | add a1, a1, a3 25 | bnez a2, 1b 26 | ret 27 | 28 | 29 | .global MX(memset_rvv_align_) 30 | MX(memset_rvv_align_): 31 | vsetvli t0, zero, e8, MX(), ta, ma # vlen 32 | vmv.v.x v8, a1 33 | mv a1, a0 34 | vsetvli t0, zero, e8, MX(), ta, ma # vlen 35 | bltu a2, t0, 2f # len < vlen 36 | # align dest to vlen 37 | sub t1, zero, a0 38 | addi t2, t0, -1 39 | and t1, t1, t2 #align = (-dest) & (vlenb-1) 40 | vsetvli t0, t1, e8, MX(), ta, ma 41 | 1: 42 | vse8.v v8, (a1) 43 | sub a2, a2, t0 44 | add a1, a1, t0 45 | 2: 46 | vsetvli t0, a2, e8, MX(), ta, ma 47 | bnez a2, 1b 48 | ret 49 | 50 | .global MX(memset_rvv_tail_) 51 | MX(memset_rvv_tail_): 52 | vsetvli t0, a2, e8, MX(), ta, ma 53 | vmv.v.x v8, a1 54 | remu a3, a2, t0 # tail = n % vlenb 55 | sub a2, a2, a3 # n -= tail 56 | add a4, a0, a2 # end = dest + n 57 | mv a2, a0 # n = dest 58 | 1: 59 | vse8.v v8, (a2) 60 | add a2, a2, t0 # dest += vlenb 61 | bltu a2, a4, 1b # dest < end 62 | # handle tail 63 | vsetvli zero, a3, e8, MX(), ta, ma 64 | vse8.v v8, (a2) 65 | ret 66 | 67 | .global MX(memset_rvv_tail_4x_) 68 | MX(memset_rvv_tail_4x_): 69 | vsetvli t0, a2, e8, MX(), ta, ma 70 | vmv.v.x v8, a1 71 | slli t1, t0, 2 72 | mv a5, a0 73 | mv a3, a2 74 | bltu a2, t1, 2f 75 | remu a3, a2, t1 # tail = n % (vlenb*4) 76 | sub a2, a2, a3 # n -= tail 77 | add a4, a0, a2 # end = dest + n 78 | 1: 79 | vse8.v v8, (a5) 80 | add a5, a5, t0 # dest += vlenb 81 | vse8.v v8, (a5) 82 | add a5, a5, t0 # dest += vlenb 83 | vse8.v v8, 
(a5) 84 | add a5, a5, t0 # dest += vlenb 85 | vse8.v v8, (a5) 86 | add a5, a5, t0 # dest += vlenb 87 | bltu a5, a4, 1b # dest < end 88 | # handle tail 89 | 2: 90 | vsetvli a4, a3, e8, MX(), ta, ma 91 | vse8.v v8, (a5) 92 | sub a3, a3, a4 93 | add a5, a5, a4 94 | bnez a3, 2b 95 | ret 96 | 97 | #endif 98 | -------------------------------------------------------------------------------- /bench/memset.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void * 4 | memset_scalar(void *dest, int c, size_t n) 5 | { 6 | unsigned char *d = dest; 7 | while (n--) *d++ = c, BENCH_CLOBBER(); 8 | return dest; 9 | } 10 | 11 | void * 12 | memset_scalar_autovec(void *dest, int c, size_t n) 13 | { 14 | unsigned char *d = dest; 15 | while (n--) *d++ = c; 16 | return dest; 17 | } 18 | 19 | /* https://git.musl-libc.org/cgit/musl/tree/src/string/memset.c */ 20 | #if __riscv_xlen != 32 21 | void * 22 | memset_musl(void *dest, int c, size_t n) 23 | { 24 | unsigned char *s = dest; 25 | size_t k; 26 | 27 | /* Fill head and tail with minimal branching. Each 28 | * conditional ensures that all the subsequently used 29 | * offsets are well-defined and in the dest region. */ 30 | 31 | if (!n) return dest; 32 | s[0] = c; 33 | s[n-1] = c; 34 | if (n <= 2) return dest; 35 | s[1] = c; 36 | s[2] = c; 37 | s[n-2] = c; 38 | s[n-3] = c; 39 | if (n <= 6) return dest; 40 | s[3] = c; 41 | s[n-4] = c; 42 | if (n <= 8) return dest; 43 | 44 | /* Advance pointer to align it at a 4-byte boundary, 45 | * and truncate n to a multiple of 4. The previous code 46 | * already took care of any head/tail that get cut off 47 | * by the alignment. */ 48 | 49 | k = -(uintptr_t)s & 3; 50 | s += k; 51 | n -= k; 52 | n &= -4; 53 | 54 | #ifdef __GNUC__ 55 | typedef uint32_t __attribute__((__may_alias__)) u32; 56 | typedef uint64_t __attribute__((__may_alias__)) u64; 57 | 58 | u32 c32 = ((u32)-1)/255 * (unsigned char)c; 59 | 60 | /* In preparation to copy 32 bytes at a time, aligned on 61 | * an 8-byte bounary, fill head/tail up to 28 bytes each. 62 | * As in the initial byte-based head/tail fill, each 63 | * conditional below ensures that the subsequent offsets 64 | * are valid (e.g. !(n<=24) implies n>=28). */ 65 | 66 | *(u32 *)(s+0) = c32; 67 | *(u32 *)(s+n-4) = c32; 68 | if (n <= 8) return dest; 69 | *(u32 *)(s+4) = c32; 70 | *(u32 *)(s+8) = c32; 71 | *(u32 *)(s+n-12) = c32; 72 | *(u32 *)(s+n-8) = c32; 73 | if (n <= 24) return dest; 74 | *(u32 *)(s+12) = c32; 75 | *(u32 *)(s+16) = c32; 76 | *(u32 *)(s+20) = c32; 77 | *(u32 *)(s+24) = c32; 78 | *(u32 *)(s+n-28) = c32; 79 | *(u32 *)(s+n-24) = c32; 80 | *(u32 *)(s+n-20) = c32; 81 | *(u32 *)(s+n-16) = c32; 82 | 83 | /* Align to a multiple of 8 so we can fill 64 bits at a time, 84 | * and avoid writing the same bytes twice as much as is 85 | * practical without introducing additional branching. */ 86 | 87 | k = 24 + ((uintptr_t)s & 4); 88 | s += k; 89 | n -= k; 90 | 91 | /* If this loop is reached, 28 tail bytes have already been 92 | * filled, so any remainder when n drops below 32 can be 93 | * safely ignored. */ 94 | 95 | u64 c64 = c32 | ((u64)c32 << 32); 96 | for (; n >= 32; n-=32, s+=32) { 97 | *(u64 *)(s+0) = c64; 98 | *(u64 *)(s+8) = c64; 99 | *(u64 *)(s+16) = c64; 100 | *(u64 *)(s+24) = c64; 101 | } 102 | #else 103 | /* Pure C fallback with no aliasing violations. 
*/ 104 | while (n--) *s++ = c; 105 | #endif 106 | 107 | return dest; 108 | } 109 | #endif 110 | 111 | #define memset_libc memset 112 | 113 | #define IMPLS(f) \ 114 | IFHOSTED(f(libc)) \ 115 | IF64(f(musl)) \ 116 | f(scalar) \ 117 | f(scalar_autovec) \ 118 | MX(f, rvv) \ 119 | MX(f, rvv_align) \ 120 | MX(f, rvv_tail) \ 121 | MX(f, rvv_tail_4x) \ 122 | 123 | typedef void *Func(void *dest, int c, size_t n); 124 | 125 | #define DECLARE(f) extern Func memset_##f; 126 | IMPLS(DECLARE) 127 | 128 | #define EXTRACT(f) { #f, &memset_##f }, 129 | Impl impls[] = { IMPLS(EXTRACT) }; 130 | 131 | uint8_t *dest; 132 | ux last; 133 | char c; 134 | 135 | void init(void) { c = bench_urand(); } 136 | 137 | ux checksum(size_t n) { 138 | ux sum = last; 139 | for (size_t i = 0; i < n+9; ++i) 140 | sum = uhash(sum) + dest[i]; 141 | return sum; 142 | } 143 | 144 | void common(size_t n, size_t off) { 145 | dest = mem + off; 146 | memset(dest, c+3, n+9); 147 | } 148 | 149 | BENCH_BEG(base) { 150 | common(n, bench_urand() & 511); 151 | TIME last = (uintptr_t)f(dest, c, n); 152 | } BENCH_END 153 | 154 | BENCH_BEG(aligned) { 155 | common(n, 0); 156 | TIME last = (uintptr_t)f(dest, c, n); 157 | } BENCH_END 158 | 159 | Bench benches[] = { 160 | BENCH( impls, MAX_MEM - 521, "memset", bench_base ), 161 | BENCH( impls, MAX_MEM - 521, "memset aligned", bench_aligned ) 162 | }; BENCH_MAIN(benches) 163 | 164 | -------------------------------------------------------------------------------- /bench/mergelines.S: -------------------------------------------------------------------------------- 1 | #if 0 2 | size_t 3 | mergelines_rvv_vslide(char *str, size_t len) 4 | { 5 | uint8_t *dest = (uint8_t*)str; 6 | uint8_t *src = (uint8_t*)str; 7 | char last = 0; 8 | 9 | vuint8m8_t v, u, d; 10 | vbool1_t m; 11 | 12 | for (size_t vl, VL; len > 1; ) { 13 | VL = vl = __riscv_vsetvl_e8m8(len); 14 | 15 | char next = len > vl ? 
src[vl] : 0; 16 | v = __riscv_vle8_v_u8m8(src, vl); 17 | u = __riscv_vslide1up_vx_u8m8(v, last, vl); 18 | d = __riscv_vslide1down_vx_u8m8(v, next, vl); 19 | 20 | m = __riscv_vmor_mm_b1(__riscv_vmsne_vx_u8m8_b1(u, '\\', vl), __riscv_vmsne_vx_u8m8_b1(v, '\n', vl), vl); 21 | #if DO_SKIP 22 | if (likely(__riscv_vcpop_m_b1(m, vl) == vl && next != '\n')) 23 | goto skip; 24 | #endif 25 | m = __riscv_vmand_mm_b1( 26 | m, 27 | __riscv_vmor_mm_b1(__riscv_vmsne_vx_u8m8_b1(v, '\\', vl), __riscv_vmsne_vx_u8m8_b1(d, '\n', vl), vl), 28 | vl); 29 | 30 | v = __riscv_vcompress_vm_u8m8(v, m, vl); 31 | vl = __riscv_vcpop_m_b1(m, vl); 32 | skip: 33 | __riscv_vse8_v_u8m8(dest, v, vl); 34 | dest += vl; src += VL; len -= VL; 35 | last = src[-1]; 36 | } 37 | 38 | if (len > 0 && !(last == '\\' && *src == '\n')) *dest++ = *src++; 39 | return (dest - (uint8_t*)str); 40 | } 41 | 42 | size_t 43 | mergelines_rvv_mshift(char *str, size_t count) 44 | { 45 | if (count < 2) return count; 46 | uint8_t *dest = (uint8_t*)str; 47 | uint8_t *src = 1+(uint8_t*)str; 48 | char last = src[-1]; 49 | size_t len = count-1; 50 | 51 | vuint8m8_t v, u, d; 52 | vbool1_t m; 53 | 54 | for (size_t vl, VL; len > 0; dest += vl, src += VL, len -= VL, last = src[-1]) { 55 | vl = VL = __riscv_vsetvl_e8m8(len); 56 | 57 | v = __riscv_vle8_v_u8m8(src, vl); 58 | u = __riscv_vslide1up_vx_u8m8(v, last, vl); 59 | 60 | m = __riscv_vmor_mm_b1( 61 | __riscv_vmsne_vx_u8m8_b1(u, '\\', vl), 62 | __riscv_vmsne_vx_u8m8_b1(v, '\n', vl), vl); 63 | #if DO_SKIP 64 | if (__riscv_vcpop_m_b1(m, vl) == vl) goto skip; 65 | #endif 66 | 67 | vuint8m1_t m1 = __riscv_vreinterpret_v_b1_u8m1(m); 68 | size_t vlmax8 = __riscv_vsetvlmax_e8m1(); 69 | m1 = __riscv_vor_vv_u8m1( 70 | __riscv_vsrl_vx_u8m1(__riscv_vslide1up_vx_u8m1(m1, 0xFF, vlmax8), 7, vlmax8), 71 | __riscv_vsll_vx_u8m1(m1, 1, vlmax8), vlmax8); 72 | m = __riscv_vmand_mm_b1(m, __riscv_vreinterpret_v_u8m1_b1(m1), vl); 73 | 74 | u = __riscv_vcompress_vm_u8m8(u, m, vl); 75 | 76 | vl = __riscv_vcpop_m_b1(m, vl); 77 | VL += (VL^vl)&1&(VL < len); // missing bit in mask, so skip 1 78 | skip: 79 | __riscv_vse8_v_u8m8(dest, u, vl); 80 | } 81 | if (count > 1 && !(src[-2] == '\\' && src[-1] == '\n')) *dest++ = last; 82 | return (dest - (uint8_t*)str); 83 | } 84 | 85 | #endif 86 | 87 | #ifdef MX 88 | 89 | .global MX(mergelines_rvv_vslide_) # generated by clang 90 | MX(mergelines_rvv_vslide_): 91 | li a2, 2 92 | bltu a1, a2, MX(rvv_6) 93 | li t0, 0 94 | li a7, 92 95 | li a6, 1 96 | mv a2, a0 97 | mv a4, a0 98 | j MX(rvv_4) 99 | MX(rvv_2): 100 | add a3, a4, a5 101 | lbu t1, 0(a3) 102 | MX(rvv_3): 103 | vle8.v v8, (a4) 104 | add a3, a4, a5 105 | vslide1up.vx v16, v8, t0 106 | vslide1down.vx v24, v8, t1 107 | vmsne.vx v0, v16, a7 108 | vmsne.vi v16, v8, 10 109 | vmor.mm v16, v0, v16 110 | vmsne.vx v17, v8, a7 111 | vmsne.vi v18, v24, 10 112 | vmor.mm v17, v17, v18 113 | vmand.mm v16, v16, v17 114 | vcompress.vm v24, v8, v16 115 | vcpop.m a4, v16 116 | vsetvli zero, a4, e8, MX(), ta, ma 117 | vse8.v v24, (a2) 118 | lbu t0, -1(a3) 119 | sub a1, a1, a5 120 | add a2, a2, a4 121 | mv a4, a3 122 | bgeu a6, a1, MX(rvv_8) 123 | MX(rvv_4): 124 | vsetvli a5, a1, e8, MX(), ta, ma 125 | bltu a5, a1, MX(rvv_2) 126 | li t1, 0 127 | j MX(rvv_3) 128 | MX(rvv_6): 129 | mv a2, a0 130 | beqz a1, MX(rvv_10) 131 | lbu a1, 0(a0) 132 | mv a2, a0 133 | j MX(rvv_11) 134 | MX(rvv_8): 135 | beqz a1, MX(rvv_10) 136 | lbu a1, 0(a3) 137 | xori a3, t0, 92 138 | xori a4, a1, 10 139 | or a3, a3, a4 140 | bnez a3, MX(rvv_11) 141 | MX(rvv_10): 142 | sub a0, a2, a0 143 
| ret 144 | MX(rvv_11): 145 | addi a3, a2, 1 146 | sb a1, 0(a2) 147 | sub a0, a3, a0 148 | ret 149 | 150 | 151 | .global MX(mergelines_rvv_vslide_skip_) # generated by clang 152 | MX(mergelines_rvv_vslide_skip_): 153 | li a2, 2 154 | bltu a1, a2, MX(rvv_skip_9) 155 | li a5, 0 156 | li a6, 92 157 | li a7, 1 158 | mv t1, a0 159 | mv a3, a0 160 | MX(rvv_skip_2): 161 | vsetvli a4, a1, e8, MX(), ta, ma 162 | bgeu a4, a1, MX(rvv_skip_4) 163 | add a2, a3, a4 164 | lbu t0, 0(a2) 165 | j MX(rvv_skip_5) 166 | MX(rvv_skip_4): 167 | li t0, 0 168 | MX(rvv_skip_5): 169 | vle8.v v8, (a3) 170 | vslide1up.vx v16, v8, a5 171 | vmsne.vx v24, v16, a6 172 | vmsne.vi v16, v8, 10 173 | vmor.mm v16, v24, v16 174 | vcpop.m a2, v16 175 | xor a2, a2, a4 176 | seqz a2, a2 177 | addi a5, t0, -10 178 | snez a5, a5 179 | and a2, a2, a5 180 | beqz a2, MX(rvv_skip_8) 181 | mv a2, a4 182 | MX(rvv_skip_7): 183 | add a3, a3, a4 184 | vsetvli zero, a2, e8, MX(), ta, ma 185 | vse8.v v8, (t1) 186 | lbu a5, -1(a3) 187 | sub a1, a1, a4 188 | add t1, t1, a2 189 | bltu a7, a1, MX(rvv_skip_2) 190 | j MX(rvv_skip_11) 191 | MX(rvv_skip_8): 192 | vslide1down.vx v24, v8, t0 193 | vmsne.vx v17, v8, a6 194 | vmsne.vi v18, v24, 10 195 | vmor.mm v17, v17, v18 196 | vmand.mm v16, v16, v17 197 | vcompress.vm v24, v8, v16 198 | vcpop.m a2, v16 199 | vmv.v.v v8, v24 200 | j MX(rvv_skip_7) 201 | MX(rvv_skip_9): 202 | mv t1, a0 203 | beqz a1, MX(rvv_skip_13) 204 | lbu a1, 0(a0) 205 | mv t1, a0 206 | j MX(rvv_skip_14) 207 | MX(rvv_skip_11): 208 | beqz a1, MX(rvv_skip_13) 209 | lbu a1, 0(a3) 210 | xori a2, a5, 92 211 | xori a3, a1, 10 212 | or a2, a2, a3 213 | bnez a2, MX(rvv_skip_14) 214 | MX(rvv_skip_13): 215 | sub a0, t1, a0 216 | ret 217 | MX(rvv_skip_14): 218 | addi a2, t1, 1 219 | sb a1, 0(t1) 220 | sub a0, a2, a0 221 | ret 222 | 223 | .global MX(mergelines_rvv_mshift_) 224 | MX(mergelines_rvv_mshift_): 225 | li a2, 2 226 | bltu a1, a2, 1f 227 | addi a2, a0, 1 228 | addi a3, a1, -1 229 | lbu a4, 0(a0) 230 | li a6, 92 231 | vsetvli a1, zero, e8, MXf8e8(), ta, ma 232 | li a7, -1 233 | mv t0, a0 234 | 2: 235 | vsetvli a5, a3, e8, MX(), ta, ma 236 | vle8.v v16, (a2) 237 | vslide1up.vx v8, v16, a4 238 | vmsne.vx v24, v8, a6 239 | vmsne.vi v25, v16, 10 240 | vmor.mm v16, v24, v25 241 | vsetvli a4, zero, e8, MXf8e8(), ta, ma 242 | vslide1up.vx v17, v16, a7 243 | vsrl.vi v17, v17, 7 244 | vadd.vv v18, v16, v16 245 | vor.vv v17, v17, v18 246 | vsetvli zero, a5, e8, MX(), ta, ma 247 | vmand.mm v16, v16, v17 248 | vcompress.vm v24, v8, v16 249 | vcpop.m a1, v16 250 | xor t1, a1, a5 251 | sltu a4, a5, a3 252 | and a4, a4, t1 253 | add a5, a5, a4 254 | vsetvli zero, a1, e8, MX(), ta, ma 255 | vse8.v v24, (t0) 256 | add a2, a2, a5 257 | lbu a4, -1(a2) 258 | sub a3, a3, a5 259 | add t0, t0, a1 260 | bnez a3, 2b 261 | lbu a1, -2(a2) 262 | li a3, 92 263 | bne a1, a3, 3f 264 | lbu a1, -1(a2) 265 | li a2, 10 266 | beq a1, a2, 4f 267 | 3: 268 | addi a1, t0, 1 269 | sb a4, 0(t0) 270 | mv t0, a1 271 | 4: 272 | sub a1, t0, a0 273 | 1: 274 | mv a0, a1 275 | ret 276 | 277 | .global MX(mergelines_rvv_mshift_skip_) 278 | MX(mergelines_rvv_mshift_skip_): 279 | li a2, 2 280 | bltu a1, a2, 1f 281 | addi a2, a0, 1 282 | addi a3, a1, -1 283 | lbu t0, 0(a0) 284 | li a7, 92 285 | vsetvli a1, zero, e8, MXf8e8(), ta, ma 286 | li a6, -1 287 | mv a5, a0 288 | j 4f 289 | 2: 290 | vsetvli a4, zero, e8, MXf8e8(), ta, ma 291 | vslide1up.vx v17, v16, a6 292 | vsrl.vi v17, v17, 7 293 | vadd.vv v18, v16, v16 294 | vor.vv v17, v17, v18 295 | vsetvli zero, a1, e8, MX(), ta, ma 296 | 
vmand.mm v16, v16, v17 297 | vcompress.vm v24, v8, v16 298 | vcpop.m t0, v16 299 | xor t1, t0, a1 300 | sltu a4, a1, a3 301 | and a4, a4, t1 302 | add a4, a4, a1 303 | mv a1, t0 304 | vmv.v.v v8, v24 305 | 3: 306 | vsetvli zero, a1, e8, MX(), ta, ma 307 | vse8.v v8, (a5) 308 | add a2, a2, a4 309 | lbu t0, -1(a2) 310 | sub a3, a3, a4 311 | add a5, a5, a1 312 | beqz a3, 5f 313 | 4: 314 | vsetvli a1, a3, e8, MX(), ta, ma 315 | vle8.v v16, (a2) 316 | vslide1up.vx v8, v16, t0 317 | vmsne.vx v24, v8, a7 318 | vmsne.vi v25, v16, 10 319 | vmor.mm v16, v24, v25 320 | vcpop.m a4, v16 321 | bne a4, a1, 2b 322 | mv a4, a1 323 | j 3b 324 | 5: 325 | lbu a1, -2(a2) 326 | li a3, 92 327 | bne a1, a3, 6f 328 | lbu a1, -1(a2) 329 | li a2, 10 330 | beq a1, a2, 7f 331 | 6: 332 | addi a1, a5, 1 333 | sb t0, 0(a5) 334 | mv a5, a1 335 | 7: 336 | sub a1, a5, a0 337 | 1: 338 | mv a0, a1 339 | ret 340 | 341 | 342 | #endif 343 | -------------------------------------------------------------------------------- /bench/mergelines.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | size_t 4 | mergelines_scalar(char *str, size_t len) 5 | { 6 | char *dest = str; 7 | char *src = str; 8 | 9 | while (len > 1) { 10 | if (src[0] == '\\' && src[1] == '\n') 11 | src += 2, len -= 2; 12 | else 13 | *dest++ = *src++, --len; 14 | BENCH_CLOBBER(); 15 | } 16 | if (len > 0) 17 | *dest++ = *src++; 18 | return dest - str; 19 | } 20 | 21 | #define IMPLS(f) \ 22 | f(scalar) \ 23 | MX(f, rvv_vslide) \ 24 | MX(f, rvv_vslide_skip) \ 25 | MX(f, rvv_mshift) \ 26 | MX(f, rvv_mshift_skip) \ 27 | 28 | typedef size_t Func(char *buf, size_t len); 29 | 30 | #define DECLARE(f) extern Func mergelines_##f; 31 | IMPLS(DECLARE) 32 | 33 | #define EXTRACT(f) { #f, &mergelines_##f }, 34 | Impl impls[] = { IMPLS(EXTRACT) }; 35 | 36 | char *str; 37 | ux last; 38 | 39 | void init(void) { } 40 | ux checksum(size_t n) { return last; } 41 | 42 | void common(size_t n, char const *chars, size_t nChars) { 43 | str = (char*)mem + (bench_urand() & 255); 44 | for (size_t i = 0; i < n; ++i) 45 | str[i] = chars[bench_urand() % nChars]; 46 | } 47 | 48 | BENCH_BEG(2_3) { 49 | common(n, "\\\na", 3); 50 | TIME last = (uintptr_t)f(str, n); 51 | } BENCH_END 52 | 53 | BENCH_BEG(2_16) { 54 | common(n, "\\\nabcdefgh", 16); 55 | TIME last = (uintptr_t)f(str, n); 56 | } BENCH_END 57 | 58 | BENCH_BEG(2_32) { 59 | common(n, "\\\nabcdefgh123456789", 32); 60 | TIME last = (uintptr_t)f(str, n); 61 | } BENCH_END 62 | 63 | BENCH_BEG(2_256) { 64 | str = (char*)mem + (bench_urand() & 255); 65 | for (size_t i = 0; i < n; ++i) 66 | str[i] = bench_urand() & 0xff; 67 | TIME last = (uintptr_t)f(str, n); 68 | } BENCH_END 69 | 70 | #define COUNT SCALE_mergelines(MAX_MEM) - 256 71 | Bench benches[] = { 72 | BENCH( impls, COUNT, "mergelines 2/3", bench_2_3 ), 73 | BENCH( impls, COUNT, "mergelines 2/16", bench_2_16 ), 74 | BENCH( impls, COUNT, "mergelines 2/32", bench_2_32 ), 75 | BENCH( impls, COUNT, "mergelines 2/256", bench_2_256 ) 76 | }; BENCH_MAIN(benches) 77 | 78 | -------------------------------------------------------------------------------- /bench/poly1305.S: -------------------------------------------------------------------------------- 1 | #ifndef MX 2 | #if __riscv_xlen != 32 3 | #include "../thirdparty/rvv-chacha-poly/vpoly.s" 4 | #endif 5 | #endif 6 | -------------------------------------------------------------------------------- /bench/poly1305.c: 
-------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | #if __riscv_xlen != 32 3 | #include "../thirdparty/rvv-chacha-poly/boring.h" 4 | 5 | uint8_t *src; 6 | uint8_t key[32], sig[16]; 7 | 8 | extern uint64_t 9 | vector_poly1305(const uint8_t* in, size_t len, 10 | const uint8_t key[32], uint8_t sig[16]); 11 | 12 | static void 13 | poly1305_boring(void const *src, size_t n) { 14 | poly1305_state state; 15 | boring_poly1305_init(&state, key); 16 | boring_poly1305_update(&state, src, n); 17 | boring_poly1305_finish(&state, sig); 18 | } 19 | 20 | static void 21 | poly1305_rvv(void const *src, size_t n) { 22 | vector_poly1305(src, n, key, sig); 23 | } 24 | 25 | typedef void *Func(void const *src, size_t n); 26 | 27 | Impl impls[] = { 28 | { "boring", &poly1305_boring }, 29 | IF_VE64({ "rvv", &poly1305_rvv },) 30 | }; 31 | 32 | void init(void) { 33 | bench_memrand(key, sizeof key); 34 | bench_memrand(sig, sizeof sig); 35 | } 36 | 37 | ux checksum(size_t n) { 38 | ux sum = 0; 39 | for (size_t i = 0; i < ARR_LEN(sig); ++i) 40 | sum = uhash(sum) + sig[i]; 41 | return sum; 42 | } 43 | 44 | BENCH_BEG(aligned) { 45 | for (size_t i = 0; i < 256; ++i) 46 | mem[bench_urand()%n] = bench_urand(); 47 | n = (15+n) & -16; 48 | TIME f(mem, n); 49 | } BENCH_END 50 | 51 | Bench benches[] = { 52 | BENCH( impls, MAX_MEM, "poly1305 aligned", bench_aligned ) 53 | }; BENCH_MAIN(benches) 54 | 55 | 56 | #include "../thirdparty/rvv-chacha-poly/boring.c" 57 | #else 58 | void init(void) {} 59 | Impl impls[] = {}; 60 | Bench benches[] = {}; 61 | BENCH_MAIN(benches) 62 | #endif 63 | -------------------------------------------------------------------------------- /bench/strlen.S: -------------------------------------------------------------------------------- 1 | #if 0 2 | size_t strlen_rvv(char *src) { 3 | size_t vlmax = __riscv_vsetvlmax_e8m8(); 4 | char *p = src; 5 | long first = -1; 6 | size_t vl; 7 | while (first < 0) { 8 | vuint8m8_t v = __riscv_vle8ff_v_u8m8((uint8_t*)p, &vl, vlmax); 9 | first = __riscv_vfirst_m_b1(__riscv_vmseq_vx_u8m8_b1(v, 0, vl), vl); 10 | p += vl; 11 | } 12 | p -= vl - first; 13 | return (size_t)(p - src); 14 | } 15 | 16 | #define PAGE_SIZE 4096 17 | size_t strlen_rvv_page_aligned_(char *src) { 18 | char *p = src; 19 | long first = 0; 20 | 21 | size_t n = 0 - ((uintptr_t)src | -4096); 22 | size_t vl; 23 | for (; n > 0; n -= vl) { 24 | vl = __riscv_vsetvl_e8m8(n); 25 | vuint8m8_t v = __riscv_vle8_v_u8m8((uint8_t*)p, vl); 26 | first = __riscv_vfirst_m_b1(__riscv_vmseq_vx_u8m8_b1(v, 0, vl), vl); 27 | p += vl; 28 | if (first >= 0) { 29 | goto end; 30 | } 31 | } 32 | vl = __riscv_vsetvlmax_e8m8(); 33 | do { 34 | vuint8m8_t v = __riscv_vle8_v_u8m8((uint8_t*)p, vl); 35 | first = __riscv_vfirst_m_b1(__riscv_vmseq_vx_u8m8_b1(v, 0, vl), vl); 36 | p += vl; 37 | } while (first < 0); 38 | end: 39 | p -= vl - first; 40 | return (size_t)(p - src); 41 | } 42 | #endif 43 | 44 | 45 | #ifdef MX 46 | 47 | .global MX(strlen_rvv_) 48 | MX(strlen_rvv_): 49 | mv a3, a0 50 | 1: 51 | vsetvli a1, x0, e8, MX(), ta, ma 52 | vle8ff.v v8, (a3) 53 | csrr a1, vl 54 | vmseq.vi v0, v8, 0 55 | vfirst.m a2, v0 56 | add a3, a3, a1 # end += vl 57 | bltz a2, 1b 58 | add a0, a0, a1 # start += vl 59 | add a3, a3, a2 # end += idx 60 | sub a0, a3, a0 # start - end 61 | ret 62 | 63 | .global MX(strlen_rvv_page_aligned_) # generated by clang 64 | MX(strlen_rvv_page_aligned_): 65 | lui a1, 1048575 66 | or a1, a1, a0 67 | neg a4, a1 68 | mv a1, a0 69 | 1: 70 | vsetvli a2, a4, e8, MX(), ta, ma 
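# a4 was set above to the number of bytes left in the current 4 KiB page, so the
# plain vle8.v loads in this first loop never cross the page boundary and cannot
# fault past the end of the string; once the pointer reaches the page boundary,
# the second loop below loads whole VLMAX chunks, which also stay within a page
# as long as VLMAX <= 4096.  This avoids the fault-only-first vle8ff.v loads
# used by strlen_rvv_ above.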
71 | vle8.v v8, (a1) 72 | vmseq.vi v16, v8, 0 73 | vfirst.m a3, v16 74 | add a1, a1, a2 75 | bgez a3, 1f 76 | sub a4, a4, a2 77 | bnez a4, 1b 78 | vsetvli a2, zero, e8, MX(), ta, ma 79 | 2: 80 | vle8.v v8, (a1) 81 | vmseq.vi v16, v8, 0 82 | vfirst.m a3, v16 83 | add a1, a1, a2 84 | bltz a3, 2b 85 | 1: 86 | sub a1, a1, a2 87 | sub a0, a3, a0 88 | add a0, a0, a1 89 | ret 90 | 91 | #endif 92 | -------------------------------------------------------------------------------- /bench/strlen.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | size_t 4 | strlen_scalar(char const *s) 5 | { 6 | char const *a = s; 7 | while (*s) ++s, BENCH_CLOBBER(); 8 | return s - a; 9 | } 10 | 11 | size_t 12 | strlen_scalar_autovec(char const *s) 13 | { 14 | char const *a = s; 15 | while (*s) ++s; 16 | return s - a; 17 | } 18 | 19 | /* https://git.musl-libc.org/cgit/musl/tree/src/string/strlen.c */ 20 | #define ONES ((size_t)-1/UCHAR_MAX) 21 | #define HIGHS (ONES * (UCHAR_MAX/2+1)) 22 | #define HASZERO(x) (((x)-ONES) & ~(x) & HIGHS) 23 | size_t 24 | strlen_musl(char const *s) 25 | { 26 | char const *a = s; 27 | #ifdef __GNUC__ 28 | typedef size_t __attribute__((__may_alias__)) word; 29 | word const *w; 30 | for (; (uintptr_t)s % sizeof *w; s++) if (!*s) return s-a; 31 | for (w = (void const*)s; !HASZERO(*w); w++); 32 | s = (void const*)w; 33 | #endif 34 | for (; *s; s++); 35 | return s-a; 36 | } 37 | 38 | #define strlen_libc strlen 39 | 40 | #define IMPLS(f) \ 41 | f(scalar) \ 42 | f(scalar_autovec) \ 43 | IFHOSTED(f(libc)) \ 44 | f(musl) \ 45 | MX(f, rvv_page_aligned) \ 46 | MX(f, rvv) \ 47 | 48 | 49 | typedef size_t Func(char const *s); 50 | 51 | #define DECLARE(f) extern Func strlen_##f; 52 | IMPLS(DECLARE) 53 | 54 | #define EXTRACT(f) { #f, &strlen_##f }, 55 | Impl impls[] = { IMPLS(EXTRACT) }; 56 | 57 | ux last; 58 | 59 | void init(void) { 60 | for (size_t i = 0; i < MAX_MEM; ++i) 61 | mem[i] += !mem[i]; // remove null bytes 62 | } 63 | 64 | ux checksum(size_t n) { return last; } 65 | 66 | BENCH_BEG(base) { 67 | char *p = (char*)mem + (bench_urand() % 511); 68 | p[n] = 0; 69 | TIME last = f(p); 70 | p[n] = bench_urand() | 1; 71 | } BENCH_END 72 | 73 | Bench benches[] = { 74 | BENCH( impls, MAX_MEM - 521, "strlen", bench_base ), 75 | }; BENCH_MAIN(benches) 76 | 77 | -------------------------------------------------------------------------------- /bench/template.S: -------------------------------------------------------------------------------- 1 | #define NOLIBC_DEFINE_ONLY 2 | #include "../nolibc.h" 3 | #include "config.h" 4 | .text 5 | .balign 8 6 | 7 | #define CAT_(a,b) a##b 8 | #define CAT(a,b) CAT_(a,b) 9 | 10 | #define STR(x) #x 11 | #define STRe(x) STR(x) 12 | 13 | #define MX_N 0 14 | #include STRe(INC) 15 | 16 | #undef MX_N 17 | 18 | #define MX_N 1 19 | #define MX8(x) x##m8 20 | #define MX4(x) x##m4 21 | #define MX2(x) x##m2 22 | #define MX(x) x##m1 23 | #define MXf2(x) x##mf2 24 | #define MXf4(x) x##mf4 25 | #if __riscv_v_elen >= 64 26 | #define MXf8e8(x) x##mf8 27 | #else 28 | #define MXf8e8(x) x##mf4 29 | #endif 30 | #include STRe(INC) 31 | 32 | #undef MX_N 33 | #undef MX8 34 | #undef MX4 35 | #undef MX2 36 | #undef MX 37 | #undef MXf2 38 | #undef MXf4 39 | #undef MXf8 40 | #undef MXf8e8 41 | 42 | #define MX_N 2 43 | #define MX4(x) x##m8 44 | #define MX2(x) x##m4 45 | #define MX(x) x##m2 46 | #define MXf2(x) x##m1 47 | #define MXf4(x) x##mf2 48 | #define MXf8(x) x##mf4 49 | #define MXf8e8(x) x##mf4 50 | #include STRe(INC) 51 | 52 | #undef 
MX_N 53 | #undef MX4 54 | #undef MX2 55 | #undef MX 56 | #undef MXf2 57 | #undef MXf4 58 | #undef MXf8 59 | #undef MXf8e8 60 | 61 | #define MX_N 4 62 | #define MX2(x) x##m8 63 | #define MX(x) x##m4 64 | #define MXf2(x) x##m2 65 | #define MXf4(x) x##m1 66 | #define MXf8(x) x##mf2 67 | #define MXf8e8(x) x##mf2 68 | #include STRe(INC) 69 | 70 | #undef MX_N 71 | #undef MX2 72 | #undef MX 73 | #undef MXf2 74 | #undef MXf4 75 | #undef MXf8 76 | #undef MXf8e8 77 | 78 | #define MX_N 8 79 | #define MX(x) x##m8 80 | #define MXf2(x) x##m4 81 | #define MXf4(x) x##m2 82 | #define MXf8(x) x##m1 83 | #define MXf8e8(x) x##m1 84 | #include STRe(INC) 85 | 86 | -------------------------------------------------------------------------------- /bench/utf8_count.S: -------------------------------------------------------------------------------- 1 | #if 0 2 | size_t utf8_count_rvv(char const *buf, size_t len) { 3 | size_t sum = 0; 4 | for (size_t vl; len > 0; len -= vl, buf += vl) { 5 | vl = __riscv_vsetvl_e8m8(len); 6 | vint8m8_t v = __riscv_vle8_v_i8m8((void*)buf, vl); 7 | vbool1_t mask = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl); 8 | sum += __riscv_vcpop_m_b1(mask, vl); 9 | } 10 | return sum; 11 | } 12 | #endif 13 | 14 | #ifdef MX 15 | 16 | .global MX(utf8_count_rvv_) 17 | MX(utf8_count_rvv_): 18 | li a2, 0 19 | li a3, -65 20 | 1: 21 | vsetvli a4, a1, e8, MX(), ta, ma 22 | vle8.v v8, (a0) 23 | vmsgt.vx v16, v8, a3 24 | vcpop.m a5, v16 25 | add a2, a2, a5 26 | sub a1, a1, a4 27 | add a0, a0, a4 28 | bnez a1, 1b 29 | mv a0, a2 30 | ret 31 | 32 | .global MX(utf8_count_rvv_align_) 33 | MX(utf8_count_rvv_align_): 34 | mv a2, a0 35 | li a0, 0 36 | li a3, -65 37 | vsetvli t0, zero, e8, MX(), ta, ma # vlen 38 | bltu a1, t0, 2f # len < vlen 39 | # align dest to vlen 40 | sub t1, zero, a2 41 | remu t1, t1, t0 # align = (-dest) % vlen 42 | vsetvli t0, t1, e8, MX(), ta, ma 43 | 1: 44 | vle8.v v8,(a2) 45 | vmsgt.vx v16, v8, a3 46 | vcpop.m a4, v16 47 | add a0, a0, a4 48 | sub a1, a1, t0 49 | add a2, a2, t0 50 | 2: 51 | vsetvli t0, a1, e8, MX(), ta, ma 52 | bnez a1, 1b 53 | ret 54 | 55 | .global MX(utf8_count_rvv_tail_) 56 | MX(utf8_count_rvv_tail_): 57 | vsetvli t0, a1, e8, MX(), ta, ma 58 | remu a2, a1, t0 # tail = n % vlenb 59 | sub a1, a1, a2 # n -= tail 60 | add a3, a0, a1 # end = dest + n 61 | mv a1, a0 # n = dest 62 | li a0, 0 63 | li t1, -65 64 | 1: 65 | vle8.v v8, (a1) 66 | vmsgt.vx v16, v8, t1 67 | vcpop.m t2, v16 68 | add a0, a0, t2 69 | add a1, a1, t0 # src += vlenb 70 | bltu a1, a3, 1b # dest < end 71 | # copy tail 72 | vsetvli zero, a2, e8, MX(), ta, ma 73 | vle8.v v8, (a1) 74 | vmsgt.vx v16, v8, t1 75 | vcpop.m t2, v16 76 | add a0, a0, t2 77 | ret 78 | 79 | # this is supposed to test how well the implementation handles 80 | # operations with an vl smaller than VLMAX 81 | .global MX(utf8_count_rvv_128_) 82 | MX(utf8_count_rvv_128_): 83 | li t0, 128/8 84 | bgt a1, t0, 1f 85 | mv t0, a1 86 | 1: 87 | vsetvli t0, t0, e8, MX(), ta, ma 88 | remu a2, a1, t0 # tail = n % vlenb 89 | sub a1, a1, a2 # n -= tail 90 | add a3, a0, a1 # end = dest + n 91 | mv a1, a0 # n = dest 92 | li a0, 0 93 | li t1, -65 94 | 1: 95 | vle8.v v8, (a1) 96 | vmsgt.vx v16, v8, t1 97 | vcpop.m t2, v16 98 | add a0, a0, t2 99 | add a1, a1, t0 # src += vlenb 100 | bltu a1, a3, 1b # dest < end 101 | # copy tail 102 | vsetvli zero, a2, e8, MX(), ta, ma 103 | vle8.v v8, (a1) 104 | vmsgt.vx v16, v8, t1 105 | vcpop.m t2, v16 106 | add a0, a0, t2 107 | ret 108 | 109 | 110 | .global MX(utf8_count_rvv_4x_) 111 | MX(utf8_count_rvv_4x_): 112 | mv a2, a0 113 | li 
a0, 0 114 | li a6, -65 115 | 1: 116 | vsetvli a4, a1, e8, MX(), ta, ma 117 | vle8.v v8, (a2) 118 | vmsgt.vx v16, v8, a6 119 | vcpop.m a7, v16 120 | sub a1, a1, a4 121 | add a2, a2, a4 122 | vsetvli a4, a1, e8, MX(), ta, ma 123 | vle8.v v8, (a2) 124 | vmsgt.vx v16, v8, a6 125 | vcpop.m a3, v16 126 | sub a1, a1, a4 127 | add a2, a2, a4 128 | vsetvli a4, a1, e8, MX(), ta, ma 129 | vle8.v v8, (a2) 130 | vmsgt.vx v16, v8, a6 131 | vcpop.m a5, v16 132 | sub a1, a1, a4 133 | add a2, a2, a4 134 | vsetvli a4, a1, e8, MX(), ta, ma 135 | vle8.v v8, (a2) 136 | add a0, a0, a7 137 | add a0, a0, a3 138 | add a0, a0, a5 139 | vmsgt.vx v16, v8, a6 140 | vcpop.m a3, v16 141 | add a0, a0, a3 142 | sub a1, a1, a4 143 | add a2, a2, a4 144 | bnez a1, 1b 145 | ret 146 | 147 | // gcc generated from unrolled intrinsics implementation: 148 | // https://godbolt.org/z/q75c6r3Ta 149 | .global MX(utf8_count_rvv_4x_tail_) 150 | MX(utf8_count_rvv_4x_tail_): 151 | vsetvli a5, zero, e8, MX(), ta, ma 152 | slli t3, a5, 2 153 | add a1, a0, a1 154 | add a2, a0, t3 155 | mv a4, a0 156 | bltu a1, a2, 5f 157 | slli t4, a5, 1 158 | add t5, t4, a5 159 | li a0, 0 160 | li a6, -65 161 | 1: 162 | add a3, a5, a4 163 | vsetvli zero, zero, e8, MX(), ta, ma 164 | add a7, t4, a4 165 | vle8.v v8, (a4) 166 | vle8.v v16, (a3) 167 | vmsgt.vx v8, v8, a6 168 | vmsgt.vx v16, v16, a6 169 | vcpop.m a3, v8 170 | vcpop.m t1, v16 171 | add a3, a3, t1 172 | vle8.v v8, (a7) 173 | add a4, t5, a4 174 | vmsgt.vx v8, v8, a6 175 | vcpop.m a7, v8 176 | add a3, a3, a7 177 | vle8.v v8, (a4) 178 | mv a4, a2 179 | vmsgt.vx v8, v8, a6 180 | add a2, a2, t3 181 | vcpop.m a7, v8 182 | add a3, a3, a7 183 | add a0, a0, a3 184 | bgeu a1, a2, 1b 185 | 2: 186 | sub a3, a1, a4 187 | beq a1, a4, 4f 188 | li a2, 0 189 | li a1, -65 190 | 3: 191 | vsetvli a5, a3, e8, MX(), ta, ma 192 | sub a3, a3, a5 193 | vle8.v v8, (a4) 194 | add a4, a4, a5 195 | vmsgt.vx v8, v8, a1 196 | vcpop.m a5, v8 197 | add a2, a2, a5 198 | bne a3, zero, 3b 199 | add a0, a0, a2 200 | 4: 201 | ret 202 | 5: 203 | li a0, 0 204 | j 2b 205 | 206 | 207 | 208 | 209 | #endif 210 | 211 | 212 | 213 | 214 | -------------------------------------------------------------------------------- /bench/utf8_count.c: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | size_t 4 | utf8_count_scalar(char const *str, size_t len) 5 | { 6 | uint8_t const *p = (uint8_t const*)str; 7 | size_t count = 0; 8 | while (len--) count += (*p++ & 0xc0) != 0x80, BENCH_CLOBBER(); 9 | return count; 10 | } 11 | 12 | size_t 13 | utf8_count_scalar_autovec(char const *str, size_t len) 14 | { 15 | uint8_t const *p = (uint8_t const*)str; 16 | size_t count = 0; 17 | while (len--) count += (*p++ & 0xc0) != 0x80; 18 | return count; 19 | } 20 | 21 | #define GEN_SWAR(name, popc, clobber) \ 22 | size_t \ 23 | utf8_count_##name(char const *str, size_t len) \ 24 | { \ 25 | ux const BENCH_MAY_ALIAS *u; \ 26 | size_t count = 0, tail = 0; \ 27 | \ 28 | uint8_t const *u8 = (uint8_t const*)str; \ 29 | if (len < sizeof *u) { \ 30 | tail = len; \ 31 | goto skip; \ 32 | } \ 33 | \ 34 | tail = sizeof *u - (uintptr_t)str % sizeof *u; \ 35 | \ 36 | len -= tail; \ 37 | while (tail--) \ 38 | count += (*u8++ & 0xC0) != 0x80, clobber; \ 39 | \ 40 | u = (ux const*)u8; \ 41 | tail = len % sizeof *u; \ 42 | \ 43 | for (len /= sizeof *u; len--; ++u) { \ 44 | ux b1 = ~*u & (ux)0x8080808080808080; \ 45 | ux b2 = *u & (ux)0x4040404040404040; \ 46 | count += popc((b1 >> 1) | b2); \ 47 | clobber; \ 48 | } \ 49 | \ 50 | 
u8 = (uint8_t const*)u; \ 51 | skip: \ 52 | while (tail--) \ 53 | count += (*u8++ & 0xC0) != 0x80, clobber; \ 54 | return count; \ 55 | } 56 | 57 | #if __riscv_zbb 58 | GEN_SWAR(SWAR_popc,__builtin_popcountll,BENCH_CLOBBER()) 59 | GEN_SWAR(SWAR_popc_autovec,__builtin_popcountll,(void)0) 60 | # define POPC(f) f(SWAR_popc) f(SWAR_popc_autovec) 61 | #else 62 | # define POPC(f) 63 | #endif 64 | 65 | static inline int 66 | upopcnt(ux x) 67 | { 68 | /* 2-bit sums */ 69 | x -= (x >> 1) & (-(ux)1/3); 70 | /* 4-bit sums */ 71 | x = (x & (-(ux)1/15*3)) + ((x >> 2) & (-(ux)1/15*3)); 72 | /* 8-bit sums */ 73 | x = (x + (x >> 4)) & (-(ux)1/255*15); 74 | BENCH_CLOBBER(); 75 | /* now we can just add the sums together, because can't overflow, 76 | * since there can't be more than 255 bits set */ 77 | x += (x >> 8); /* 16-bit sums */ 78 | x += (x >> 16); /* sum 16-bit sums */ 79 | IF64(x += (x >> 32)); /* sum 32-bit sums */ 80 | return x & 127; 81 | } 82 | 83 | 84 | GEN_SWAR(SWAR_popc_bithack,upopcnt,BENCH_CLOBBER()) 85 | GEN_SWAR(SWAR_popc_bithack_autovec,upopcnt,(void)0) 86 | 87 | 88 | #define IMPLS(f) \ 89 | f(scalar) \ 90 | f(scalar_autovec) \ 91 | POPC(f) \ 92 | f(SWAR_popc_bithack) \ 93 | f(SWAR_popc_bithack_autovec) \ 94 | MX(f, rvv) \ 95 | MX(f, rvv_align) \ 96 | MX(f, rvv_tail) \ 97 | MX(f, rvv_128) \ 98 | MX(f, rvv_4x) \ 99 | MX(f, rvv_4x_tail) \ 100 | 101 | typedef size_t Func(char const *str, size_t len); 102 | 103 | #define DECLARE(f) extern Func utf8_count_##f; 104 | IMPLS(DECLARE) 105 | 106 | #define EXTRACT(f) { #f, &utf8_count_##f }, 107 | Impl impls[] = { IMPLS(EXTRACT) }; 108 | 109 | char *str; 110 | ux last; 111 | 112 | void init(void) { } 113 | ux checksum(size_t n) { return last; } 114 | 115 | void common(size_t n, size_t off) { 116 | str = (char*)mem + off; 117 | bench_memrand(str, n + 9); 118 | } 119 | 120 | BENCH_BEG(base) { 121 | common(n, bench_urand() & 511); 122 | TIME last = (uintptr_t)f(str, n); 123 | } BENCH_END 124 | 125 | BENCH_BEG(aligned) { 126 | common(n, 0); 127 | TIME last = (uintptr_t)f(str, n); 128 | } BENCH_END 129 | 130 | Bench benches[] = { 131 | BENCH( impls, MAX_MEM - 521, "utf8 count", bench_base ), 132 | BENCH( impls, MAX_MEM - 521, "utf8 count aligned", bench_aligned ) 133 | }; BENCH_MAIN(benches) 134 | 135 | 136 | -------------------------------------------------------------------------------- /config.mk: -------------------------------------------------------------------------------- 1 | WARN=-Wall -Wextra -Wno-unused-function -Wno-unused-parameter 2 | 3 | # append -DUSE_PERF_EVENT to CFLAGS, if the cycle csr isn't exposed 4 | # try -DUSE_PERF_EVENT_SLOW if the abvoe doesn't work 5 | 6 | # freestanding using any recent clang build 7 | CC=clang 8 | CFLAGS=--target=riscv64 -march=rv64gcv_zba_zbb_zbs -O3 ${WARN} -nostdlib -fno-builtin -ffreestanding 9 | #CFLAGS=--target=riscv32 -march=rv32gc_zve32f_zba_zbb_zbs -O3 ${WARN} -nostdlib -fno-builtin -ffreestanding 10 | 11 | 12 | # full cross compilation toolchain 13 | #CC=riscv64-linux-gnu-gcc 14 | #CFLAGS=-march=rv64gcv -O3 ${WARN} 15 | 16 | # native build 17 | #CC=cc 18 | #CFLAGS=-march=rv64gcv -O3 ${WARN} 19 | 20 | 21 | -------------------------------------------------------------------------------- /instructions/rvv/Makefile: -------------------------------------------------------------------------------- 1 | .POSIX: 2 | 3 | include ../../config.mk 4 | 5 | all: rvv 6 | 7 | rvv: gen.S main.c config.h 8 | m4 gen.S > main.S 9 | ${CC} ${CFLAGS} main.S main.c -o $@ 10 | rm main.S 11 | 12 | clean: 13 | rm -f rvv 14 
| 15 | run: rvv 16 | ../../run.sh ./rvv 17 | -------------------------------------------------------------------------------- /instructions/rvv/config.h: -------------------------------------------------------------------------------- 1 | #define WARMUP 1000 2 | #define UNROLL 8 // automatically *8 3 | #define LOOP 512 4 | #define RUNS 32 5 | 6 | // processor specific configs 7 | // m8 m4 m2 m1 mf2 mf4 mf8 8 | // SEW: 6310 6310 6310 6310 9 | // 4268... 4268... 4268... 4268... 10 | #define T_A 0b1111111111111111111111111111 // all 11 | #define T_W 0b0000011101110111011101110111 // widen 12 | #define T_WR 0b0111011101110111011101110111 // widen reduction 13 | #define T_N 0b0000011101110111011101110111 // narrow 14 | #define T_F 0b1110111011101110111011101110 // float 15 | #define T_FW 0b0000011001100110011001100110 // float widen 16 | #define T_FWR 0b0110011001100110011001100110 // float widen reduction 17 | #define T_FN 0b0000011001100110011001100110 // float narrow 18 | 19 | #define T_E2 0b1110111011101110111011101110 // extend 2 20 | #define T_E4 0b1100110011001100110011001100 // extend 4 21 | #define T_E8 0b1000100010001000100010001000 // extend 8 22 | #define T_ei16 0b1110111111111111111111111111 // no m8 23 | 24 | // special: 25 | #define T_m1 ((1 << 28) | T_A) // emul<=1 26 | -------------------------------------------------------------------------------- /instructions/rvv/main.c: -------------------------------------------------------------------------------- 1 | #include "../../nolibc.h" 2 | #include "config.h" 3 | 4 | static ux seed = 123456; 5 | 6 | typedef ux (*BenchFunc)(void); 7 | extern size_t bench_count; 8 | extern char bench_names; 9 | extern ux bench_types; 10 | extern BenchFunc bench_mf8, bench_mf4, bench_mf2, bench_m1, bench_m2, bench_m4, bench_m8; 11 | static BenchFunc *benches[] = { &bench_mf8, &bench_mf4, &bench_mf2, &bench_m1, &bench_m2, &bench_m4, &bench_m8 }; 12 | 13 | extern ux run_bench(ux (*bench)(void), ux type, ux vl, ux seed); 14 | 15 | 16 | static int 17 | compare_ux(void const *a, void const *b) 18 | { 19 | return (*(ux*)a > *(ux*)b) - (*(ux*)a < *(ux*)b); 20 | } 21 | 22 | 23 | static void 24 | run_all_types(char const *name, ux bIdx, ux vl, int ta, int ma) 25 | { 26 | ux arr[RUNS]; 27 | 28 | 29 | print("")(s,name)(""); 30 | ux mask = bIdx[&bench_types]; 31 | 32 | ux lmuls[] = { 5, 6, 7, 0, 1, 2, 3 }; 33 | 34 | for (ux sew = 0; sew < 4; ++sew) 35 | for (ux lmul_idx = 0; lmul_idx < 7; ++lmul_idx) { 36 | ux lmul = lmuls[lmul_idx]; 37 | ux vtype = lmul | (sew<<3) | (!!ta << 6) | (!!ma << 7); 38 | 39 | if (!(mask >> (lmul_idx*4 + sew) & 1)) { 40 | print(""); 41 | continue; 42 | } 43 | 44 | ux lmul_val = 1 << lmul_idx; // fixed-point, denum 8 45 | ux sew_val = 1 << (sew + 3); 46 | // > For a given supported fractional LMUL setting, 47 | // > implementations must support SEW settings between SEWMIN 48 | // > and LMUL * ELEN, inclusive. 49 | if (sew_val * 8 > lmul_val * __riscv_v_elen) { 50 | print(""); 51 | continue; 52 | } 53 | 54 | ux emul = lmul_idx; 55 | if (mask == T_W || mask == T_FW || mask == T_N || mask == T_FN) 56 | emul += 1; 57 | if (mask == T_ei16 && sew == 0) 58 | emul = emul < 7 ? 
emul+1 : 7; 59 | if (mask == T_m1) 60 | emul = 4; // m2 61 | BenchFunc bench = benches[emul][bIdx]; 62 | 63 | for (ux i = 0; i < RUNS; ++i) { 64 | arr[i] = run_bench(bench, vtype, vl, seed); 65 | if (~arr[i] == 0) goto skip; 66 | seed = seed*7 + 13; 67 | } 68 | #if RUNS > 4 69 | qsort(arr, RUNS, sizeof *arr, compare_ux); 70 | ux sum = 0, count = 0; 71 | for (ux i = RUNS * 0.2f; i < RUNS * 0.8f; ++i, ++count) 72 | sum += arr[i]; 73 | #else 74 | ux sum = 0, count = RUNS; 75 | for (ux i = 0; i < RUNS; ++i) 76 | sum += arr[i]; 77 | #endif 78 | print("")(fn,2,sum * 1.0f/(UNROLL*LOOP*count*8))(""); 79 | continue; 80 | skip: 81 | print(""); 82 | } 83 | print("\n")(flush,); 84 | } 85 | 86 | int 87 | main(void) 88 | { 89 | size_t x; 90 | seed = rv_cycles(); 91 | seed ^= (uintptr_t)&x; 92 | 93 | ux vlarr[] = { 0, 1 }; 94 | for (ux i = 0; i < 2; ++i) { 95 | for (ux j = 4; j--; ) { 96 | print("\n"); 97 | if (vlarr[i] != 0) 98 | print("vl=")(u,vlarr[i]); 99 | else 100 | print("vl=VLMAX"); 101 | print(s,j & 2 ? " ta" : " tu")(s,j & 1 ? " ma" : " mu")("\n\n"); 102 | char const *name = &bench_names; 103 | for (ux bIdx = 0; bIdx < bench_count; ++bIdx) { 104 | run_all_types(name, bIdx, vlarr[i], j >> 1, j & 1); 105 | while (*name++); 106 | } 107 | } 108 | } 109 | return 0; 110 | } 111 | -------------------------------------------------------------------------------- /instructions/scalar/Makefile: -------------------------------------------------------------------------------- 1 | .POSIX: 2 | 3 | include ../../config.mk 4 | 5 | all: scalar 6 | 7 | scalar: main.S main.c config.h 8 | ${CC} ${CFLAGS} main.S main.c -o $@ 9 | 10 | clean: 11 | rm -f scalar 12 | 13 | run: scalar 14 | ../../run.sh ./scalar 15 | -------------------------------------------------------------------------------- /instructions/scalar/config.h: -------------------------------------------------------------------------------- 1 | #define WARMUP 1000 2 | #define UNROLL 64 3 | #define LOOP 512 4 | #define RUNS 64 5 | 6 | -------------------------------------------------------------------------------- /instructions/scalar/main.c: -------------------------------------------------------------------------------- 1 | #include "../../nolibc.h" 2 | #include "config.h" 3 | 4 | static ux mem[128]; 5 | static ux seed = 123456; 6 | 7 | extern char const benchmark_names; 8 | extern ux (*benchmarks)(void); 9 | extern ux run_bench(ux (*bench)(void), void *ptr, ux seed); 10 | 11 | 12 | static int 13 | compare_ux(void const *a, void const *b) 14 | { 15 | return (*(ux*)a > *(ux*)b) - (*(ux*)a < *(ux*)b); 16 | } 17 | 18 | void 19 | run(char const *name, ux (*bench)(void)) { 20 | ux arr[RUNS]; 21 | 22 | print("")(s,name)(""); 23 | for (ux i = 0; i < RUNS; ++i) { 24 | arr[i] = run_bench(bench, mem, seed); 25 | seed = seed*7 + 13; 26 | } 27 | 28 | #if RUNS > 4 29 | qsort(arr, RUNS, sizeof *arr, compare_ux); 30 | ux sum = 0, count = 0; 31 | for (ux i = RUNS * 0.2f; i < RUNS * 0.8f; ++i, ++count) 32 | sum += arr[i]; 33 | #else 34 | ux sum = 0, count = RUNS; 35 | for (ux i = 0; i < RUNS; ++i) 36 | sum += arr[i]; 37 | #endif 38 | 39 | print("")(fn,2,sum * 1.0f/(UNROLL*LOOP*count))(""); 40 | print("\n")(flush,); 41 | } 42 | 43 | 44 | int 45 | main(void) 46 | { 47 | size_t x; 48 | seed = rv_cycles(); 49 | seed ^= (uintptr_t)&x; 50 | 51 | ux (**it)(void) = &benchmarks; 52 | char const *name = &benchmark_names; 53 | while (*it) { 54 | run(name, *it); 55 | ++it; 56 | while (*name++); 57 | } 58 | return 0; 59 | } 60 | 
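/* The harnesses in instructions/rvv, instructions/scalar, and instructions/xtheadvector
 * all reduce their RUNS samples the same way: sort the measurements, drop the fastest
 * and slowest 20%, and average the middle 60% before dividing by the UNROLL*LOOP
 * iteration count (times 8 for the RVV harness). A minimal self-contained sketch of
 * that trimmed mean follows; it assumes plain uint64_t samples, and the helper names
 * are illustrative rather than taken from the repository. */
#include <stdint.h>
#include <stdlib.h>

static int
cmp_u64(void const *a, void const *b)
{
	uint64_t x = *(uint64_t const*)a, y = *(uint64_t const*)b;
	return (x > y) - (x < y); /* avoids the overflow of a plain subtraction */
}

/* mean of the middle 60% of samples; the float bounds mirror the harnesses'
 * "#if RUNS > 4" branch */
static double
trimmed_mean(uint64_t *samples, size_t n)
{
	double sum = 0;
	size_t count = 0;
	qsort(samples, n, sizeof *samples, cmp_u64);
	for (size_t i = n * 0.2f; i < n * 0.8f; ++i, ++count)
		sum += (double)samples[i];
	return count ? sum / count : 0;
}
/* Trimming both tails keeps the per-iteration figure robust against outliers such as
 * interrupts or warm-up effects, which is presumably why the harnesses only fall back
 * to a plain mean when RUNS <= 4. */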
-------------------------------------------------------------------------------- /instructions/xtheadvector/Makefile: -------------------------------------------------------------------------------- 1 | .POSIX: 2 | 3 | include ../../config.mk 4 | 5 | all: xtheadvector 6 | 7 | xtheadvector: main.S main.c config.h 8 | ${CC} ${CFLAGS} main.S main.c -o $@ 9 | 10 | clean: 11 | rm -f xtheadvector 12 | 13 | run: xtheadvector 14 | ../../run.sh ./xtheadvector 15 | -------------------------------------------------------------------------------- /instructions/xtheadvector/config.h: -------------------------------------------------------------------------------- 1 | #define WARMUP 1000 2 | #define UNROLL 64 3 | #define LOOP 16 4 | #define RUNS 1000 5 | 6 | /* processor specific configs */ 7 | #if 1 8 | // C920/C906 (some boards, e.g. BL808): e8/16/32/64 f16/32/64 9 | #define T_A 0b11111111 10 | #define T_W 0b01110111 11 | #define T_N 0b01110111 12 | #define T_F 0b11101111 13 | #define T_FW 0b01100111 14 | #define T_FN 0b01100111 15 | #else 16 | // C906: e8/16/32 f16/32 17 | #define T_A 0b1111111 18 | #define T_W 0b0110111 19 | #define T_N 0b0110111 20 | #define T_F 0b1101111 21 | #define T_FW 0b0100111 22 | #define T_FN 0b0100111 23 | #endif 24 | -------------------------------------------------------------------------------- /instructions/xtheadvector/main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "config.h" 5 | 6 | #include 7 | #include 8 | 9 | typedef uint64_t u64; 10 | 11 | static u64 seed = 128; 12 | 13 | extern char const *benchmark_names; 14 | extern u64 benchmark_types; 15 | extern u64 (*benchmarks)(void); 16 | extern u64 run_bench(u64 (*bench)(void), u64 type, u64 vl, u64 seed); 17 | 18 | static int 19 | compare_u64(void const *a, void const *b) 20 | { 21 | return *(clock_t*)a - *(clock_t*)b; 22 | } 23 | 24 | void 25 | run_all_types(char const *name, u64 (*bench)(void), u64 types, u64 vl) { 26 | static u64 arr[RUNS]; 27 | 28 | printf("%s", name); 29 | // m1..m8, e8..e64 30 | for (u64 vtype = 0; vtype < 16; ++vtype) { 31 | 32 | if (!((1 << (vtype & 3)) & types) || 33 | !((1 << (vtype >> 2)) & (types >> 4))) { 34 | printf(""); 35 | continue; 36 | } 37 | 38 | for (u64 i = 0; i < RUNS; ++i) { 39 | arr[i] = run_bench(bench, vtype, vl, seed); 40 | seed = seed*7 + 13; 41 | } 42 | 43 | #if RUNS > 4 44 | qsort(arr, RUNS, sizeof *arr, compare_u64); 45 | u64 sum = 0, count = 0; 46 | for (u64 i = RUNS * 0.2f; i < RUNS * 0.8f; ++i, ++count) 47 | sum += arr[i]; 48 | #else 49 | u64 sum = 0, count = RUNS; 50 | for (u64 i = 0; i < RUNS; ++i) 51 | sum += arr[i]; 52 | #endif 53 | printf("%2.1f", sum * 1.0/(UNROLL*LOOP*count)); 54 | } 55 | puts(""); 56 | } 57 | 58 | int 59 | main(void) 60 | { 61 | 62 | int fd = open("/dev/urandom", O_RDONLY); 63 | read(fd, &seed, sizeof seed); 64 | close(fd); 65 | 66 | u64 vlarr[] = { 0, 1 }; 67 | for (u64 i = 0; i < 2; ++i) { 68 | puts(""); 69 | if (vlarr[i] != 0) 70 | printf("vl=%"PRIu64"\n\n", vlarr[i]); 71 | else 72 | puts("vl=VLMAX\n"); 73 | u64 (**it)(void) = &benchmarks; 74 | char const **name = &benchmark_names; 75 | u64 *types = &benchmark_types; 76 | while (*it) { 77 | run_all_types(*name, *it, *types, vlarr[i]); 78 | ++it; 79 | ++name; 80 | ++types; 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # configure 
`make run` to work with your setup 4 | 5 | # local execution 6 | #./$@ 7 | 8 | # using qemu 9 | qemu-riscv64-static -cpu rv64,b=on,v=on,vlen=256,rvv_ta_all_1s=on,rvv_ma_all_1s=on $@ 10 | #qemu-riscv32-static -cpu rv32,zve32f=on,vext_spec=v1.0,vlen=256,rvv_ta_all_1s=on,rvv_ma_all_1s=on,zfh=true,zvfh=true $@ 11 | -------------------------------------------------------------------------------- /single/Makefile: -------------------------------------------------------------------------------- 1 | .POSIX: 2 | 3 | include ../config.mk 4 | 5 | EXECS=veclibm 6 | 7 | all: ${EXECS} 8 | 9 | veclibm: veclibm.c 10 | ${CC} ${CFLAGS} -o $@ $< ../thirdparty/veclibm/src/*.c -I ../thirdparty/veclibm/include -lm -Wno-unused -Wno-maybe-uninitialized 11 | 12 | clean: 13 | rm -f ${EXECS} 14 | 15 | -------------------------------------------------------------------------------- /single/veclibm.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #ifndef N 8 | #define N (1024*128) /* change me */ 9 | #endif 10 | 11 | static void 12 | rvvlm_sqrt(size_t x_len, const double *x, double *y) 13 | { 14 | for (size_t vl; x_len > 0; x_len -= vl, x += vl, y += vl) { 15 | vl = __riscv_vsetvl_e64m8(x_len); 16 | vfloat64m8_t v = __riscv_vle64_v_f64m8(x, vl); 17 | __riscv_vse64(y, __riscv_vfsqrt(v, vl), vl); 18 | } 19 | } 20 | 21 | #define APPLY(X) \ 22 | X(exp) X(exp2) X(expm1) X(log) X(log10) X(log2) X(log1p) \ 23 | X(sqrt) X(cbrt) \ 24 | X(sin) X(cos) X(tan) X(asin) X(acos) X(atan) \ 25 | X(sinh) X(cosh) X(tanh) X(asinh) X(acosh) X(atanh) \ 26 | X(erf) X(erfc) X(tgamma) X(lgamma) 27 | 28 | #define DECLARE(f) void rvvlm_##f(size_t x_len, const double *x, double *y); 29 | APPLY(DECLARE) 30 | 31 | #define DEFINE(f) \ 32 | static void lm_##f(size_t x_len, const double *x, double *y) { \ 33 | for (size_t i = 0; i < x_len; ++i) y[i] = f(x[i]); \ 34 | } 35 | APPLY(DEFINE) 36 | struct Func { 37 | void (*rvvlm)(size_t, const double*, double*); 38 | void (*lm)(size_t, const double*, double*); 39 | const char *name; 40 | }; 41 | 42 | struct Func funcs[] = { 43 | #define ENTRY(f) { rvvlm_##f, lm_##f, #f }, 44 | APPLY(ENTRY) 45 | }; 46 | 47 | typedef struct { uint64_t x, y, z; } URand; 48 | 49 | /* RomuDuoJr, see https://romu-random.org/ */ 50 | static inline uint64_t 51 | urand(URand *r) 52 | { 53 | #define ROTL(x,n) (((x) << (n)) | ((x) >> (8*sizeof(x) - (n)))) 54 | uint64_t xp = r->x, yp = r->y, zp = r->z; 55 | r->x = 15241094284759029579u * zp; 56 | r->y = ROTL(yp - xp, 12); 57 | r->z = ROTL(zp - yp, 44); 58 | return xp; 59 | } 60 | 61 | 62 | int 63 | main(void) 64 | { 65 | double *in = malloc(N*sizeof *in), *out = malloc(N*sizeof *out); 66 | URand r = {123, (uintptr_t)&in, (uintptr_t)&out}; 67 | 68 | for (size_t i = 0; i < N; ++i) 69 | in[i] = (urand(&r) >> (64 - 53)) * (1.0 / (1ull << 53)); 70 | 71 | for (size_t i = 0; i < sizeof funcs / sizeof *funcs; ++i) { 72 | size_t beg, end; 73 | struct Func f = funcs[i]; 74 | printf("%s libm: ", f.name); 75 | for (size_t i = 0; i < 3; ++i) { 76 | __asm__ volatile("fence.i"); 77 | __asm__ volatile ("csrr %0, cycle" : "=r"(beg)); 78 | f.lm(N, in, out); 79 | __asm__ volatile("fence.i"); 80 | __asm__ volatile ("csrr %0, cycle" : "=r"(end)); 81 | printf(" %f", ((double)N) / (end-beg)); 82 | } 83 | printf(" elements/cycle\n%s rvvlm:", f.name); 84 | for (size_t i = 0; i < 3; ++i) { 85 | __asm__ volatile("fence.i"); 86 | __asm__ volatile ("csrr %0, cycle" : "=r"(beg)); 87 | f.rvvlm(N, in, out); 88 | 
__asm__ volatile("fence.i"); 89 | __asm__ volatile ("csrr %0, cycle" : "=r"(end)); 90 | printf(" %f", ((double)N) / (end-beg)); 91 | } 92 | printf(" elements/cycle\n"); 93 | } 94 | free(in); 95 | free(out); 96 | return 0; 97 | } 98 | 99 | -------------------------------------------------------------------------------- /thirdparty/rvv-chacha-poly/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | I can accept your patches and contributions to this project with the 4 | following caveats from my employer: 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Community Guidelines 19 | 20 | Treat people with respect. 21 | -------------------------------------------------------------------------------- /thirdparty/rvv-chacha-poly/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /thirdparty/rvv-chacha-poly/README.md: -------------------------------------------------------------------------------- 1 | NOTE: code from https://github.com/edre/rvv-chacha-poly 2 | 3 | RISC-V vector extension implementation of chacha20 and poly1305 4 | cryptographic primitives. 5 | 6 | Chacha20 and poly1305 are simple to vectorize without specialized 7 | instructions. This project implements them in assembly, and verifies them 8 | against the BoringSSL C implementation. 
As expected the executed instruction 9 | count go down a lot, but I don't have real hardware to see if the runtime does 10 | too. 11 | 12 | This is not an officially supported Google product. 13 | 14 | This is a proof of concept crypto library. Those words should sound very scary 15 | together. Don't use this. 16 | -------------------------------------------------------------------------------- /thirdparty/rvv-chacha-poly/boring.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2014, Google Inc. 2 | * 3 | * Permission to use, copy, modify, and/or distribute this software for any 4 | * purpose with or without fee is hereby granted, provided that the above 5 | * copyright notice and this permission notice appear in all copies. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 10 | * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION 12 | * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 13 | * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ 14 | 15 | #include 16 | #include 17 | 18 | void boring_chacha20(uint8_t *out, const uint8_t *in, 19 | size_t in_len, const uint8_t key[32], 20 | const uint8_t nonce[12], uint32_t counter); 21 | 22 | typedef uint8_t poly1305_state[512]; 23 | 24 | void boring_poly1305_init(poly1305_state *state, 25 | const uint8_t key[32]); 26 | 27 | void boring_poly1305_update(poly1305_state *state, 28 | const uint8_t *in, size_t in_len); 29 | 30 | void boring_poly1305_finish(poly1305_state *state, 31 | uint8_t mac[16]); 32 | -------------------------------------------------------------------------------- /thirdparty/rvv-chacha-poly/main.c: -------------------------------------------------------------------------------- 1 | /* Copyright 2020 Google LLC 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License") ; 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * https://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
*/ 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include "boring.h" 20 | 21 | void println_hex(uint8_t* data, int size) { 22 | while (size > 0) { 23 | printf("%02x", *data); 24 | data++; 25 | size--; 26 | } 27 | printf("\n"); 28 | } 29 | 30 | // TODO: test the vector doesn't write past the end 31 | // test function with multiple length inputs (optional printing) 32 | // test non-block sized lengths 33 | 34 | extern uint64_t instruction_counter(); 35 | 36 | const char* pass_str = "\x1b[32mPASS\x1b[0m"; 37 | const char* fail_str = "\x1b[31mFAIL\x1b[0m"; 38 | 39 | bool test_chacha(const uint8_t* data, size_t len, const uint8_t key[32], const uint8_t nonce[12], bool verbose) { 40 | extern void vector_chacha20(uint8_t *out, const uint8_t *in, 41 | size_t in_len, const uint8_t key[32], 42 | const uint8_t nonce[12], uint32_t counter); 43 | uint8_t* golden = malloc(len); 44 | memset(golden, 0, len); 45 | uint64_t start = instruction_counter(); 46 | boring_chacha20(golden, data, len, key, nonce, 0); 47 | uint64_t end = instruction_counter(); 48 | uint64_t boring_count = end - start; 49 | 50 | uint8_t* vector = malloc(len + 4); 51 | memset(vector, 0, len+4); 52 | start = instruction_counter(); 53 | vector_chacha20(vector, data, len, key, nonce, 0); 54 | end = instruction_counter(); 55 | 56 | bool pass = memcmp(golden, vector, len) == 0; 57 | 58 | if (verbose || !pass) { 59 | printf("golden: "); 60 | println_hex(golden, 32); 61 | printf("inst_count=%d, inst/byte=%.02f\n", boring_count, (float)(boring_count)/len); 62 | printf("vector: "); 63 | println_hex(vector, 32); 64 | printf("inst_count=%d, inst/byte=%.02f\n", end - start, (float)(end - start)/len); 65 | } 66 | 67 | uint32_t past_end = vector[len]; 68 | if (past_end != 0) { 69 | printf("vector wrote past end %08x\n", past_end); 70 | pass = false; 71 | } 72 | 73 | free(golden); 74 | free(vector); 75 | 76 | return pass; 77 | } 78 | 79 | void test_chachas(FILE* f) { 80 | int len = 1024 - 11; 81 | uint8_t* data = malloc(len); 82 | uint32_t rand = 1; 83 | for (int i = 0; i < len; i++) { 84 | rand *= 101; 85 | rand %= 16777213; // random prime 86 | data[i] = (uint8_t)(rand); 87 | } 88 | uint8_t key[32] = "Setec astronomy;too many secrets"; 89 | uint8_t nonce[12] = "BurnAfterUse"; 90 | int counter = 0; 91 | 92 | bool pass = test_chacha(data, len, key, nonce, true); 93 | 94 | if (pass) { 95 | for (int i = 1, len = 1; len < 1000; len += i++) { 96 | fread(key, 32, 1, f); 97 | fread(nonce, 12, 1, f); 98 | if (!test_chacha(data, len, key, nonce, false)) { 99 | printf("Failed with len=%d\n", len); 100 | pass = false; 101 | break; 102 | } 103 | } 104 | } 105 | 106 | if (pass) { 107 | printf("chacha %s\n", pass_str); 108 | } else { 109 | printf("chacha %s\n", fail_str); 110 | } 111 | } 112 | 113 | bool test_poly(const uint8_t* data, size_t len, const uint8_t key[32], bool verbose) { 114 | extern uint64_t vector_poly1305(const uint8_t* in, size_t len, 115 | const uint8_t key[32], uint8_t sig[16]); 116 | 117 | poly1305_state state; 118 | uint8_t *sig = malloc(16); // gets corrupted if I define it on the stack? 
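// The measurement below (and in test_chacha above) brackets each implementation
// with instruction_counter(), which reads the retired-instruction count via
// rdinstret (see vchacha.s); the printed inst/byte figures come from those
// deltas rather than from any wall-clock timing.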
119 | uint64_t start = instruction_counter(); 120 | boring_poly1305_init(&state, key); 121 | boring_poly1305_update(&state, data, len); 122 | boring_poly1305_finish(&state, sig); 123 | uint64_t end = instruction_counter(); 124 | uint64_t boring_count = end - start; 125 | 126 | uint8_t *sig2 = malloc(16); 127 | start = instruction_counter(); 128 | uint64_t mid = vector_poly1305(data, len, key, sig2); 129 | end = instruction_counter(); 130 | 131 | bool pass = memcmp(sig, sig2, 16) == 0; 132 | 133 | if (verbose || !pass) { 134 | printf("boring mac: "); 135 | println_hex(sig, 16); 136 | printf("inst_count=%d, inst/byte=%.02f\n", boring_count, (float)(boring_count)/len); 137 | printf("vector mac: "); 138 | println_hex(sig2, 16); 139 | printf("precomputation=%d, processing=%d, inst/byte=%.02f\n", 140 | mid - start, end - mid, (float)(end - mid)/len); 141 | } 142 | 143 | free(sig); 144 | free(sig2); 145 | return pass; 146 | } 147 | 148 | void test_polys(FILE* f) { 149 | const int big_len = 1024; 150 | uint8_t *zero = malloc(2000); 151 | uint8_t *max_bits = malloc(big_len); 152 | memset(max_bits, 0xff, big_len); 153 | const uint8_t one[32] = {1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; 154 | const uint8_t key[32] = {1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 255, 155 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; 156 | const uint8_t data[272] = "Setec astronomy;too many secrets"; 157 | bool pass = test_poly(max_bits, big_len, max_bits, true); 158 | 159 | if (!pass) 160 | goto end; 161 | 162 | // random test 163 | const int max_len = 1000; 164 | uint8_t *rand = malloc(max_len); 165 | for (int len = 16; len <= max_len; len += 16) { 166 | fread((uint8_t*)key, 32, 1, f); 167 | fread((uint8_t*)rand, len, 1, f); 168 | if (!test_poly(data, len, key, false)) { 169 | printf("failed random input len=%d\n", len); 170 | pass = false; 171 | break; 172 | } 173 | } 174 | free(rand); 175 | 176 | end: 177 | if (pass) { 178 | printf("poly %s\n", pass_str); 179 | } else { 180 | printf("poly %s\n", fail_str); 181 | } 182 | 183 | free(zero); 184 | free(max_bits); 185 | } 186 | 187 | int main(int argc, uint8_t *argv[]) { 188 | extern uint32_t vlmax_u32(); 189 | printf("VLMAX in blocks: %d\n", vlmax_u32()); 190 | FILE* rand = fopen("/dev/urandom", "r"); 191 | test_chachas(rand); 192 | printf("\n"); 193 | test_polys(rand); 194 | fclose(rand); 195 | } 196 | -------------------------------------------------------------------------------- /thirdparty/rvv-chacha-poly/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License") ; 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # https://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | # Dependencies to be installed and on the PATH: 18 | # https://github.com/riscv/riscv-gnu-toolchain 19 | # https://github.com/riscv/riscv-isa-sim 20 | # configure --prefix=$RISCV --with-varch=v512:e64 21 | # https://github.com/riscv/riscv-pk 22 | 23 | ISA=rv64gcv 24 | 25 | riscv64-unknown-elf-gcc -march=$ISA main.c boring.c vchacha.s vpoly.s -o main -O && 26 | spike --isa=$ISA `which pk` main 27 | -------------------------------------------------------------------------------- /thirdparty/rvv-chacha-poly/vchacha.s: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License") ; 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | .global instruction_counter 16 | .global vector_chacha20 17 | .global vlmax_u32 18 | 19 | instruction_counter: 20 | rdinstret a0 21 | ret 22 | 23 | vlmax_u32: 24 | vsetvli a0, x0, e32, m1, ta, ma 25 | ret 26 | 27 | 28 | # Cell-based implementation strategy: 29 | # v0-v15: Cell vectors. Each element is from a different block 30 | 31 | ## Function initialization 32 | # Using the same order as the boring chacha arguments: 33 | # a0 = uint8_t *out 34 | # a1 = uint8_t *in 35 | # a2 = size_t in_len 36 | # a3 = uint8_t key[32] 37 | # a4 = uint8_t nonce[12] 38 | # a5 = uint32_t counter 39 | vector_chacha20: 40 | # a2 = initial length in bytes 41 | # t3 = remaining 64-byte blocks to mix 42 | # t4 = remaining full blocks to read/write 43 | # (if t3 and t4 are different by one, there is a partial block to manually xor) 44 | # t1 = vl in 64-byte blocks 45 | srli t4, a2, 6 46 | addi t0, a2, 63 47 | srli t3, t0, 6 48 | encrypt_blocks: 49 | # initialize vector state 50 | vsetvli t1, t3, e32, m1, ta, ma 51 | # Load 128 bit constant 52 | li t0, 0x61707865 # "expa" little endian 53 | vmv.v.x v0, t0 54 | li t0, 0x3320646e # "nd 3" little endian 55 | vmv.v.x v1, t0 56 | li t0, 0x79622d32 # "2-by" little endian 57 | vmv.v.x v2, t0 58 | li t0, 0x6b206574 # "te k" little endian 59 | vmv.v.x v3, t0 60 | # Load key 61 | lw t0, 0(a3) 62 | vmv.v.x v4, t0 63 | lw t0, 4(a3) 64 | vmv.v.x v5, t0 65 | lw t0, 8(a3) 66 | vmv.v.x v6, t0 67 | lw t0, 12(a3) 68 | vmv.v.x v7, t0 69 | lw t0, 16(a3) 70 | vmv.v.x v8, t0 71 | lw t0, 20(a3) 72 | vmv.v.x v9, t0 73 | lw t0, 24(a3) 74 | vmv.v.x v10, t0 75 | lw t0, 28(a3) 76 | vmv.v.x v11, t0 77 | # Load counter, and increment for each element 78 | vid.v v12 79 | vadd.vx v12, v12, a5 80 | # Load nonce 81 | lw t0, 0(a4) 82 | vmv.v.x v13, t0 83 | lw t0, 4(a4) 84 | vmv.v.x v14, t0 85 | lw t0, 8(a4) 86 | vmv.v.x v15, t0 87 | 88 | li t2, 10 # loop counter 89 | round_loop: 90 | 91 | .macro vrotl a i r 92 | #if __riscv_zvbb 93 | vror.vi \a, \a, 32-\i 94 | #else 95 | vsll.vi v16, \a, \i 96 | vsrl.vi \a, \a, 32-\i 97 | vor.vv \a, \a, v16 98 | #endif 99 | .endm 100 | 101 | .macro quarterround a b c d 102 | # a += b; d ^= a; d <<<= 16; 103 | vadd.vv \a, \a, \b 104 | vxor.vv \d, \d, \a 105 | vrotl \d, 16, t6 106 | # c += d; b ^= c; b <<<= 12; 107 | vadd.vv \c, \c, \d 108 | 
vxor.vv \b, \b, \c 109 | vrotl \b, 12, t7 110 | # a += b; d ^= a; d <<<= 8; 111 | vadd.vv \a, \a, \b 112 | vxor.vv \d, \d, \a 113 | vrotl \d, 8, t8 114 | # c += d; b ^= c; b <<<= 7; 115 | vadd.vv \c, \c, \d 116 | vxor.vv \b, \b, \c 117 | vrotl \b, 7, t9 118 | .endm 119 | 120 | # Mix columns. 121 | quarterround v0, v4, v8, v12 122 | quarterround v1, v5, v9, v13 123 | quarterround v2, v6, v10, v14 124 | quarterround v3, v7, v11, v15 125 | # Mix diagonals. 126 | quarterround v0, v5, v10, v15 127 | quarterround v1, v6, v11, v12 128 | quarterround v2, v7, v8, v13 129 | quarterround v3, v4, v9, v14 130 | 131 | addi t2, t2, -1 132 | bnez t2, round_loop 133 | 134 | # Add in initial block values. 135 | # 128 bit constant 136 | li t0, 0x61707865 # "expa" little endian 137 | vadd.vx v0, v0, t0 138 | li t0, 0x3320646e # "nd 3" little endian 139 | vadd.vx v1, v1, t0 140 | li t0, 0x79622d32 # "2-by" little endian 141 | vadd.vx v2, v2, t0 142 | li t0, 0x6b206574 # "te k" little endian 143 | vadd.vx v3, v3, t0 144 | # Add key 145 | lw t0, 0(a3) 146 | vadd.vx v4, v4, t0 147 | lw t0, 4(a3) 148 | vadd.vx v5, v5, t0 149 | lw t0, 8(a3) 150 | vadd.vx v6, v6, t0 151 | lw t0, 12(a3) 152 | vadd.vx v7, v7, t0 153 | lw t0, 16(a3) 154 | vadd.vx v8, v8, t0 155 | lw t0, 20(a3) 156 | vadd.vx v9, v9, t0 157 | lw t0, 24(a3) 158 | vadd.vx v10, v10, t0 159 | lw t0, 28(a3) 160 | vadd.vx v11, v11, t0 161 | # Add counter 162 | vid.v v16 163 | vadd.vv v12, v12, v16 164 | vadd.vx v12, v12, a5 165 | # Load nonce 166 | lw t0, 0(a4) 167 | vadd.vx v13, v13, t0 168 | lw t0, 4(a4) 169 | vadd.vx v14, v14, t0 170 | lw t0, 8(a4) 171 | vadd.vx v15, v15, t0 172 | 173 | # load in vector lanes with two strided segment loads 174 | # in case this is the final block, reset vl to full blocks 175 | vsetvli t5, t4, e32, m1, ta, ma 176 | li t0, 64 177 | vlsseg8e32.v v16, (a1), t0 178 | add a1, a1, 32 179 | vlsseg8e32.v v24, (a1), t0 180 | add a1, a1, -32 181 | 182 | # xor in state 183 | vxor.vv v16, v16, v0 184 | vxor.vv v17, v17, v1 185 | vxor.vv v18, v18, v2 186 | vxor.vv v19, v19, v3 187 | vxor.vv v20, v20, v4 188 | vxor.vv v21, v21, v5 189 | vxor.vv v22, v22, v6 190 | vxor.vv v23, v23, v7 191 | vxor.vv v24, v24, v8 192 | vxor.vv v25, v25, v9 193 | vxor.vv v26, v26, v10 194 | vxor.vv v27, v27, v11 195 | vxor.vv v28, v28, v12 196 | vxor.vv v29, v29, v13 197 | vxor.vv v30, v30, v14 198 | vxor.vv v31, v31, v15 199 | 200 | # write back out with 2 strided segment stores 201 | vssseg8e32.v v16, (a0), t0 202 | add a0, a0, 32 203 | vssseg8e32.v v24, (a0), t0 204 | add a0, a0, -32 205 | 206 | # update counters/pointers 207 | slli t0, t5, 6 # current VL in bytes 208 | add a0, a0, t0 # advance output pointer 209 | add a1, a1, t0 # advance input pointer 210 | sub a2, a2, t0 # decrement remaining bytes 211 | sub t3, t3, t1 # decrement remaining blocks 212 | sub t4, t4, t1 # decrement remaining blocks 213 | # TODO: crash if counter overflows 214 | add a5, a5, t1 # increment counter 215 | 216 | # loop again if we have remaining blocks 217 | bne x0, t3, encrypt_blocks 218 | 219 | # we're done if there are no more remaining bytes from a partial block 220 | beq zero, a2, return 221 | 222 | # to get the remaining partial block, we transfer the nth element of 223 | # all the state vectors into contiguous stack memory with vsseg, then 224 | # read them with byte-granularity vl 225 | 226 | # reconstruct vl for all computed blocks 227 | add t0, t3, t1 228 | vsetvli t0, t0, e32, m1, ta, ma 229 | add t0, t0, -1 230 | 231 | #vse.v v4, (a0) 232 | #j return 233 | 234 | # 
use a masked vsseg instead of sliding everything down? 235 | # both options seem like they might touch a lot of vector state... 236 | vslidedown.vx v16, v0, t0 237 | vslidedown.vx v17, v1, t0 238 | vslidedown.vx v18, v2, t0 239 | vslidedown.vx v19, v3, t0 240 | vslidedown.vx v20, v4, t0 241 | vslidedown.vx v21, v5, t0 242 | vslidedown.vx v22, v6, t0 243 | vslidedown.vx v23, v7, t0 244 | vslidedown.vx v24, v8, t0 245 | vslidedown.vx v25, v9, t0 246 | vslidedown.vx v26, v10, t0 247 | vslidedown.vx v27, v11, t0 248 | vslidedown.vx v28, v12, t0 249 | vslidedown.vx v29, v13, t0 250 | vslidedown.vx v30, v14, t0 251 | vslidedown.vx v31, v15, t0 252 | li t0, 1 253 | vsetvli zero, t0, e32, m1, ta, ma 254 | addi t0, sp, -64 255 | addi t1, sp, -32 256 | vsseg8e32.v v16, (t0) 257 | vsseg8e32.v v24, (t1) 258 | 259 | vsetvli a2, a2, e8, m8, ta, ma 260 | vle8.v v0, (a1) 261 | vle8.v v8, (t0) 262 | vxor.vv v0, v0, v8 263 | vse8.v v0, (a0) 264 | 265 | 266 | return: 267 | ret 268 | -------------------------------------------------------------------------------- /thirdparty/rvv-rollback.S: -------------------------------------------------------------------------------- 1 | # rvv-rollback.S -- A minimal benchmarking library 2 | # Olaf Bernstein 3 | # Distributed under the MIT license, see license at the end of the file. 4 | # New versions available at https://gist.github.com/camel-cdr/cfd9ba2b8754b521edf4892fe19c7031 5 | # Conversions taken from https://github.com/RISCVtestbed/rvv-rollback 6 | 7 | .macro vle32.v a:vararg 8 | vlw.v \a 9 | .endm 10 | .macro vle16.v a:vararg 11 | vlh.v \a 12 | .endm 13 | .macro vle8.v a:vararg 14 | vlb.v \a 15 | .endm 16 | .macro vle32ff.v a:vararg 17 | vlwff.v \a 18 | .endm 19 | .macro vle16ff.v a:vararg 20 | vlhff.v \a 21 | .endm 22 | .macro vle8ff.v a:vararg 23 | vlbff.v \a 24 | .endm 25 | .macro vse32.v a:vararg 26 | vsw.v \a 27 | .endm 28 | .macro vse16.v a:vararg 29 | vsh.v \a 30 | .endm 31 | .macro vse8.v a:vararg 32 | vsb.v \a 33 | .endm 34 | .macro vluxei32.v a:vararg 35 | vlxw.v \a 36 | .endm 37 | .macro vluxei16.v a:vararg 38 | vlxh.v \a 39 | .endm 40 | .macro vluxei8.v a:vararg 41 | vlxb.v \a 42 | .endm 43 | .macro vsuxei32.v a:vararg 44 | vsuxw.v \a 45 | .endm 46 | .macro vsuxei16.v a:vararg 47 | vsuxh.v \a 48 | .endm 49 | .macro vsuxei8.v a:vararg 50 | vsuxb.v \a 51 | .endm 52 | .macro vlse32.v a:vararg 53 | vlsw.v \a 54 | .endm 55 | .macro vlse16.v a:vararg 56 | vlsh.v \a 57 | .endm 58 | .macro vlse8.v a:vararg 59 | vlsb.v \a 60 | .endm 61 | .macro vsse32.v a:vararg 62 | vssw.v \a 63 | .endm 64 | .macro vsse16.v a:vararg 65 | vssh.v \a 66 | .endm 67 | .macro vsse8.v a:vararg 68 | vssb.v \a 69 | .endm 70 | .macro vloxei32.v a:vararg 71 | vlxw.v \a 72 | .endm 73 | .macro vloxei16.v a:vararg 74 | vlxh.v \a 75 | .endm 76 | .macro vloxei8.v a:vararg 77 | vlxb.v \a 78 | .endm 79 | .macro vsoxei32.v a:vararg 80 | vsxw.v \a 81 | .endm 82 | .macro vsoxei16.v a:vararg 83 | vsxh.v \a 84 | .endm 85 | .macro vsoxei8.v a:vararg 86 | vsxb.v \a 87 | .endm 88 | .macro vfncvt.xu.f.w a:vararg 89 | vfncvt.xu.f.v \a 90 | .endm 91 | .macro vfncvt.x.f.w a:vararg 92 | vfncvt.x.f.v \a 93 | .endm 94 | .macro vfncvt.f.xu.w a:vararg 95 | vfncvt.f.xu.v \a 96 | .endm 97 | .macro vfncvt.f.x.w a:vararg 98 | vfncvt.f.x.v \a 99 | .endm 100 | .macro vfncvt.f.f.w a:vararg 101 | vfncvt.f.f.v \a 102 | .endm 103 | .macro vfredusum a:vararg 104 | vfredsum \a 105 | .endm 106 | .macro vfwredusum.vs a:vararg 107 | vfwredsum.vs \a 108 | .endm 109 | .macro vnclip.wv a:vararg 110 | vnclip.vv \a 111 | .endm 
112 | .macro vnclip.wx a:vararg 113 | vnclip.vx \a 114 | .endm 115 | .macro vnclip.wi a:vararg 116 | vnclip.vi \a 117 | .endm 118 | .macro vnclipu.wv a:vararg 119 | vnclipu.vv \a 120 | .endm 121 | .macro vnclipu.wx a:vararg 122 | vnclipu.vx \a 123 | .endm 124 | .macro vnclipu.wi a:vararg 125 | vnclipu.vi \a 126 | .endm 127 | .macro vnsra.wv a:vararg 128 | vnsra.vv \a 129 | .endm 130 | .macro vnsra.wx a:vararg 131 | vnsra.vx \a 132 | .endm 133 | .macro vnsra.wi a:vararg 134 | vnsra.vi \a 135 | .endm 136 | .macro vnsrl.wv a:vararg 137 | vnsrl.vv \a 138 | .endm 139 | .macro vnsrl.wx a:vararg 140 | vnsrl.vx \a 141 | .endm 142 | .macro vnsrl.wi a:vararg 143 | vnsrl.vi \a 144 | .endm 145 | .macro vmandn.mm a:vararg 146 | vmandnot.mm \a 147 | .endm 148 | .macro vmorn.mm a:vararg 149 | vmornot.mm \a 150 | .endm 151 | .macro vmmv.m a:vararg 152 | vmcpy.m \a 153 | .endm 154 | .macro vcpop.m a:vararg 155 | vmpopc.m \a 156 | .endm 157 | .macro vpop.m a:vararg 158 | vmpopc.m \a 159 | .endm 160 | .macro vfirst.m a:vararg 161 | vmfirst.m \a 162 | .endm 163 | 164 | .macro define_for_all_nf prefix suffix prefix2 suffix2 165 | .macro \prefix\()2\suffix a:vararg 166 | \prefix2\()2\suffix2 \a 167 | .endm 168 | .macro \prefix\()3\suffix a:vararg 169 | \prefix2\()3\suffix2 \a 170 | .endm 171 | .macro \prefix\()4\suffix a:vararg 172 | \prefix2\()4\suffix2 \a 173 | .endm 174 | .macro \prefix\()5\suffix a:vararg 175 | \prefix2\()5\suffix2 \a 176 | .endm 177 | .macro \prefix\()6\suffix a:vararg 178 | \prefix2\()6\suffix2 \a 179 | .endm 180 | .macro \prefix\()7\suffix a:vararg 181 | \prefix2\()7\suffix2 \a 182 | .endm 183 | .macro \prefix\()8\suffix a:vararg 184 | \prefix2\()8\suffix2 \a 185 | .endm 186 | .endm 187 | define_for_all_nf vlseg e8.v vlseg b.v 188 | define_for_all_nf vlseg e16.v vlseg h.v 189 | define_for_all_nf vlseg e32.v vlseg w.v 190 | 191 | define_for_all_nf vsseg e8.v vsseg b.v 192 | define_for_all_nf vsseg e16.v vsseg h.v 193 | define_for_all_nf vsseg e32.v vsseg w.v 194 | 195 | define_for_all_nf vlsseg e8.v vlsseg bu.v 196 | define_for_all_nf vlsseg e16.v vlsseg hu.v 197 | define_for_all_nf vlsseg e32.v vlsseg wu.v 198 | 199 | define_for_all_nf vssseg e8.v vssseg b.v 200 | define_for_all_nf vssseg e16.v vssseg h.v 201 | define_for_all_nf vssseg e32.v vssseg w.v 202 | 203 | define_for_all_nf vloxseg e8.v vlxseg b.v 204 | define_for_all_nf vloxseg e16.v vlxseg h.v 205 | define_for_all_nf vloxseg e32.v vlxseg w.v 206 | define_for_all_nf vluxseg e8.v vlxseg b.v 207 | define_for_all_nf vluxseg e16.v vlxseg h.v 208 | define_for_all_nf vluxseg e32.v vlxseg w.v 209 | 210 | define_for_all_nf vsoxseg e8.v vsxseg b.v 211 | define_for_all_nf vsoxseg e16.v vsxseg h.v 212 | define_for_all_nf vsoxseg e32.v vsxseg w.v 213 | define_for_all_nf vsuxseg e8.v vsxseg b.v 214 | define_for_all_nf vsuxseg e16.v vsxseg h.v 215 | define_for_all_nf vsuxseg e32.v vsxseg w.v 216 | 217 | 218 | .macro vsetvl0p7 rd, rs1, rs2, T=1, M=1 219 | vsetvl \rd, \rs1, \rs2 220 | .endm 221 | .macro vsetvli0p7 rd, rs1, e=e8, m=m1, T=1, M=1 222 | .ifc \m, mf2 223 | NOT SUPPORTED IN rvv0.7 224 | .endif 225 | .ifc \m, mf4 226 | NOT SUPPORTED IN rvv0.7 227 | .endif 228 | .ifc \m, mf8 229 | NOT SUPPORTED IN rvv0.7 230 | .endif 231 | vsetvli \rd, \rs1, \e, \m 232 | .endm 233 | 234 | #define vsetvl vsetvl0p7 235 | #define vsetvli vsetvli0p7 236 | 237 | 238 | 239 | # Copyright (c) 2023 Olaf Berstein 240 | # Permission is hereby granted, free of charge, to any person obtaining a copy 241 | # of this software and associated documentation files (the 
"Software"), to deal 242 | # in the Software without restriction, including without limitation the rights 243 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 244 | # copies of the Software, and to permit persons to whom the Software is 245 | # furnished to do so, subject to the following conditions: 246 | # The above copyright notice and this permission notice shall be included in 247 | # all copies or substantial portions of the Software. 248 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 249 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 250 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 251 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 252 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 253 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 254 | # SOFTWARE. 255 | 256 | -------------------------------------------------------------------------------- /vector-utf/16to8_gather.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | size_t utf16_to_utf8_scalar(uint16_t const *src, size_t count, char *dest); 4 | 5 | size_t 6 | utf16_to_utf8_rvv(uint16_t const *src, size_t count, char *dest) 7 | { 8 | size_t n = count; 9 | char *const destBeg = dest; 10 | size_t vl8m4 = __riscv_vsetvlmax_e8m4(); 11 | vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2(__riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4); 12 | 13 | for (size_t vl, vlOut; n > 0; ) { 14 | 15 | vl = __riscv_vsetvl_e16m2(n); 16 | 17 | vuint16m2_t v = __riscv_vle16_v_u16m2(src, vl); 18 | vbool8_t m234 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x80-1, vl); 19 | 20 | if (__riscv_vfirst_m_b8(m234,vl) < 0) { /* 1 byte utf8 */ 21 | vlOut = vl; 22 | __riscv_vse8_v_u8m1((uint8_t*)dest, __riscv_vncvt_x_x_w_u8m1(v, vlOut), vlOut); 23 | n -= vl, src += vl, dest += vlOut; 24 | continue; 25 | } 26 | 27 | vbool8_t m34 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x800-1, vl); 28 | 29 | if (__riscv_vfirst_m_b8(m34,vl) < 0) { /* 1/2 byte utf8 */ 30 | /* 0: [ aaa|aabbbbbb] 31 | * 1: [aabbbbbb| ] vsll 8 32 | * 2: [ | aaaaa] vsrl 6 33 | * 3: [00111111|00111111] 34 | * 4: [ bbbbbb|000aaaaa] (1|2)&3 35 | * 5: [11000000|11000000] 36 | * 6: [10bbbbbb|110aaaaa] 4|5 */ 37 | vuint16m2_t twoByte = 38 | __riscv_vand_vx_u16m2(__riscv_vor_vv_u16m2( 39 | __riscv_vsll_vx_u16m2(v, 8, vl), 40 | __riscv_vsrl_vx_u16m2(v, 6, vl), 41 | vl), 0b0011111100111111, vl); 42 | v = __riscv_vor_vx_u16m2_mu(m234, v, twoByte, 0b1000000011000000, vl); 43 | vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(v); 44 | 45 | /* Every high byte that is zero should be compressed 46 | * low bytes should never be compressed, so we set them 47 | * to all ones, and then create a non-zero bytes mask */ 48 | vbool4_t mcomp = __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2(__riscv_vor_vx_u16m2(v, 0xFF, vl)), 0, vl*2); 49 | vlOut = __riscv_vcpop_m_b4(mcomp, vl*2); 50 | 51 | vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl*2); 52 | __riscv_vse8_v_u8m2((uint8_t*)dest, vout, vlOut); 53 | 54 | n -= vl, src += vl, dest += vlOut; 55 | continue; 56 | } 57 | 58 | //vbool8_t sur = __riscv_vmsgtu_vx_u16m2_b8(v, 0xD800-1, vl); 59 | vbool8_t sur = __riscv_vmseq_vx_u16m2_b8(__riscv_vand_vx_u16m2(v, 0xF800, vl), 0xD800, vl); 60 | long first = __riscv_vfirst_m_b8(sur, vl); 61 | size_t tail = vl - first; 62 | vl = first < 0 ? 
vl : first; 63 | 64 | if (vl > 0) { /* 1/2/3 byte utf8 */ 65 | /* in: [aaaabbbb|bbcccccc] 66 | * v1: [0bcccccc| ] vsll 8 67 | * v1: [10cccccc| ] vsll 8 & 0b00111111 | 0b10000000 68 | * v2: [ |110bbbbb] vsrl 6 & 0b00111111 | 0b11000000 69 | * v2: [ |10bbbbbb] vsrl 6 & 0b00111111 | 0b10000000 70 | * v3: [ |1110aaaa] vsrl 12 | 0b11100000 71 | * 1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc] 72 | * 2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc] 73 | * 3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb] [10cccccc] 74 | */ 75 | vuint16m2_t v1, v2, v3, v12; 76 | v1 = __riscv_vor_vx_u16m2_mu(m234, v, __riscv_vand_vx_u16m2(v, 0b00111111, vl), 0b10000000, vl); 77 | v1 = __riscv_vsll_vx_u16m2(v1, 8, vl); 78 | 79 | v2 = __riscv_vor_vx_u16m2(__riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 6, vl), 0b00111111, vl), 0b10000000, vl); 80 | v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34,vl), v2, v2, 0b01000000, vl); 81 | v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 12, vl), 0b11100000, vl); 82 | v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl); 83 | 84 | vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1<<8, vl); 85 | vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl); 86 | vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123); 87 | 88 | vbool2_t mcomp = __riscv_vmor_mm_b2(m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl*4), vl*4); 89 | vlOut = __riscv_vcpop_m_b2(mcomp, vl*4); 90 | 91 | vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl*4); 92 | __riscv_vse8_v_u8m4((uint8_t*)dest, vout, vlOut); 93 | 94 | n -= vl, src += vl, dest += vlOut; 95 | } 96 | 97 | if (tail) while (n) { 98 | uint16_t word = *src; 99 | if((word & 0xFF80)==0) { 100 | break; 101 | } else if((word & 0xF800)==0) { 102 | break; 103 | } else if ((word & 0xF800) != 0xD800) { 104 | break; 105 | } else { 106 | // must be a surrogate pair 107 | if (n <= 1) return 0; 108 | uint16_t diff = word - 0xD800; 109 | if (diff > 0x3FF) return 0; 110 | uint16_t diff2 = src[1] - 0xDC00; 111 | if (diff2 > 0x3FF) return 0; 112 | 113 | uint32_t value = ((diff + 0x40) << 10) + diff2 ; 114 | // uint32_t value = (diff << 10) + diff2 + 0x10000; 115 | 116 | // will generate four UTF-8 bytes 117 | // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX 118 | *dest++ = (char)((value>>18) | 0b11110000); 119 | *dest++ = (char)(((value>>12) & 0b111111) | 0b10000000); 120 | *dest++ = (char)(((value>>6) & 0b111111) | 0b10000000); 121 | *dest++ = (char)((value & 0b111111) | 0b10000000); 122 | src += 2; 123 | n-=2; 124 | } 125 | } 126 | } 127 | 128 | return (size_t)(dest - destBeg); 129 | } 130 | 131 | -------------------------------------------------------------------------------- /vector-utf/8toN_gather.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | #if TO_16 5 | # define uintOut_t uint16_t 6 | # define utf8_to_utf32_scalar utf8_to_utf16_scalar 7 | # define utf8_to_utf32_rvv utf8_to_utf16_rvv 8 | #else 9 | # define uintOut_t uint32_t 10 | #endif 11 | 12 | size_t utf8_to_utf32_scalar(char const *src, size_t count, uintOut_t *dest); 13 | 14 | size_t 15 | utf8_to_utf32_rvv(char const *src, size_t count, uintOut_t *dest) 16 | { 17 | size_t tail = 3; 18 | if (count < tail) return utf8_to_utf32_scalar(src, count, dest); 19 | 20 | /* validate first three bytes */ 21 | { 22 | size_t idx = tail; 23 | while (idx < count && (src[idx] >> 6) == 0b10) 24 | ++idx; 25 | uintOut_t buf[10]; 26 | if (idx > tail + 3 || !utf8_to_utf32_scalar(src, idx, buf)) 27 | 
return 0; 28 | } 29 | 30 | size_t n = count - tail; 31 | uintOut_t *destBeg = dest; 32 | 33 | static const uint64_t err1m[] = { 0x0202020202020202, 0x4915012180808080 }; 34 | static const uint64_t err2m[] = { 0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB }; 35 | static const uint64_t err3m[] = { 0x0101010101010101, 0X01010101BABAAEE6 }; 36 | 37 | const vuint8m1_t err1tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2)); 38 | const vuint8m1_t err2tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2)); 39 | const vuint8m1_t err3tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2)); 40 | 41 | const size_t vl8m1 = __riscv_vsetvlmax_e8m1(); 42 | const size_t vl16m2 = __riscv_vsetvlmax_e16m2(); 43 | 44 | #if TO_16 45 | size_t vl8m2 = __riscv_vsetvlmax_e8m2(); 46 | const vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(__riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2); 47 | #endif 48 | 49 | for (size_t vl, vlOut; n > 0; n -= vl, src += vl, dest += vlOut) { 50 | 51 | vl = __riscv_vsetvl_e8m2(n); 52 | 53 | vuint8m2_t v0 = __riscv_vle8_v_u8m2((uint8_t const*)src, vl); 54 | uint64_t max = __riscv_vmv_x_s_u8m1_u8(__riscv_vredmaxu_vs_u8m2_u8m1(v0, __riscv_vmv_s_x_u8m1(0, vl), vl)); 55 | 56 | uint8_t next0 = src[vl+0]; 57 | uint8_t next1 = src[vl+1]; 58 | uint8_t next2 = src[vl+2]; 59 | 60 | /* fast path: ASCII */ 61 | if ((max|next0|next1|next2) < 0b10000000) { 62 | vlOut = vl; 63 | #if TO_16 64 | __riscv_vse16_v_u16m4(dest, __riscv_vzext_vf2_u16m4(v0, vlOut), vlOut); 65 | #else 66 | __riscv_vse32_v_u32m8(dest, __riscv_vzext_vf4_u32m8(v0, vlOut), vlOut); 67 | #endif 68 | continue; 69 | } 70 | 71 | /* see "Validating UTF-8 In Less Than One Instruction Per Byte" 72 | * https://arxiv.org/abs/2010.03090 */ 73 | vuint8m2_t v1 = __riscv_vslide1down_vx_u8m2(v0, next0, vl); 74 | vuint8m2_t v2 = __riscv_vslide1down_vx_u8m2(v1, next1, vl); 75 | vuint8m2_t v3 = __riscv_vslide1down_vx_u8m2(v2, next2, vl); 76 | 77 | vuint8m2_t s1 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2(__riscv_vreinterpret_v_u8m2_u16m2(v2), 4, vl16m2)); 78 | vuint8m2_t s3 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2(__riscv_vreinterpret_v_u8m2_u16m2(v3), 4, vl16m2)); 79 | 80 | vuint8m2_t idx2 = __riscv_vand_vx_u8m2(v2, 0xF, vl); 81 | vuint8m2_t idx1 = __riscv_vand_vx_u8m2(s1, 0xF, vl); 82 | vuint8m2_t idx3 = __riscv_vand_vx_u8m2(s3, 0xF, vl); 83 | 84 | #define VRGATHER_u8m1x2(tbl, idx) \ 85 | __riscv_vset_v_u8m1_u8m2(__riscv_vlmul_ext_v_u8m1_u8m2( \ 86 | __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 0), vl8m1)), 1, \ 87 | __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 1), vl8m1)); 88 | 89 | vuint8m2_t err1 = VRGATHER_u8m1x2(err1tbl, idx1); 90 | vuint8m2_t err2 = VRGATHER_u8m1x2(err2tbl, idx2); 91 | vuint8m2_t err3 = VRGATHER_u8m1x2(err3tbl, idx3); 92 | vint8m2_t errs = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vand_vv_u8m2(__riscv_vand_vv_u8m2(err1, err2, vl), err3, vl)); 93 | 94 | vbool4_t is_3 = __riscv_vmsgtu_vx_u8m2_b4(v1, 0b11100000-1, vl); 95 | vbool4_t is_4 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b11110000-1, vl); 96 | vbool4_t is_34 = __riscv_vmor_mm_b4(is_3, is_4, vl); 97 | vbool4_t err34 = __riscv_vmxor_mm_b4(is_34, __riscv_vmslt_vx_i8m2_b4(errs, 0, vl), vl); 98 | vbool4_t errm = __riscv_vmor_mm_b4(__riscv_vmsgt_vx_i8m2_b4(errs, 0, vl), err34, vl); 99 | if (__riscv_vfirst_m_b4(errm , vl) >= 0) 100 | return 0; 101 | 102 | /* decoding */ 103 | 104 | /* mask of non continuation bytes */ 105 | vbool4_t m = 
__riscv_vmsgt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v0), -65, vl); 106 | vlOut = __riscv_vcpop_m_b4(m, vl); 107 | 108 | /* extract first and second bytes */ 109 | vuint8m2_t b1 = __riscv_vcompress_vm_u8m2(v0, m, vl); 110 | vuint8m2_t b2 = __riscv_vcompress_vm_u8m2(v1, m, vl); 111 | 112 | /* fast path: one and two byte */ 113 | if (max < 0b11100000) { 114 | b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut); 115 | 116 | vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut); 117 | b1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut); 118 | 119 | vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(b1, __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1<<6, m1, vlOut), vlOut); 120 | b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut); 121 | #if TO_16 122 | __riscv_vse16_v_u16m4(dest, b12, vlOut); 123 | #else 124 | __riscv_vse32_v_u32m8(dest, __riscv_vzext_vf2_u32m8(b12, vlOut), vlOut); 125 | #endif 126 | continue; 127 | } 128 | 129 | /* fast path: one, two and three byte */ 130 | if (max < 0b11110000) { 131 | vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl); 132 | 133 | b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut); 134 | b3 = __riscv_vand_vx_u8m2(b3, 0b00111111, vlOut); 135 | 136 | vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut); 137 | vbool4_t m3 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b11011111, vlOut); 138 | 139 | vuint8m2_t t1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut); 140 | b1 = __riscv_vand_vx_u8m2_mu(m3, t1, b1, 15, vlOut); 141 | 142 | vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(b1, __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1<<6, m1, vlOut), vlOut); 143 | b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut); 144 | vuint16m4_t b123 = __riscv_vwaddu_wv_u16m4_mu(m3, b12, __riscv_vsll_vx_u16m4_mu(m3, b12, b12, 6, vlOut), b3, vlOut); 145 | #if TO_16 146 | __riscv_vse16_v_u16m4(dest, b123, vlOut); 147 | #else 148 | __riscv_vse32_v_u32m8(dest, __riscv_vzext_vf2_u32m8(b123, vlOut), vlOut); 149 | #endif 150 | continue; 151 | } 152 | 153 | 154 | /* extract third and fourth bytes */ 155 | vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl); 156 | vuint8m2_t b4 = __riscv_vcompress_vm_u8m2(v3, m, vl); 157 | 158 | #define M1_COMMON(idx) \ 159 | vuint8m1_t c1 = __riscv_vget_v_u8m2_u8m1(b1, idx); \ 160 | vuint8m1_t c2 = __riscv_vget_v_u8m2_u8m1(b2, idx); \ 161 | vuint8m1_t c3 = __riscv_vget_v_u8m2_u8m1(b3, idx); \ 162 | vuint8m1_t c4 = __riscv_vget_v_u8m2_u8m1(b4, idx); \ 163 | /* remove prefix from trailing bytes */ \ 164 | c2 = __riscv_vand_vx_u8m1(c2, 0b00111111, vlOut); \ 165 | c3 = __riscv_vand_vx_u8m1(c3, 0b00111111, vlOut); \ 166 | c4 = __riscv_vand_vx_u8m1(c4, 0b00111111, vlOut); \ 167 | /* remove prefix from leading bytes 168 | * 169 | * We shift left and then right by the number of bytes in the prefix, 170 | * which can be calculated as follows: 171 | * x max(x-10, 0) 172 | * 0xxx -> 0000-0111 -> sift by 0 or 1 -> 0 173 | * 10xx -> 1000-1011 -> don't care 174 | * 110x -> 1100,1101 -> sift by 3 -> 2,3 175 | * 1110 -> 1110 -> sift by 4 -> 4 176 | * 1111 -> 1111 -> sift by 5 -> 5 177 | * 178 | * vssubu.vx v, 10, (max(x-10, 0)) almost gives us what we want, we 179 | * just need to manually detect and handle the one special case: 180 | */ \ 181 | vuint8m1_t shift = __riscv_vsrl_vx_u8m1(c1, 4, vlOut); \ 182 | shift = __riscv_vmerge_vxm_u8m1(__riscv_vssubu_vx_u8m1(shift, 10, vlOut), 3, __riscv_vmseq_vx_u8m1_b8(shift, 12, vlOut), vlOut); \ 183 | \ 184 | c1 = __riscv_vsll_vv_u8m1(c1, shift, vlOut); \ 185 | c1 = __riscv_vsrl_vv_u8m1(c1, shift, 
vlOut); \ 186 | /* unconditionally widen and combine to c1234 */ \ 187 | vuint16m2_t c34 = __riscv_vwaddu_wv_u16m2(__riscv_vwmulu_vx_u16m2(c3,1<<6, vlOut), c4, vlOut); \ 188 | vuint16m2_t c12 = __riscv_vwaddu_wv_u16m2(__riscv_vwmulu_vx_u16m2(c1,1<<6, vlOut), c2, vlOut); \ 189 | vuint32m4_t c1234 = __riscv_vwaddu_wv_u32m4(__riscv_vwmulu_vx_u32m4(c12, 1 << 12, vlOut), c34, vlOut); \ 190 | /* derive required right-shift amount from `shift` to reduce 191 | * c1234 to the required number of bytes */ \ 192 | c1234 = __riscv_vsrl_vv_u32m4(c1234, __riscv_vzext_vf4_u32m4( \ 193 | __riscv_vmul_vx_u8m1(__riscv_vrsub_vx_u8m1(__riscv_vssubu_vx_u8m1(shift, 2, vlOut), 3, vlOut), 6, vlOut), \ 194 | vlOut), vlOut); 195 | 196 | #define DOWN __riscv_vreinterpret_v_u32m4_u16m4 197 | #define UP __riscv_vreinterpret_v_u16m4_u32m4 198 | 199 | #if !TO_16 200 | #define M1_STORE \ 201 | size_t vlDest = vlOut; \ 202 | __riscv_vse32_v_u32m4(dest, c1234, vlDest); 203 | #else 204 | #define M1_STORE \ 205 | /* convert [000000000000aaaa|aaaaaabbbbbbbbbb] 206 | * to [110111bbbbbbbbbb|110110aaaaaaaaaa] */ \ 207 | vuint32m4_t sur = __riscv_vsub_vx_u32m4(c1234, 0x10000, vlOut); \ 208 | sur = __riscv_vor_vv_u32m4( \ 209 | __riscv_vsll_vx_u32m4(sur, 16, vlOut), \ 210 | __riscv_vsrl_vx_u32m4(sur, 10, vlOut), \ 211 | vlOut); \ 212 | sur = __riscv_vand_vx_u32m4(sur, 0x3FF03FF, vlOut); \ 213 | sur = __riscv_vor_vx_u32m4(sur, 0xDC00D800, vlOut); \ 214 | /* merge 1 byte c1234 and 2 byte sur */ \ 215 | vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(c1234, 0xFFFF, vlOut); \ 216 | c1234 = __riscv_vmerge_vvm_u32m4(c1234, sur, m4, vlOut); \ 217 | /* compress and store */ \ 218 | vbool4_t mOut = __riscv_vmor_mm_b4(__riscv_vmsne_vx_u16m4_b4(DOWN(c1234), 0, vlOut*2), m4even, vlOut*2); \ 219 | c1234 = UP(__riscv_vcompress_vm_u16m4(DOWN(c1234), mOut, vlOut*2)); \ 220 | size_t vlDest = __riscv_vcpop_m_b4(mOut, vlOut*2); \ 221 | __riscv_vse16_v_u16m4(dest, DOWN(c1234), vlDest); 222 | #endif 223 | 224 | /* Unrolling this manually reduces register pressure and allows 225 | * us to terminate early. */ 226 | { 227 | size_t vlOutm2 = vlOut; 228 | vlOut = __riscv_vsetvl_e8m1(vlOut < vl8m1 ? 
vlOut : vl8m1); 229 | M1_COMMON(0) 230 | M1_STORE 231 | if (vlOutm2 == vlOut) { 232 | vlOut = vlDest; 233 | continue; 234 | } 235 | 236 | dest += vlDest; 237 | vlOut = vlOutm2 - vlOut; 238 | } 239 | { 240 | M1_COMMON(1) 241 | M1_STORE 242 | vlOut = vlDest; 243 | } 244 | 245 | #undef M1_COMMON 246 | #undef M1_STORE 247 | #undef DOWN 248 | #undef UP 249 | } 250 | 251 | /* validate the last character and reparse it + tail */ 252 | if (count > tail) { 253 | if ((src[0] >> 6) == 0b10) 254 | --dest; 255 | while ((src[0] >> 6) == 0b10 && tail < count) 256 | --src, ++tail; 257 | #if TO_16 258 | /* go back one more, when on high surrogate */ 259 | if (dest[-1] >= 0xD800 && dest[-1] <= 0xDBFF) 260 | --dest; 261 | #endif 262 | } 263 | size_t ret = utf8_to_utf32_scalar(src, tail, dest); 264 | if (ret == 0) return 0; 265 | return (size_t)(dest - destBeg) + ret; 266 | } 267 | 268 | #undef uintOut_t 269 | #undef utf8_to_utf32_scalar 270 | #undef utf8_to_utf32_rvv 271 | 272 | -------------------------------------------------------------------------------- /vector-utf/Makefile: -------------------------------------------------------------------------------- 1 | .POSIX: 2 | 3 | include ../config.mk 4 | 5 | BENCHS=bench_8to32 bench_8to16 bench_16to8 6 | 7 | all: ${BENCHS} 8 | 9 | bench_8to16: bench.c 8toN_gather.c 10 | ${CC} ${CFLAGS} -DNAME=utf8_to_utf16 -DTO_16=1 8toN_gather.c bench.c -o $@ 11 | 12 | bench_8to32: bench.c 8toN_gather.c 13 | ${CC} ${CFLAGS} -DNAME=utf8_to_utf32 8toN_gather.c bench.c -o $@ 14 | 15 | bench_16to8: bench.c 16to8_gather.c 16 | ${CC} ${CFLAGS} -DNAME=utf16_to_utf8 16to8_gather.c bench.c -o $@ 17 | 18 | clean: 19 | rm -f ${BENCHS} 20 | 21 | 22 | -------------------------------------------------------------------------------- /vector-utf/bench.c: -------------------------------------------------------------------------------- 1 | #define NOLIBC_MAIN 2 | #include "../nolibc.h" 3 | #include "scalar.h" 4 | 5 | size_t utf8_to_utf16_rvv(char const *src, size_t n, uint16_t *dest); 6 | size_t utf8_to_utf32_rvv(char const *src, size_t n, uint32_t *dest); 7 | size_t utf16_to_utf8_rvv(const uint16_t *src, size_t count, char *dest); 8 | 9 | 10 | #define MAX_IN (1024*1024*4) 11 | static uint64_t in[MAX_IN]; 12 | static uint64_t out[MAX_IN * 4]; 13 | 14 | #define NUM_REPEATS 30000 15 | 16 | #define PCAT(a,b) a##b 17 | #define CAT(a,b) PCAT(a,b) 18 | #define RVV CAT(NAME, _rvv) 19 | #define SCALAR CAT(NAME, _scalar) 20 | 21 | #define SCALE_utf8_to_utf16 1 22 | #define SCALE_utf8_to_utf32 1 23 | #define SCALE_utf16_to_utf8 2 24 | #define SCALE CAT(SCALE_, NAME) 25 | 26 | int 27 | main(void) 28 | { 29 | size_t inSize = memread(in, sizeof in); 30 | if (inSize == 0) { 31 | print("No input provided, please pipe it into the program\n"); 32 | return 1; 33 | } 34 | for (size_t s = 1; s; ) { 35 | s = memread((uint8_t*)in + inSize, (sizeof in) - inSize); 36 | inSize += s; 37 | } 38 | 39 | uint64_t beg, end; 40 | 41 | beg = rv_cycles(); 42 | for (size_t j = 0; j < NUM_REPEATS; ++j) 43 | SCALAR((void*)in, inSize / SCALE, (void*)out); 44 | end = rv_cycles(); 45 | 46 | double scalar_bc = inSize*(double)NUM_REPEATS / (end - beg); 47 | 48 | beg = rv_cycles(); 49 | for (size_t j = 0; j < NUM_REPEATS; ++j) 50 | RVV((void*)in, inSize / SCALE, (void*)out); 51 | end = rv_cycles(); 52 | 53 | double rvv_bc = inSize*(double)NUM_REPEATS / (end - beg); 54 | 55 | print("scalar: ")(f,scalar_bc)(" b/c rvv: ")(f,rvv_bc)(" b/c speedup: ")(f,rvv_bc/scalar_bc)("x\n"); 56 | 57 | return 0; 58 | } 59 | 60 | 
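Aside on the leading-byte handling in 8toN_gather.c above: as its comment describes, the prefix length of the first byte is derived from the high nibble as max(nibble - 10, 0), with the 110x case forced to 3, and the prefix bits are then cleared by shifting left and right by that amount. A minimal standalone C sketch of the same idea (illustrative only, not part of the repository; the helper name is made up):

#include <assert.h>
#include <stdint.h>

/* Clear the UTF-8 prefix bits of a leading byte, mirroring the
 * vssubu/vmerge/vsll/vsrl sequence in 8toN_gather.c. */
static uint8_t strip_prefix(uint8_t lead)
{
	uint8_t nib = lead >> 4;
	/* 0xxxxxxx -> 0, 110xxxxx -> 3, 1110xxxx -> 4, 11110xxx -> 5 */
	uint8_t shift = nib == 12 ? 3 : (nib > 10 ? nib - 10 : 0);
	return (uint8_t)((uint8_t)(lead << shift) >> shift);
}

int main(void)
{
	assert(strip_prefix(0x41) == 0x41); /* ASCII: unchanged      */
	assert(strip_prefix(0xC3) == 0x03); /* 110xxxxx: 5 data bits */
	assert(strip_prefix(0xE2) == 0x02); /* 1110xxxx: 4 data bits */
	assert(strip_prefix(0xF0) == 0x00); /* 11110xxx: 3 data bits */
	return 0;
}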
-------------------------------------------------------------------------------- /vector-utf/rvv-0.7.1/8to16.S: -------------------------------------------------------------------------------- 1 | 2 | #ifndef __riscv_v 3 | # include "../../thirdparty/rvv-rollback.S" 4 | #endif 5 | .text 6 | .balign 8 7 | 8 | // Changes from original codegen (clang-18): 9 | // * rvv-rollback for direct translation 10 | // * vsetivli -> vsetvl 11 | // * vle64 -> vle32 12 | // * vmvNr -> vmv.v.v & 2 vsetvli 13 | // * vzext.vN -> N/2 vwaddu & N/2+1 vsetvli 14 | // * vredmax -> vfirst vmsgtu 15 | // the SG2042 I've got access to seems to produce the wrong result for 16 | // vredmax, so we need to replace it 17 | 18 | 19 | .global utf8_to_utf16_rvv 20 | utf8_to_utf16_rvv: 21 | li a4, 3 22 | mv a3, a1 23 | bgeu a1, a4, .LBB0_2 24 | mv a1, a3 25 | tail utf8_to_utf16_scalar 26 | .LBB0_2: 27 | addi sp, sp, -96 28 | sd ra, 88(sp) 29 | sd s0, 80(sp) 30 | sd s1, 72(sp) 31 | sd s2, 64(sp) 32 | sd s3, 56(sp) 33 | sd s4, 48(sp) 34 | sd s5, 40(sp) 35 | sd s6, 32(sp) 36 | sd s7, 24(sp) 37 | mv s2, a2 38 | bne a3, a4, .LBB0_4 39 | mv s1, a3 40 | li a1, 3 41 | j .LBB0_9 42 | .LBB0_4: 43 | li a1, 3 44 | li a2, 128 45 | .LBB0_5: 46 | add a4, a0, a1 47 | lbu a4, 0(a4) 48 | andi a4, a4, 192 49 | bne a4, a2, .LBB0_8 50 | addi a1, a1, 1 51 | bne a3, a1, .LBB0_5 52 | mv s1, a3 53 | mv a1, a3 54 | li a2, 6 55 | bgeu a2, a3, .LBB0_9 56 | j .LBB0_24 57 | .LBB0_8: 58 | mv s1, a3 59 | li a2, 6 60 | bltu a2, a1, .LBB0_24 61 | .LBB0_9: 62 | addi a2, sp, 4 63 | mv s0, a0 64 | call utf8_to_utf16_scalar 65 | mv a1, a0 66 | beqz a0, .LBB0_24 67 | mv a0, s0 68 | addi a1, s1, -3 69 | vsetvli a2, zero, e8, m2, ta, ma 70 | beqz a1, .LBB0_32 71 | mv t5, s1 72 | li a2, 4 73 | vsetvli zero, a2, e32, m1, ta, ma 74 | la a2, utf8_to_utf16_rvv.err1m 75 | vle32.v v10, (a2) 76 | la a2, utf8_to_utf16_rvv.err2m 77 | vle32.v v11, (a2) 78 | la a2, utf8_to_utf16_rvv.err3m 79 | vle32.v v12, (a2) 80 | vsetvli a2, zero, e8, m2, ta, ma 81 | vid.v v8 82 | vand.vi v8, v8, 1 83 | vmseq.vi v13, v8, 0 84 | li s7, 127 85 | li s6, -33 86 | li a7, -17 87 | li s1, 63 88 | li a6, -65 89 | li s5, 64 90 | li t0, 10 91 | lui t1, 1 92 | li t2, 2 93 | li t3, 6 94 | lui t4, 16 95 | lui a2, 16368 96 | addi t6, a2, 1023 97 | addi s3, t4, -1 98 | lui a2, 901134 99 | addi s4, a2, -2048 100 | mv s0, s2 101 | j .LBB0_14 102 | .LBB0_12: 103 | vsetvli zero, zero, e8, m2, ta, ma 104 | vwaddu.vx v16, v8, x0 105 | vsetvli zero, zero, e16, m4, ta, ma 106 | vse16.v v16, (s0) 107 | mv a5, a3 108 | .LBB0_13: 109 | sub a1, a1, a3 110 | slli a5, a5, 1 111 | add s0, s0, a5 112 | beqz a1, .LBB0_25 113 | .LBB0_14: 114 | vsetvli a3, a1, e8, m2, ta, ma 115 | vle8.v v8, (a0) 116 | vmsgtu.vx v14, v8, s7 117 | vfirst.m a2, v14 118 | add a0, a0, a3 119 | bltz a2, .LBB0_12 120 | lbu a2, 0(a0) 121 | lbu a4, 1(a0) 122 | lbu a5, 2(a0) 123 | vslide1down.vx v16, v8, a2 124 | vslide1down.vx v18, v16, a4 125 | vslide1down.vx v20, v18, a5 126 | vsetvli a2, zero, e16, m2, ta, ma 127 | vsrl.vi v14, v18, 4 128 | vsrl.vi v22, v20, 4 129 | vsetvli zero, a3, e8, m2, ta, ma 130 | vand.vi v24, v18, 15 131 | vand.vi v14, v14, 15 132 | vand.vi v22, v22, 15 133 | vsetvli a2, zero, e8, m1, ta, ma 134 | vrgather.vv v26, v10, v14 135 | vrgather.vv v27, v10, v15 136 | vrgather.vv v14, v11, v24 137 | vrgather.vv v15, v11, v25 138 | vrgather.vv v24, v12, v22 139 | vrgather.vv v25, v12, v23 140 | vsetvli zero, a3, e8, m2, ta, ma 141 | vand.vv v14, v26, v14 142 | vand.vv v14, v14, v24 143 | vmsgtu.vx v23, v16, s6 144 | vmsgtu.vx v22, 
v8, a7 145 | vmor.mm v23, v23, v22 146 | vmsgtu.vx v24, v14, s7 147 | vmxor.mm v23, v23, v24 148 | vmsgt.vi v24, v14, 0 149 | vmor.mm v14, v24, v23 150 | vfirst.m a2, v14 151 | bgez a2, .LBB0_24 152 | vsrl.vi v14, v8, 6 153 | vmsne.vi v23, v14, 2 154 | vcpop.m a5, v23 155 | vcompress.vm v14, v8, v23 156 | vmsgtu.vx v24, v8, s6 157 | vfirst.m a2, v24 158 | vcompress.vm v8, v16, v23 159 | bltz a2, .LBB0_20 160 | vfirst.m a2, v22 161 | vcompress.vm v16, v18, v23 162 | bltz a2, .LBB0_21 163 | vcompress.vm v18, v20, v23 164 | vsetvli a4, a5, e8, m1, ta, ma 165 | vand.vx v8, v8, s1 166 | vand.vx v16, v16, s1 167 | vand.vx v18, v18, s1 168 | vsrl.vi v20, v14, 4 169 | vmseq.vi v0, v20, 12 170 | vssubu.vx v20, v20, t0 171 | vmerge.vim v20, v20, 3, v0 172 | vsll.vv v14, v14, v20 173 | vsrl.vv v14, v14, v20 174 | vwmulu.vx v22, v16, s5 175 | vwaddu.wv v22, v22, v18 176 | vwmulu.vx v24, v14, s5 177 | vwaddu.wv v24, v24, v8 178 | vsetvli zero, zero, e16, m2, ta, ma 179 | vwmulu.vx v28, v24, t1 180 | vwaddu.wv v28, v28, v22 181 | vsetvli zero, zero, e8, m1, ta, ma 182 | vssubu.vx v8, v20, t2 183 | vrsub.vi v8, v8, 3 184 | vmul.vx v8, v8, t3 185 | vsetvli zero, zero, e8, m1, ta, ma 186 | vwaddu.vx v0, v8, x0 187 | vsetvli zero, zero, e16, m2, ta, ma 188 | vwaddu.vx v20, v0, x0 189 | vsetvli zero, zero, e32, m4, ta, mu 190 | vsrl.vv v20, v28, v20 191 | vsub.vx v24, v20, t4 192 | vsll.vi v28, v24, 16 193 | vsrl.vi v24, v24, 10 194 | vor.vv v24, v28, v24 195 | vmsgtu.vx v0, v20, s3 196 | vand.vx v24, v24, t6 197 | vor.vx v20, v24, s4, v0.t 198 | slli a2, a4, 1 199 | vsetvli zero, a2, e16, m4, ta, ma 200 | vmsne.vi v8, v20, 0 201 | vmor.mm v8, v8, v13 202 | vcompress.vm v24, v20, v8 203 | vcpop.m a2, v8 204 | vsetvli zero, a2, e16, m4, ta, ma 205 | vse16.v v24, (s0) 206 | bne a5, a4, .LBB0_22 207 | mv a5, a2 208 | j .LBB0_13 209 | .LBB0_20: 210 | vsetvli zero, a5, e8, m2, ta, mu 211 | vmsgtu.vx v0, v14, a6 212 | vand.vx v8, v8, s1 213 | vand.vx v14, v14, s1, v0.t 214 | vmv.v.i v16, 1 215 | vmerge.vxm v16, v16, s5, v0 216 | vwmulu.vv v20, v14, v16 217 | vwaddu.wv v20, v20, v8, v0.t 218 | vsetvli zero, a5, e16, m4, ta, ma 219 | j .LBB0_23 220 | .LBB0_21: 221 | vsetvli zero, a5, e8, m2, ta, mu 222 | vand.vx v18, v8, s1 223 | vand.vx v16, v16, s1 224 | vmsgtu.vx v8, v14, a6 225 | vmsgtu.vx v9, v14, s6 226 | vmv.v.v v20, v14 227 | vsetvli zero, zero, e8, m1, ta, ma 228 | vmv.v.v v0, v8 229 | vsetvli zero, a5, e8, m2, ta, mu 230 | vand.vx v20, v14, s1, v0.t 231 | vsetvli zero, zero, e8, m1, ta, ma 232 | vmv.v.v v0, v9 233 | vsetvli zero, a5, e8, m2, ta, mu 234 | vand.vi v20, v14, 15, v0.t 235 | vmv.v.i v14, 1 236 | vsetvli zero, zero, e8, m1, ta, ma 237 | vmv.v.v v0, v8 238 | vsetvli zero, a5, e8, m2, ta, mu 239 | vmerge.vxm v14, v14, s5, v0 240 | vwmulu.vv v24, v20, v14 241 | vwaddu.wv v24, v24, v18, v0.t 242 | vsetvli zero, zero, e16, m4, ta, mu 243 | vsetvli zero, zero, e8, m1, ta, ma 244 | vmv.v.v v0, v9 245 | vsetvli zero, a5, e16, m4, ta, mu 246 | vmv.v.v v20, v24 247 | vsll.vi v20, v24, 6, v0.t 248 | vsetvli zero, zero, e8, m2, ta, mu 249 | vwaddu.wv v24, v20, v16, v0.t 250 | vsetvli zero, zero, e16, m4, ta, mu 251 | vse16.v v24, (s0) 252 | j .LBB0_13 253 | .LBB0_22: 254 | sub a5, a5, a4 255 | slli a2, a2, 1 256 | add s0, s0, a2 257 | vsetvli zero, a5, e8, m1, ta, ma 258 | vand.vx v8, v9, s1 259 | vand.vx v9, v17, s1 260 | vand.vx v14, v19, s1 261 | vsrl.vi v16, v15, 4 262 | vmseq.vi v0, v16, 12 263 | vssubu.vx v16, v16, t0 264 | vmerge.vim v16, v16, 3, v0 265 | vsll.vv v15, v15, v16 266 | vsrl.vv v15, v15, 
v16 267 | vwmulu.vx v18, v9, s5 268 | vwaddu.wv v18, v18, v14 269 | vwmulu.vx v20, v15, s5 270 | vwaddu.wv v20, v20, v8 271 | vsetvli zero, zero, e16, m2, ta, ma 272 | vwmulu.vx v24, v20, t1 273 | vwaddu.wv v24, v24, v18 274 | vsetvli zero, zero, e8, m1, ta, ma 275 | vssubu.vx v8, v16, t2 276 | vrsub.vi v8, v8, 3 277 | vmul.vx v8, v8, t3 278 | vsetvli zero, zero, e32, m4, ta, mu 279 | vsetvli zero, zero, e8, m1, ta, ma 280 | vwaddu.vx v0, v8, x0 281 | vsetvli zero, zero, e16, m2, ta, ma 282 | vwaddu.vx v16, v0, x0 283 | vsetvli zero, zero, e32, m4, ta, mu 284 | vsrl.vv v16, v24, v16 285 | vsub.vx v20, v16, t4 286 | vsll.vi v24, v20, 16 287 | vsrl.vi v20, v20, 10 288 | vor.vv v20, v24, v20 289 | vmsgtu.vx v0, v16, s3 290 | vand.vx v20, v20, t6 291 | vor.vx v16, v20, s4, v0.t 292 | slli a5, a5, 1 293 | vsetvli zero, a5, e16, m4, ta, ma 294 | vmsne.vi v8, v16, 0 295 | vmor.mm v8, v8, v13 296 | vcompress.vm v20, v16, v8 297 | vcpop.m a5, v8 298 | vsetvli zero, a5, e16, m4, ta, ma 299 | .LBB0_23: 300 | vse16.v v20, (s0) 301 | j .LBB0_13 302 | .LBB0_24: 303 | li a0, 0 304 | j .LBB0_34 305 | .LBB0_25: 306 | li a1, 3 307 | beq t5, a1, .LBB0_33 308 | lbu a1, 0(a0) 309 | andi a3, a1, 192 310 | addi a1, a3, -128 311 | snez a1, a1 312 | addi a1, a1, -1 313 | andi a1, a1, -2 314 | add s0, s0, a1 315 | li a2, 128 316 | li a1, 3 317 | bne a3, a2, .LBB0_30 318 | li a1, 3 319 | .LBB0_28: 320 | lbu a3, -1(a0) 321 | addi a0, a0, -1 322 | andi a3, a3, 192 323 | addi a1, a1, 1 324 | bne a3, a2, .LBB0_30 325 | bltu a1, t5, .LBB0_28 326 | .LBB0_30: 327 | lhu a2, -2(s0) 328 | srli a2, a2, 10 329 | li a3, 54 330 | bne a2, a3, .LBB0_33 331 | addi s0, s0, -2 332 | j .LBB0_33 333 | .LBB0_32: 334 | li a1, 3 335 | mv s0, s2 336 | .LBB0_33: 337 | mv a2, s0 338 | call utf8_to_utf16_scalar 339 | seqz a1, a0 340 | sub a2, s0, s2 341 | srai a2, a2, 1 342 | add a0, a0, a2 343 | addi a1, a1, -1 344 | and a0, a0, a1 345 | .LBB0_34: 346 | ld ra, 88(sp) 347 | ld s0, 80(sp) 348 | ld s1, 72(sp) 349 | ld s2, 64(sp) 350 | ld s3, 56(sp) 351 | ld s4, 48(sp) 352 | ld s5, 40(sp) 353 | ld s6, 32(sp) 354 | ld s7, 24(sp) 355 | addi sp, sp, 96 356 | ret 357 | 358 | 359 | .data 360 | utf8_to_utf16_rvv.err1m: 361 | .quad 144680345676153346 362 | .quad 5266116582681116800 363 | 364 | utf8_to_utf16_rvv.err2m: 365 | .quad -3761689263670582297 366 | .quad -3761671395393942581 367 | 368 | utf8_to_utf16_rvv.err3m: 369 | .quad 72340172838076673 370 | .quad 72340175954030310 371 | 372 | -------------------------------------------------------------------------------- /vector-utf/rvv-0.7.1/8to32.S: -------------------------------------------------------------------------------- 1 | #ifndef __riscv_v 2 | # include "../../thirdparty/rvv-rollback.S" 3 | #endif 4 | .text 5 | .balign 8 6 | 7 | // Changes from original codegen (clang-18): 8 | // * rvv-rollback for direct translation 9 | // * vsetivli -> vsetvl 10 | // * vle64 -> vle32 11 | // * vmvNr -> vmv.v.v & 2 vsetvli 12 | // * vzext.vN -> N/2 vwaddu.vx & N/2+1 vsetvli 13 | // * vredmax -> vfirst vmsgtu 14 | // the SG2042 I've got access to seems to produce the wrong result for 15 | // vredmax, so we need to replace it 16 | 17 | .global utf8_to_utf32_rvv 18 | utf8_to_utf32_rvv: 19 | li a4, 3 20 | mv a3, a1 21 | bgeu a1, a4, .LBB0_2 22 | mv a1, a3 23 | tail utf8_to_utf32_scalar 24 | .LBB0_2: 25 | addi sp, sp, -80 26 | sd ra, 72(sp) 27 | sd s0, 64(sp) 28 | sd s1, 56(sp) 29 | sd s2, 48(sp) 30 | sd s3, 40(sp) 31 | mv s2, a2 32 | bne a3, a4, .LBB0_4 33 | mv s1, a3 34 | li a1, 3 35 | j .LBB0_9 36 | .LBB0_4: 
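// .LBB0_4/.LBB0_5 extend the 3-byte prologue past any continuation bytes
// ((byte & 192) == 128, i.e. 0b10xxxxxx), mirroring the "validate first
// three bytes" loop at the top of 8toN_gather.c.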
37 | li a1, 3 38 | li a2, 128 39 | .LBB0_5: 40 | add a4, a0, a1 41 | lbu a4, 0(a4) 42 | andi a4, a4, 192 43 | bne a4, a2, .LBB0_8 44 | addi a1, a1, 1 45 | bne a3, a1, .LBB0_5 46 | mv s1, a3 47 | mv a1, a3 48 | li a2, 6 49 | bgeu a2, a3, .LBB0_9 50 | j .LBB0_22 51 | .LBB0_8: 52 | mv s1, a3 53 | li a2, 6 54 | bltu a2, a1, .LBB0_22 55 | .LBB0_9: 56 | mv a2, sp 57 | mv s0, a0 58 | call utf8_to_utf32_scalar 59 | mv a1, a0 60 | beqz a0, .LBB0_22 61 | mv a0, s0 62 | addi a1, s1, -3 63 | vsetvli a2, zero, e16, m2, ta, ma 64 | beqz a1, .LBB0_28 65 | mv t4, s1 66 | li a2, 4 67 | vsetvli zero, a2, e32, m1, ta, ma 68 | la a2, utf8_to_utf32_rvv.err1m 69 | vle32.v v10, (a2) 70 | la a2, utf8_to_utf32_rvv.err2m 71 | vle32.v v11, (a2) 72 | la a2, utf8_to_utf32_rvv.err3m 73 | vle32.v v12, (a2) 74 | li s3, 127 75 | li t6, -33 76 | li a7, -17 77 | li s1, 63 78 | li a6, -65 79 | li t5, 64 80 | li t0, 10 81 | lui t1, 1 82 | li t2, 2 83 | li t3, 6 84 | mv s0, s2 85 | j .LBB0_14 86 | nop # needed for alignment, I think :3 87 | .LBB0_12: 88 | vsetvli zero, zero, e8, m2, ta, ma 89 | vwaddu.vx v0, v8, x0 90 | vsetvli zero, zero, e16, m4, ta, ma 91 | vwaddu.vx v16, v0, x0 92 | vsetvli zero, zero, e32, m8, ta, ma 93 | vse32.v v16, (s0) 94 | mv a5, a2 95 | .LBB0_13: 96 | sub a1, a1, a2 97 | slli a5, a5, 2 98 | add s0, s0, a5 99 | beqz a1, .LBB0_23 100 | .LBB0_14: 101 | vsetvli a2, a1, e8, m2, ta, ma 102 | vle8.v v8, (a0) 103 | vmsgtu.vx v13, v8, s3 104 | vfirst.m a4, v13 105 | add a0, a0, a2 106 | bltz a4, .LBB0_12 107 | lbu a4, 0(a0) 108 | lbu a5, 1(a0) 109 | lbu a3, 2(a0) 110 | vslide1down.vx v20, v8, a4 111 | vslide1down.vx v18, v20, a5 112 | vslide1down.vx v16, v18, a3 113 | vsetvli a3, zero, e16, m2, ta, ma 114 | vsrl.vi v14, v18, 4 115 | vsrl.vi v22, v16, 4 116 | vsetvli zero, a2, e8, m2, ta, ma 117 | vand.vi v24, v18, 15 118 | vand.vi v14, v14, 15 119 | vand.vi v22, v22, 15 120 | vsetvli a3, zero, e8, m1, ta, ma 121 | vrgather.vv v26, v10, v14 122 | vrgather.vv v27, v10, v15 123 | vrgather.vv v14, v11, v24 124 | vrgather.vv v15, v11, v25 125 | vrgather.vv v24, v12, v22 126 | vrgather.vv v25, v12, v23 127 | vsetvli zero, a2, e8, m2, ta, ma 128 | vand.vv v14, v26, v14 129 | vand.vv v14, v14, v24 130 | vmsgtu.vx v22, v20, t6 131 | vmsgtu.vx v13, v8, a7 132 | vmor.mm v22, v22, v13 133 | vmsgtu.vx v23, v14, s3 134 | vmxor.mm v22, v22, v23 135 | vmsgt.vi v23, v14, 0 136 | vmor.mm v14, v23, v22 137 | vfirst.m a3, v14 138 | bgez a3, .LBB0_22 139 | vsrl.vi v14, v8, 6 140 | vmsne.vi v22, v14, 2 141 | vcpop.m a5, v22 142 | vcompress.vm v14, v8, v22 143 | vmsgtu.vx v23, v8, t6 144 | vfirst.m a3, v23 145 | vcompress.vm v8, v20, v22 146 | bltz a3, .LBB0_20 147 | vfirst.m a3, v13 148 | vcompress.vm v20, v18, v22 149 | bltz a3, .LBB0_21 150 | vcompress.vm v18, v16, v22 151 | vsetvli a4, a5, e8, m1, ta, ma 152 | vand.vx v8, v8, s1 153 | vand.vx v13, v20, s1 154 | vand.vx v16, v18, s1 155 | vsrl.vi v17, v14, 4 156 | vmseq.vi v0, v17, 12 157 | vssubu.vx v17, v17, t0 158 | vmerge.vim v17, v17, 3, v0 159 | vsll.vv v14, v14, v17 160 | vsrl.vv v14, v14, v17 161 | vwmulu.vx v22, v13, t5 162 | vwaddu.wv v22, v22, v16 163 | vwmulu.vx v24, v14, t5 164 | vwaddu.wv v24, v24, v8 165 | vsetvli zero, zero, e16, m2, ta, ma 166 | vwmulu.vx v28, v24, t1 167 | vwaddu.wv v28, v28, v22 168 | vsetvli zero, zero, e8, m1, ta, ma 169 | vssubu.vx v8, v17, t2 170 | vrsub.vi v8, v8, 3 171 | vmul.vx v8, v8, t3 172 | vsetvli zero, zero, e8, m1, ta, ma 173 | vwaddu.vx v0, v8, x0 174 | vsetvli zero, zero, e16, m2, ta, ma 175 | vwaddu.vx v24, v0, x0 176 | 
vsetvli zero, zero, e32, m4, ta, ma 177 | vsrl.vv v24, v28, v24 178 | vse32.v v24, (s0) 179 | beq a5, a4, .LBB0_13 180 | sub a5, a5, a4 181 | slli a4, a4, 2 182 | add s0, s0, a4 183 | vsetvli zero, a5, e8, m1, ta, ma 184 | vand.vx v8, v9, s1 185 | vand.vx v9, v21, s1 186 | vand.vx v13, v19, s1 187 | vsrl.vi v14, v15, 4 188 | vmseq.vi v0, v14, 12 189 | vssubu.vx v14, v14, t0 190 | vmerge.vim v14, v14, 3, v0 191 | vsll.vv v15, v15, v14 192 | vsrl.vv v15, v15, v14 193 | vwmulu.vx v16, v9, t5 194 | vwaddu.wv v16, v16, v13 195 | vwmulu.vx v18, v15, t5 196 | vwaddu.wv v18, v18, v8 197 | vsetvli zero, zero, e16, m2, ta, ma 198 | vwmulu.vx v20, v18, t1 199 | vwaddu.wv v20, v20, v16 200 | vsetvli zero, zero, e8, m1, ta, ma 201 | vssubu.vx v8, v14, t2 202 | vrsub.vi v8, v8, 3 203 | vmul.vx v8, v8, t3 204 | vsetvli zero, zero, e32, m4, ta, ma 205 | vsetvli zero, zero, e8, m1, ta, ma 206 | vwaddu.vx v0, v8, x0 207 | vsetvli zero, zero, e16, m2, ta, ma 208 | vwaddu.vx v16, v0, x0 209 | vsetvli zero, zero, e32, m4, ta, ma 210 | vsrl.vv v16, v20, v16 211 | vse32.v v16, (s0) 212 | j .LBB0_13 213 | .LBB0_20: 214 | vsetvli zero, a5, e8, m2, ta, mu 215 | vmsgtu.vx v0, v14, a6 216 | vand.vx v8, v8, s1 217 | vand.vx v14, v14, s1, v0.t 218 | vmv.v.i v16, 1 219 | vmerge.vxm v16, v16, t5, v0 220 | vwmulu.vv v20, v14, v16 221 | vwaddu.wv v20, v20, v8, v0.t 222 | vsetvli zero, zero, e16, m4, ta, ma 223 | vwaddu.vx v24, v20, x0 224 | vsetvli zero, zero, e32, m8, ta, ma 225 | vse32.v v24, (s0) 226 | j .LBB0_13 227 | .LBB0_21: 228 | vsetvli zero, a5, e8, m2, ta, ma 229 | vand.vx v22, v8, s1 230 | vand.vx v20, v20, s1 231 | vmsgtu.vx v8, v14, a6 232 | vmsgtu.vx v9, v14, t6 233 | vmv.v.v v24, v14 234 | vsetvli zero, zero, e8, m1, ta, ma 235 | vmv.v.v v0, v8 236 | vsetvli zero, a5, e8, m2, ta, mu 237 | vand.vx v24, v14, s1, v0.t 238 | vsetvli zero, zero, e8, m1, ta, ma 239 | vmv.v.v v0, v9 240 | vsetvli zero, a5, e8, m2, ta, mu 241 | vand.vi v24, v14, 15, v0.t 242 | vmv.v.i v14, 1 243 | vsetvli zero, zero, e8, m1, ta, ma 244 | vmv.v.v v0, v8 245 | vsetvli zero, a5, e8, m2, ta, mu 246 | vmerge.vxm v14, v14, t5, v0 247 | vwmulu.vv v16, v24, v14 248 | vwaddu.wv v16, v16, v22, v0.t 249 | vsetvli zero, zero, e8, m1, ta, ma 250 | vmv.v.v v0, v9 251 | vsetvli zero, a5, e16, m4, ta, mu 252 | vmv.v.v v24, v16 253 | vsll.vi v24, v16, 6, v0.t 254 | vsetvli zero, zero, e8, m2, ta, mu 255 | vwaddu.wv v16, v24, v20, v0.t 256 | vsetvli zero, zero, e16, m4, ta, ma 257 | vwaddu.vx v24, v16, x0 258 | vsetvli zero, zero, e32, m8, ta, ma 259 | vse32.v v24, (s0) 260 | j .LBB0_13 261 | .LBB0_22: 262 | li a0, 0 263 | j .LBB0_30 264 | .LBB0_23: 265 | li a1, 3 266 | beq t4, a1, .LBB0_29 267 | lbu a2, 0(a0) 268 | andi a3, a2, 192 269 | addi a2, a3, -128 270 | snez a2, a2 271 | slli a2, a2, 2 272 | add s0, s0, a2 273 | li a2, 128 274 | addi s0, s0, -4 275 | bne a3, a2, .LBB0_29 276 | li a1, 3 277 | .LBB0_26: 278 | lbu a3, -1(a0) 279 | addi a0, a0, -1 280 | andi a3, a3, 192 281 | addi a1, a1, 1 282 | bne a3, a2, .LBB0_29 283 | bltu a1, t4, .LBB0_26 284 | j .LBB0_29 285 | .LBB0_28: 286 | li a1, 3 287 | mv s0, s2 288 | .LBB0_29: 289 | mv a2, s0 290 | call utf8_to_utf32_scalar 291 | seqz a1, a0 292 | sub a2, s0, s2 293 | srai a2, a2, 2 294 | add a0, a0, a2 295 | addi a1, a1, -1 296 | and a0, a0, a1 297 | .LBB0_30: 298 | ld ra, 72(sp) 299 | ld s0, 64(sp) 300 | ld s1, 56(sp) 301 | ld s2, 48(sp) 302 | ld s3, 40(sp) 303 | addi sp, sp, 80 304 | ret 305 | 306 | .data 307 | utf8_to_utf32_rvv.err1m: 308 | .quad 144680345676153346 309 | .quad 
5266116582681116800 310 | utf8_to_utf32_rvv.err2m: 311 | .quad -3761689263670582297 312 | .quad -3761671395393942581 313 | utf8_to_utf32_rvv.err3m: 314 | .quad 72340172838076673 315 | .quad 72340175954030310 316 | 317 | -------------------------------------------------------------------------------- /vector-utf/scalar.h: -------------------------------------------------------------------------------- 1 | // code from https://github.com/simdutf/simdutf/tree/master/src/scalar 2 | 3 | // little endian 4 | size_t 5 | utf8_to_utf16_scalar(const char *buf, size_t len, uint16_t *utf16_output) 6 | { 7 | const uint8_t *data = (const uint8_t *)buf; 8 | size_t pos = 0; 9 | uint16_t *start = utf16_output; 10 | #if 1 11 | while (pos < len) { 12 | // try to convert the next block of 16 ASCII bytes 13 | if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii 14 | uint64_t v1; 15 | memcpy(&v1, data + pos, sizeof(uint64_t)); 16 | uint64_t v2; 17 | memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); 18 | uint64_t v = v1 | v2; 19 | if ((v & 0x8080808080808080) == 0) { 20 | size_t final_pos = pos + 16; 21 | while(pos < final_pos) { 22 | *utf16_output++ = (uint16_t)(buf[pos]); 23 | pos++; 24 | } 25 | continue; 26 | } 27 | } 28 | #else 29 | // only uses aligned load/stores 30 | size_t aligned = 8 - ((uintptr_t)data & 7); 31 | while (pos < len) { 32 | // try to convert the next block of 16 ASCII bytes 33 | if ((pos & 7) == aligned && pos + 16 <= len) { 34 | uintptr_t p = (uintptr_t)(data+pos) & ~7ull; // compiler hint 35 | uint64_t v1; 36 | memcpy(&v1, (const uint8_t*)p, sizeof(uint64_t)); 37 | uint64_t v2; 38 | memcpy(&v2, (const uint8_t*)p + sizeof(uint64_t), sizeof(uint64_t)); 39 | 40 | uint64_t v = v1 | v2; 41 | if ((v & 0x8080808080808080) == 0) { 42 | for (size_t i = 0; i < 16; ++i) 43 | *utf16_output++ = (uint16_t)buf[pos++]; 44 | continue; 45 | } 46 | } 47 | #endif 48 | 49 | 50 | uint8_t leading_byte = data[pos]; // leading byte 51 | if (leading_byte < 0b10000000) { 52 | // converting one ASCII byte !!! 53 | *utf16_output++ = (uint16_t)leading_byte; 54 | pos++; 55 | } else if ((leading_byte & 0b11100000) == 0b11000000) { 56 | // We have a two-byte UTF-8, it should become 57 | // a single UTF-16 word. 58 | if(pos + 1 >= len) { return 0; } // minimal bound checking 59 | if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } 60 | // range check 61 | uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); 62 | if (code_point < 0x80 || 0x7ff < code_point) { return 0; } 63 | *utf16_output++ = (uint16_t)code_point; 64 | pos += 2; 65 | } else if ((leading_byte & 0b11110000) == 0b11100000) { 66 | // We have a three-byte UTF-8, it should become 67 | // a single UTF-16 word. 68 | if(pos + 2 >= len) { return 0; } // minimal bound checking 69 | 70 | if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } 71 | if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } 72 | // range check 73 | uint32_t code_point = (leading_byte & 0b00001111) << 12 | 74 | (data[pos + 1] & 0b00111111) << 6 | 75 | (data[pos + 2] & 0b00111111); 76 | if (code_point < 0x800 || 0xffff < code_point || 77 | (0xd7ff < code_point && code_point < 0xe000)) { 78 | return 0; 79 | } 80 | *utf16_output++ = (uint16_t)code_point; 81 | pos += 3; 82 | } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 83 | // we have a 4-byte UTF-8 word. 
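			// a code point above 0xFFFF does not fit in a single UTF-16 unit;
			// it is split into a surrogate pair below: code_point -= 0x10000,
			// then high = 0xD800 + (code_point >> 10) and
			// low = 0xDC00 + (code_point & 0x3FF)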
84 | if(pos + 3 >= len) { return 0; } // minimal bound checking 85 | if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } 86 | if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } 87 | if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; } 88 | 89 | // range check 90 | uint32_t code_point = 91 | (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | 92 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); 93 | if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; } 94 | code_point -= 0x10000; 95 | uint16_t high_surrogate = (uint16_t)(0xD800 + (code_point >> 10)); 96 | uint16_t low_surrogate = (uint16_t)(0xDC00 + (code_point & 0x3FF)); 97 | *utf16_output++ = (uint16_t)(high_surrogate); 98 | *utf16_output++ = (uint16_t)(low_surrogate); 99 | pos += 4; 100 | } else { 101 | return 0; 102 | } 103 | } 104 | return utf16_output - start; 105 | } 106 | 107 | size_t 108 | utf8_to_utf32_scalar(const char *buf, size_t len, uint32_t *utf32_output) 109 | { 110 | const uint8_t *data = (const uint8_t *)buf; 111 | size_t pos = 0; 112 | uint32_t* start = utf32_output; 113 | while (pos < len) { 114 | // try to convert the next block of 16 ASCII bytes 115 | if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii 116 | uint64_t v1; 117 | memcpy(&v1, data + pos, sizeof(uint64_t)); 118 | uint64_t v2; 119 | memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); 120 | uint64_t v = v1 | v2; 121 | if ((v & 0x8080808080808080) == 0) { 122 | size_t final_pos = pos + 16; 123 | while(pos < final_pos) { 124 | *utf32_output++ = (uint32_t)buf[pos]; 125 | pos++; 126 | } 127 | continue; 128 | } 129 | } 130 | uint8_t leading_byte = data[pos]; // leading byte 131 | if (leading_byte < 0b10000000) { 132 | // converting one ASCII byte !!! 133 | *utf32_output++ = (uint32_t)leading_byte; 134 | pos++; 135 | } else if ((leading_byte & 0b11100000) == 0b11000000) { 136 | // We have a two-byte UTF-8 137 | if(pos + 1 >= len) { return 0; } // minimal bound checking 138 | if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } 139 | // range check 140 | uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); 141 | if (code_point < 0x80 || 0x7ff < code_point) { return 0; } 142 | *utf32_output++ = (uint32_t)code_point; 143 | pos += 2; 144 | } else if ((leading_byte & 0b11110000) == 0b11100000) { 145 | // We have a three-byte UTF-8 146 | if (pos + 2 >= len) { return 0; } // minimal bound checking 147 | 148 | if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } 149 | if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } 150 | // range check 151 | uint32_t code_point = (leading_byte & 0b00001111) << 12 | 152 | (data[pos + 1] & 0b00111111) << 6 | 153 | (data[pos + 2] & 0b00111111); 154 | if (code_point < 0x800 || 0xffff < code_point || 155 | (0xd7ff < code_point && code_point < 0xe000)) { 156 | return 0; 157 | } 158 | *utf32_output++ = (uint32_t)code_point; 159 | pos += 3; 160 | } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 161 | // we have a 4-byte UTF-8 word. 
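			// unlike the UTF-16 path above, no surrogate split is needed here:
			// the decoded code point (up to 0x10FFFF) is stored directly as a
			// single 32-bit unit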
162 | if(pos + 3 >= len) { return 0; } // minimal bound checking 163 | if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; } 164 | if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; } 165 | if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; } 166 | 167 | // range check 168 | uint32_t code_point = 169 | (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | 170 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); 171 | if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; } 172 | *utf32_output++ = (uint32_t)code_point; 173 | pos += 4; 174 | } else { 175 | return 0; 176 | } 177 | } 178 | return utf32_output - start; 179 | } 180 | 181 | 182 | // little endian 183 | size_t 184 | utf16_to_utf8_scalar(const uint16_t *buf, size_t len, char *utf8_output) 185 | { 186 | const uint16_t *data = (const uint16_t *)buf; 187 | size_t pos = 0; 188 | char *start = utf8_output; 189 | while (pos < len) { 190 | // try to convert the next block of 8 bytes 191 | if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii 192 | uint64_t v; 193 | memcpy(&v, data + pos, sizeof(uint64_t)); 194 | if ((v & 0xFF80FF80FF80FF80) == 0) { 195 | size_t final_pos = pos + 4; 196 | while(pos < final_pos) { 197 | *utf8_output++ = (char)(buf[pos]); 198 | pos++; 199 | } 200 | continue; 201 | } 202 | } 203 | uint16_t word = data[pos]; 204 | if((word & 0xFF80)==0) { 205 | // will generate one UTF-8 bytes 206 | *utf8_output++ = (char)(word); 207 | pos++; 208 | } else if((word & 0xF800)==0) { 209 | // will generate two UTF-8 bytes 210 | // we have 0b110XXXXX 0b10XXXXXX 211 | *utf8_output++ = (char)((word>>6) | 0b11000000); 212 | *utf8_output++ = (char)((word & 0b111111) | 0b10000000); 213 | pos++; 214 | } else if((word &0xF800 ) != 0xD800) { 215 | // will generate three UTF-8 bytes 216 | // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX 217 | *utf8_output++ = (char)((word>>12) | 0b11100000); 218 | *utf8_output++ = (char)(((word>>6) & 0b111111) | 0b10000000); 219 | *utf8_output++ = (char)((word & 0b111111) | 0b10000000); 220 | pos++; 221 | } else { 222 | // must be a surrogate pair 223 | if(pos + 1 >= len) { return 0; } 224 | uint16_t diff = (uint16_t)(word - 0xD800); 225 | if(diff > 0x3FF) { return 0; } 226 | uint16_t next_word = data[pos + 1]; 227 | uint16_t diff2 = (uint16_t)(next_word - 0xDC00); 228 | if(diff2 > 0x3FF) { return 0; } 229 | uint32_t value = (diff << 10) + diff2 + 0x10000; 230 | // will generate four UTF-8 bytes 231 | // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX 232 | *utf8_output++ = (char)((value>>18) | 0b11110000); 233 | *utf8_output++ = (char)(((value>>12) & 0b111111) | 0b10000000); 234 | *utf8_output++ = (char)(((value>>6) & 0b111111) | 0b10000000); 235 | *utf8_output++ = (char)((value & 0b111111) | 0b10000000); 236 | pos += 2; 237 | } 238 | } 239 | return utf8_output - start; 240 | } 241 | 242 | 243 | size_t 244 | utf32_to_utf8_scalar(const uint32_t *buf, size_t len, char *utf8_output) 245 | { 246 | const uint32_t *data = (const uint32_t *)buf; 247 | size_t pos = 0; 248 | char *start = utf8_output; 249 | while (pos < len) { 250 | // try to convert the next block of 2 ASCII characters 251 | if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii 252 | uint64_t v; 253 | memcpy(&v, data + pos, sizeof(uint64_t)); 254 | if ((v & 0xFFFFFF80FFFFFF80) == 0) { 255 | *utf8_output++ = (char)(buf[pos]); 256 | *utf8_output++ = (char)(buf[pos+1]); 257 | pos += 2; 258 | continue; 259 | } 260 | } 
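		// no ASCII fast path hit: fall through to the general case, which picks
		// the UTF-8 length from the code point range (1 byte below 0x80, 2 bytes
		// below 0x800, 3 bytes below 0x10000 excluding surrogates, 4 bytes up to
		// 0x10FFFF)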
261 | uint32_t word = data[pos]; 262 | if((word & 0xFFFFFF80)==0) { 263 | // will generate one UTF-8 bytes 264 | *utf8_output++ = (char)(word); 265 | pos++; 266 | } else if((word & 0xFFFFF800)==0) { 267 | // will generate two UTF-8 bytes 268 | // we have 0b110XXXXX 0b10XXXXXX 269 | *utf8_output++ = (char)((word>>6) | 0b11000000); 270 | *utf8_output++ = (char)((word & 0b111111) | 0b10000000); 271 | pos++; 272 | } else if((word & 0xFFFF0000)==0) { 273 | // will generate three UTF-8 bytes 274 | // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX 275 | if (word >= 0xD800 && word <= 0xDFFF) { return 0; } 276 | *utf8_output++ = (char)((word>>12) | 0b11100000); 277 | *utf8_output++ = (char)(((word>>6) & 0b111111) | 0b10000000); 278 | *utf8_output++ = (char)((word & 0b111111) | 0b10000000); 279 | pos++; 280 | } else { 281 | // will generate four UTF-8 bytes 282 | // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX 283 | if (word > 0x10FFFF) { return 0; } 284 | *utf8_output++ = (char)((word>>18) | 0b11110000); 285 | *utf8_output++ = (char)(((word>>12) & 0b111111) | 0b10000000); 286 | *utf8_output++ = (char)(((word>>6) & 0b111111) | 0b10000000); 287 | *utf8_output++ = (char)((word & 0b111111) | 0b10000000); 288 | pos ++; 289 | } 290 | } 291 | return utf8_output - start; 292 | } 293 | 294 | // little endian 295 | size_t 296 | utf32_to_utf16_scalar(const uint32_t *buf, size_t len, uint16_t *utf16_output) 297 | { 298 | const uint32_t *data = (const uint32_t*)buf; 299 | size_t pos = 0; 300 | uint16_t *start = utf16_output; 301 | while (pos < len) { 302 | uint32_t word = data[pos]; 303 | if((word & 0xFFFF0000)==0) { 304 | if (word >= 0xD800 && word <= 0xDFFF) { return 0; } 305 | // will not generate a surrogate pair 306 | *utf16_output++ = word; 307 | } else { 308 | // will generate a surrogate pair 309 | if (word > 0x10FFFF) { return 0; } 310 | word -= 0x10000; 311 | uint16_t high_surrogate = 0xD800 + (word >> 10); 312 | uint16_t low_surrogate = 0xDC00 + (word & 0x3FF); 313 | *utf16_output++ = high_surrogate; 314 | *utf16_output++ = low_surrogate; 315 | } 316 | pos++; 317 | } 318 | return utf16_output - start; 319 | } 320 | -------------------------------------------------------------------------------- /vector-utf/simdutf.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | extern "C" { 6 | 7 | size_t 8 | utf8_to_utf16_rvv(char const *src, size_t count, uint16_t *dest) 9 | { 10 | return simdutf::convert_utf8_to_utf16le(src, count, (char16_t*)dest); 11 | } 12 | 13 | size_t 14 | utf8_to_utf32_rvv(char const *src, size_t count, uint32_t *dest) 15 | { 16 | return simdutf::convert_utf8_to_utf32(src, count, (char32_t*)dest); 17 | } 18 | 19 | size_t 20 | utf16_to_utf8_rvv(uint16_t const *src, size_t count, char *dest) 21 | { 22 | return simdutf::convert_utf16le_to_utf8((char16_t*)src, count, dest); 23 | } 24 | 25 | } 26 | 27 | -------------------------------------------------------------------------------- /vector-utf/tests/16to8.c: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | size_t utf16_to_utf8_rvv(const uint16_t *src, size_t count, char *dest); 4 | 5 | #define MAX_UTF32_CHARS (1024*16) 6 | static uint32_t utf32[MAX_UTF32_CHARS]; 7 | static uint8_t out[MAX_UTF32_CHARS*4]; 8 | static uint8_t golden[MAX_UTF32_CHARS*4]; 9 | static uint16_t in[MAX_UTF32_CHARS*2]; 10 | 11 | static void 12 | test(size_t length, size_t bitFlipCount) 13 | { 14 | size_t len32 = randu64() % 
length, origLen; 15 | origLen = len32; 16 | for (size_t i = 0; i < len32; ++i) { 17 | do utf32[i] = randu64() >> (64 - randu64() % 22); 18 | while (utf32[i] > 0x10FFFF || (utf32[i] >= 0xD800 && utf32[i] <= 0xDFFF)); 19 | } 20 | size_t lenIn = utf32_to_utf16_scalar(utf32, len32, in); 21 | 22 | if (lenIn) 23 | for (size_t i = 0; i < bitFlipCount; ++i) 24 | in[randu64() % lenIn] ^= 1 << (randu64() & (sizeof *in - 1)); 25 | 26 | size_t lenGolden = utf16_to_utf8_scalar(in, lenIn, (char*)golden); 27 | size_t lenOut = utf16_to_utf8_rvv(in, lenIn, (char*)out); 28 | 29 | if (lenGolden != lenOut) { 30 | print("ERROR: length mismatch, expected ")(u,lenGolden)(" got ")(u,lenOut)(" from ")(u,origLen); 31 | print("\nin: "); 32 | for (size_t i = 0; i < lenIn; ++i) 33 | print(b16,in[i])(" "); 34 | print("\nout: "); 35 | for (size_t i = 0; i < lenOut; ++i) 36 | print(b8,out[i])(" "); 37 | print("\ntar: "); 38 | for (size_t i = 0; i < lenGolden; ++i) 39 | print(b8,golden[i])(" "); 40 | print_flush();exit(0); 41 | return; 42 | } 43 | for (size_t i = 0; i < lenGolden; ++i) { 44 | if (golden[i] != out[i]) { 45 | print("ERROR: at ")(u,i)("/")(u,lenGolden)(" expected ")(u,golden[i])(" got ")(u,out[i])("\n"); 46 | print("\nin: "); 47 | for (size_t i = 0; i < lenIn; ++i) 48 | print(b16,in[i])(" "); 49 | print("\nout: "); 50 | for (size_t i = 0; i < lenOut; ++i) 51 | print(b8,out[i])(" "); 52 | print("\ntar: "); 53 | for (size_t i = 0; i < lenGolden; ++i) 54 | print(b8,golden[i])(" "); 55 | print_flush();exit(0); 56 | } 57 | } 58 | } 59 | 60 | int 61 | main(void) 62 | { 63 | randState.x ^= rv_cycles(); 64 | for (size_t i = 0; i < 10000000; ++i) { 65 | test(10, 2); 66 | test(10, 10); 67 | test(10, 100); 68 | test(100, 2); 69 | test(100, 10); 70 | test(100, 100); 71 | test(400, 2); 72 | test(400, 10); 73 | test(2000, 2); 74 | test(2000, 100); 75 | if ((i & 127) == 0) 76 | print("\r")(u,i)(" ")(flush,); 77 | } 78 | return 0; 79 | } 80 | 81 | -------------------------------------------------------------------------------- /vector-utf/tests/8to16.c: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | size_t utf8_to_utf16_rvv(char const *src, size_t n, uint16_t *dest); 4 | 5 | #define MAX_UTF32_CHARS (1024*16) 6 | static uint32_t utf32[MAX_UTF32_CHARS]; 7 | static uint16_t out[MAX_UTF32_CHARS*2]; 8 | static uint16_t golden[MAX_UTF32_CHARS*2]; 9 | static uint8_t in[MAX_UTF32_CHARS*4]; 10 | 11 | static void 12 | test(size_t length, size_t bitFlipCount) 13 | { 14 | size_t len32 = randu64() % length, origLen; 15 | origLen = len32; 16 | for (size_t i = 0; i < len32; ++i) { 17 | do utf32[i] = randu64() >> (64 - randu64() % 22); 18 | while (utf32[i] > 0x10FFFF || (utf32[i] >= 0xD800 && utf32[i] <= 0xDFFF)); 19 | } 20 | size_t lenIn = utf32_to_utf8_scalar(utf32, len32, (char*)in); 21 | 22 | if (lenIn) 23 | for (size_t i = 0; i < bitFlipCount; ++i) 24 | in[randu64() % lenIn] ^= 1 << (randu64() & (sizeof *in - 1)); 25 | 26 | size_t lenGolden = utf8_to_utf16_scalar((char*)in, lenIn, golden); 27 | size_t lenOut = utf8_to_utf16_rvv((char*)in, lenIn, out); 28 | 29 | if (lenGolden != lenOut) { 30 | print("ERROR: length mismatch, expected ")(u,lenGolden)(" got ")(u,lenOut)(" from ")(u,origLen)("\n"); 31 | for (size_t i = 0; i < lenIn; ++i) 32 | print(b8,in[i])(" "); 33 | print("\nin: "); 34 | for (size_t i = 0; i < lenIn; ++i) 35 | print(b8,in[i])(" "); 36 | print("\nout: "); 37 | for (size_t i = 0; i < lenOut; ++i) 38 | print(b16,out[i])(" "); 39 | print("\ntar: "); 40 | for 
(size_t i = 0; i < lenGolden; ++i) 41 | print(b16,golden[i])(" "); 42 | print_flush();exit(0); 43 | return; 44 | } 45 | for (size_t i = 0; i < lenGolden; ++i) { 46 | if (golden[i] != out[i]) { 47 | print("ERROR: at ")(u,i)("/")(u,lenGolden)(" expected ")(u,golden[i])(" got ")(u,out[i])("\n"); 48 | return; 49 | } 50 | } 51 | } 52 | 53 | int 54 | main(void) 55 | { 56 | randState.x ^= rv_cycles(); 57 | for (size_t i = 0; i < 10000000; ++i) { 58 | test(10, 2); 59 | test(10, 10); 60 | test(10, 100); 61 | test(100, 2); 62 | test(100, 10); 63 | test(100, 100); 64 | test(400, 2); 65 | test(400, 10); 66 | test(2000, 2); 67 | test(2000, 100); 68 | if ((i & 127) == 0) 69 | print("\r")(u,i)(" ")(flush,); 70 | } 71 | return 0; 72 | } 73 | 74 | -------------------------------------------------------------------------------- /vector-utf/tests/8to32.c: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | size_t utf8_to_utf32_rvv(char const *src, size_t n, uint32_t *dest); 4 | 5 | #define MAX_UTF32_CHARS (1024*16) 6 | static uint32_t utf32[MAX_UTF32_CHARS]; 7 | static uint32_t out[MAX_UTF32_CHARS]; 8 | static uint8_t in[MAX_UTF32_CHARS*4]; 9 | 10 | static void 11 | test(size_t maxlen32, size_t bitFlipCount) 12 | { 13 | size_t len32 = randu64() % maxlen32; 14 | for (size_t i = 0; i < len32; ++i) { 15 | do utf32[i] = randu64() >> (64 - randu64() % 22); 16 | while (utf32[i] > 0x10FFFF || (utf32[i] >= 0xD800 && utf32[i] <= 0xDFFF)); 17 | } 18 | size_t lenIn = utf32_to_utf8_scalar(utf32, len32, (char*)in); 19 | 20 | if (lenIn) 21 | for (size_t i = 0; i < bitFlipCount; ++i) 22 | in[randu64() % lenIn] ^= 1 << (sizeof *in - 1); 23 | 24 | if (bitFlipCount) 25 | len32 = utf8_to_utf32_scalar((char*)in, lenIn, utf32); 26 | size_t lenOut = utf8_to_utf32_rvv((char*)in, lenIn, out); 27 | 28 | if (len32 != lenOut) { 29 | print("ERROR: length mismatch, expected ")(u,len32)(" got ")(u,lenOut)("\n"); 30 | print("\nin: "); 31 | for (size_t i = 0; i < lenIn; ++i) 32 | print(b8,in[i])(" "); 33 | print("\nout: "); 34 | for (size_t i = 0; i < lenOut; ++i) 35 | print(b32,out[i])(" "); 36 | print("\ntar: "); 37 | for (size_t i = 0; i < len32; ++i) 38 | print(b32,utf32[i])(" "); 39 | print_flush();exit(0); 40 | return; 41 | } 42 | for (size_t i = 0; i < len32; ++i) { 43 | if (utf32[i] != out[i]) { 44 | print("ERROR: at ")(u,i)("/")(u,len32)(" expected ")(u,utf32[i])(" got ")(u,out[i])("\n"); 45 | return; 46 | } 47 | } 48 | } 49 | 50 | int 51 | main(void) 52 | { 53 | randState.x ^= rv_cycles(); 54 | for (size_t i = 0; i < 10000000; ++i) { 55 | test(10, 2); 56 | test(10, 10); 57 | test(10, 100); 58 | test(100, 2); 59 | test(100, 10); 60 | test(100, 100); 61 | test(400, 2); 62 | test(400, 10); 63 | test(2000, 2); 64 | test(2000, 100); 65 | if ((i & 127) == 0) 66 | print("\r")(u,i)(" ")(flush,); 67 | } 68 | return 0; 69 | 70 | } 71 | 72 | -------------------------------------------------------------------------------- /vector-utf/tests/Makefile: -------------------------------------------------------------------------------- 1 | .POSIX: 2 | 3 | include ../../config.mk 4 | 5 | TESTS=8to32 8to16 16to8 6 | 7 | all: ${TESTS} 8 | 9 | 8to16: 8to16.c common.h ../8toN_gather.c 10 | ${CC} ${CFLAGS} -DTO_16=1 -o 8to16 8to16.c ../8toN_gather.c 11 | 12 | 8to32: 8to32.c common.h ../8toN_gather.c 13 | ${CC} ${CFLAGS} -o 8to32 8to32.c ../8toN_gather.c 14 | 15 | 16to8: 16to8.c common.h ../16to8_gather.c 16 | ${CC} ${CFLAGS} -o 16to8 16to8.c ../16to8_gather.c 17 | 18 | clean: 19 | rm -f 
${TESTS} 20 | 21 | 22 | -------------------------------------------------------------------------------- /vector-utf/tests/common.h: -------------------------------------------------------------------------------- 1 | #define NOLIBC_MAIN 2 | #include "../../nolibc.h" 3 | #include "../scalar.h" 4 | 5 | static URand randState = { 123, 456, 789 }; 6 | 7 | static uint64_t randu64(void) { return urand(&randState); } 8 | 9 | static void 10 | print_b8(uint8_t val) 11 | { 12 | if (printEnd - printIt < 8) print_flush(); 13 | size_t n = 8; 14 | while (n--) *printIt++ = (val >> 7) + '0', val <<= 1; 15 | } 16 | 17 | static void 18 | print_b16(uint16_t val) 19 | { 20 | if (printEnd - printIt < 16) print_flush(); 21 | size_t n = 16; 22 | while (n--) *printIt++ = (val >> 15) + '0', val <<= 1; 23 | } 24 | 25 | static void 26 | print_b32(uint32_t val) 27 | { 28 | if (printEnd - printIt < 32) print_flush(); 29 | size_t n = 32; 30 | while (n--) *printIt++ = (val >> 31) + '0', val <<= 1; 31 | } 32 | 33 | --------------------------------------------------------------------------------
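A final note on the surrogate arithmetic used above: the scalar fallback in 16to8_gather.c folds the +0x10000 code-point offset into the high half by computing ((diff + 0x40) << 10) + diff2, which is equivalent to the (diff << 10) + diff2 + 0x10000 form used in scalar.h, since 0x40 << 10 == 0x10000. A minimal standalone check of that identity (illustrative only, not part of the repository):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* diff  = high surrogate - 0xD800, diff2 = low surrogate - 0xDC00,
	 * both in [0, 0x3FF] after the range checks in the converters */
	for (uint32_t diff = 0; diff <= 0x3FF; ++diff)
		for (uint32_t diff2 = 0; diff2 <= 0x3FF; ++diff2)
			assert(((diff + 0x40) << 10) + diff2 ==
			       (diff << 10) + diff2 + 0x10000);
	return 0;
}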