├── .editorconfig
├── .github
    └── workflows
    │   └── test.yml
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── Makefile
├── README.md
├── base64-benchmarks.png
├── bin
    ├── base64.c
    ├── base64.manifest
    └── base64.rc
├── cmake
    ├── Modules
    │   ├── TargetArch.cmake
    │   └── TargetSIMDInstructionSet.cmake
    ├── base64-config.cmake.in
    ├── config.h.in
    └── test-arch.c
├── include
    └── libbase64.h
├── lib
    ├── arch
    │   ├── avx
    │   │   ├── codec.c
    │   │   └── enc_loop_asm.c
    │   ├── avx2
    │   │   ├── codec.c
    │   │   ├── dec_loop.c
    │   │   ├── dec_reshuffle.c
    │   │   ├── enc_loop.c
    │   │   ├── enc_loop_asm.c
    │   │   ├── enc_reshuffle.c
    │   │   └── enc_translate.c
    │   ├── avx512
    │   │   ├── codec.c
    │   │   ├── enc_loop.c
    │   │   └── enc_reshuffle_translate.c
    │   ├── generic
    │   │   ├── 32
    │   │   │   ├── dec_loop.c
    │   │   │   └── enc_loop.c
    │   │   ├── 64
    │   │   │   └── enc_loop.c
    │   │   ├── codec.c
    │   │   ├── dec_head.c
    │   │   ├── dec_tail.c
    │   │   ├── enc_head.c
    │   │   └── enc_tail.c
    │   ├── neon32
    │   │   ├── codec.c
    │   │   ├── dec_loop.c
    │   │   ├── enc_loop.c
    │   │   ├── enc_reshuffle.c
    │   │   └── enc_translate.c
    │   ├── neon64
    │   │   ├── codec.c
    │   │   ├── dec_loop.c
    │   │   ├── enc_loop.c
    │   │   ├── enc_loop_asm.c
    │   │   └── enc_reshuffle.c
    │   ├── sse41
    │   │   └── codec.c
    │   ├── sse42
    │   │   └── codec.c
    │   └── ssse3
    │   │   ├── codec.c
    │   │   ├── dec_loop.c
    │   │   ├── dec_reshuffle.c
    │   │   ├── enc_loop.c
    │   │   ├── enc_loop_asm.c
    │   │   ├── enc_reshuffle.c
    │   │   └── enc_translate.c
    ├── codec_choose.c
    ├── codecs.h
    ├── env.h
    ├── exports.txt
    ├── lib.c
    ├── lib_openmp.c
    └── tables
    │   ├── .gitignore
    │   ├── Makefile
    │   ├── table_dec_32bit.h
    │   ├── table_enc_12bit.h
    │   ├── table_enc_12bit.py
    │   ├── table_generator.c
    │   ├── tables.c
    │   └── tables.h
└── test
    ├── CMakeLists.txt
    ├── Makefile
    ├── benchmark.c
    ├── ci
        ├── .gitattributes
        ├── analysis.sh
        ├── checksums.txt
        ├── test.sh
        └── 😵‍💫.bin
    ├── codec_supported.c
    ├── codec_supported.h
    ├── moby_dick.h
    ├── moby_dick_base64.txt
    ├── moby_dick_plain.txt
    └── test_base64.c


/.editorconfig:
--------------------------------------------------------------------------------
 1 | # https://EditorConfig.org
 2 | root = true
 3 | 
 4 | [*]
 5 | charset = utf-8
 6 | insert_final_newline = true
 7 | trim_trailing_whitespace = true
 8 | 
 9 | indent_style = tab
10 | tab_width = 8
11 | indent_size = 8
12 | 
13 | [CMakeLists.txt]
14 | tab_width = 4
15 | indent_style = space
16 | [*.cmake]
17 | tab_width = 4
18 | indent_style = space
19 | 
20 | [*.py]
21 | tab_width = 4
22 | indent_style = space
23 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
  1 | name: Test
  2 | 
  3 | on: [push, pull_request]
  4 | 
  5 | jobs:
  6 |   makefile-analysis:
  7 |     name: makefile-analysis
  8 |     runs-on: ubuntu-24.04
  9 |     steps:
 10 |       - name: Checkout
 11 |         uses: actions/checkout@v4
 12 |       - name: "Install analysis tools"
 13 |         run: |
 14 |           sudo apt-get update
 15 |           sudo apt-get install -y clang-tools valgrind
 16 |       - name: Run tests
 17 |         run: ./test/ci/analysis.sh
 18 | 
 19 |   makefile-test:
 20 |     name: makefile-${{ matrix.runner }}-amd64-${{ matrix.compiler }} ${{ ((matrix.openmp == 1) && '+openmp') || '' }}
 21 |     needs: makefile-analysis
 22 |     runs-on: ${{ matrix.runner }}
 23 |     strategy:
 24 |       fail-fast: false
 25 |       matrix:
 26 |         runner: ["ubuntu-22.04"]
 27 |         compiler: ["gcc", "clang"]
 28 |         openmp: ["0", "1"]
 29 |         include:
 30 |           - runner: "macos-13"
 31 |             compiler: "clang"
 32 |             openmp: "0"
 33 |     env:
 34 |       OPENMP: ${{ matrix.openmp }}
 35 |       OMP_NUM_THREADS: ${{ ((matrix.openmp == 1) && '2') || '0' }}
 36 |       CC: ${{ matrix.compiler }}
 37 |       OBJCOPY: ${{ (startsWith(matrix.runner, 'macos') && 'echo') || 'objcopy' }}
 38 |     steps:
 39 |       - name: Checkout
 40 |         uses: actions/checkout@v4
 41 |       - name: Install Ubuntu libomp support
 42 |         if: runner.os == 'Linux' && matrix.compiler == 'clang' && matrix.openmp == 1
 43 |         run: |
 44 |           sudo apt-get update
 45 |           sudo apt-get install -y libomp-dev
 46 |       - name: Run tests
 47 |         run: ./test/ci/test.sh
 48 | 
 49 |   cmake-test:
 50 |     name: cmake-${{ matrix.runner }}-${{ matrix.platform }}
 51 |     needs: makefile-analysis
 52 |     runs-on: ${{ matrix.runner }}
 53 |     strategy:
 54 |       fail-fast: false
 55 |       matrix:
 56 |         runner: ["ubuntu-22.04", "windows-2019"]
 57 |         platform: ["x86_64", "i686"]
 58 |         include:
 59 |           - runner: "macos-13"
 60 |             platform: "x86_64"
 61 |           - runner: "macos-14"
 62 |             platform: "arm64"
 63 |           - runner: "ubuntu-22.04-arm"
 64 |             platform: "aarch64"
 65 |           - runner: "windows-11-arm"
 66 |             platform: "ARM64"
 67 |     steps:
 68 |       - name: Checkout
 69 |         uses: actions/checkout@v4
 70 |       - name: Install Ubuntu i686 support
 71 |         if: runner.os == 'Linux' && matrix.platform == 'i686'
 72 |         run: |
 73 |           sudo apt-get update
 74 |           sudo apt-get install -y libc6-dev-i386
 75 |           echo "CFLAGS=-m32" >> $GITHUB_ENV
 76 |           echo "LDFLAGS=-m32" >> $GITHUB_ENV
 77 |       - name: CMake Configure
 78 |         run: >
 79 |           cmake
 80 |           -B out
 81 |           -Werror=dev
 82 |           -DBASE64_BUILD_TESTS=ON
 83 |           ${{ runner.os != 'Windows' && '-DCMAKE_BUILD_TYPE=Release' || '' }}
 84 |           ${{ runner.os == 'Windows' && matrix.platform == 'i686' && '-A Win32' || '' }}
 85 |       - name: CMake Build
 86 |         run: cmake --build out --config Release --verbose
 87 |       - name: CTest
 88 |         run: ctest --no-tests=error --test-dir out -VV --build-config Release
 89 |         env:
 90 |           BASE64_TEST_SKIP_AVX512: "1"
 91 | 
 92 |   alpine-makefile-test:
 93 |     name: makefile-alpine-amd64-gcc
 94 |     needs: makefile-analysis
 95 |     runs-on: ubuntu-latest
 96 |     container:
 97 |       image: alpine:3.12
 98 |       env:
 99 |         CC: gcc
100 |     steps:
101 |       - name: Install deps
102 |         run: apk add --update bash build-base git
103 |       - name: Checkout
104 |         uses: actions/checkout@v4
105 |       - name: Run tests
106 |         run: ./test/ci/test.sh
107 | 
108 |   alpine-cmake-test:
109 |     name: cmake-alpine-amd64-gcc
110 |     needs: makefile-analysis
111 |     runs-on: ubuntu-latest
112 |     container:
113 |       image: alpine:3.12
114 |     steps:
115 |       - name: Install deps
116 |         run: apk add --update bash build-base cmake git
117 |       - name: Checkout
118 |         uses: actions/checkout@v4
119 |       - name: CMake Configure
120 |         run: >
121 |           cmake
122 |           -B out
123 |           -Werror=dev
124 |           -DBASE64_BUILD_TESTS=ON
125 |           -DCMAKE_BUILD_TYPE=Release
126 |       - name: CMake Build
127 |         run: cmake --build out --config Release --verbose
128 |       - name: CTest
129 |         run: ctest --no-tests=error -VV --build-config Release
130 |         working-directory: ./out
131 |         env:
132 |           BASE64_TEST_SKIP_AVX512: "1"
133 | 
134 |   alpine-alt-arch-makefile-test:
135 |     name: makefile-alpine-${{matrix.arch}}-${{matrix.cc}}
136 |     needs: makefile-analysis
137 |     runs-on: ubuntu-latest
138 |     strategy:
139 |       fail-fast: false
140 |       matrix:
141 |         arch: [armv7, aarch64, s390x, ppc64le]
142 |         cc: [gcc, clang]
143 |     steps:
144 |       - name: Checkout
145 |         uses: actions/checkout@v4
146 |       - uses: uraimo/run-on-arch-action@v3
147 |         with:
148 |           arch: ${{matrix.arch}}
149 |           distro: alpine_latest
150 |           env: |
151 |             CC: ${{matrix.cc}}
152 |           install: apk add --update bash build-base cmake git ${{matrix.cc}}
153 |           run: ./test/ci/test.sh
154 | 
155 |   alpine-alt-arch-cmake-test:
156 |     name: cmake-alpine-${{matrix.arch}}-${{matrix.cc}}
157 |     needs: makefile-analysis
158 |     runs-on: ubuntu-latest
159 |     strategy:
160 |       fail-fast: false
161 |       matrix:
162 |         arch: [armv7, aarch64, s390x, ppc64le]
163 |         cc: [gcc, clang]
164 |     steps:
165 |       - name: Checkout
166 |         uses: actions/checkout@v4
167 |       - uses: uraimo/run-on-arch-action@v3
168 |         with:
169 |           arch: ${{matrix.arch}}
170 |           distro: alpine_latest
171 |           env: |
172 |             CC: ${{matrix.cc}}
173 |           install: apk add --update bash build-base cmake git ${{matrix.cc}}
174 |           run: |
175 |             echo "::group::CMake Configure"
176 |             cmake -B out -Werror=dev -DBASE64_BUILD_TESTS=ON -DCMAKE_BUILD_TYPE=Release
177 |             echo "::endgroup::CMake Configure"
178 |             echo "::group::CMake Build"
179 |             cmake --build out --config Release --verbose
180 |             echo "::endgroup::CMake Build"
181 |             echo "::group::CTest"
182 |             ctest --no-tests=error --test-dir out -VV --build-config Release
183 |             echo "::endgroup::CTest"
184 | 
185 |   msys2-cmake-test:
186 |     name: msys2-cmake-${{ matrix.msystem }}
187 |     needs: makefile-analysis
188 |     runs-on: windows-2022
189 |     strategy:
190 |       fail-fast: false
191 |       matrix:
192 |         include:
193 |           - { msystem: msys, toolchain: "gcc" }
194 |           - { msystem: mingw32, env: mingw-w64-i686- }
195 |           - { msystem: mingw64, env: mingw-w64-x86_64- }
196 |           - { msystem: ucrt64, env: mingw-w64-ucrt-x86_64- }
197 |           - { msystem: clang64, env: mingw-w64-clang-x86_64- }
198 | 
199 |     steps:
200 |       - name: Checkout
201 |         uses: actions/checkout@v4
202 |       - name: Setup MSYS2 ${{matrix.msystem}}
203 |         uses: msys2/setup-msys2@v2
204 |         with:
205 |           msystem: ${{matrix.msystem}}
206 |           update: true
207 |           install: >-
208 |             make
209 |             ${{ matrix.env }}${{ matrix.toolchain || 'toolchain' }}
210 |             ${{ matrix.env }}cmake
211 |       - name: CMake Configure
212 |         shell: msys2 {0}
213 |         run: >
214 |           cmake
215 |           -B out
216 |           -Werror=dev
217 |           -DBASE64_BUILD_TESTS=ON
218 |           -DCMAKE_BUILD_TYPE=Release
219 |       - name: CMake Build
220 |         shell: msys2 {0}
221 |         run: cmake --build out --config Release --verbose
222 |       - name: CTest
223 |         shell: msys2 {0}
224 |         run: ctest --no-tests=error --test-dir out -VV --build-config Release
225 |         env:
226 |           BASE64_TEST_SKIP_AVX512: "1"
227 |       - name: Test demo utility with unicode filenames and file contents on Windows
228 |         shell: msys2 {0}
229 |         run: |
230 |           out/bin/base64 test/ci/😵‍💫.bin > test/ci/😵‍💫.bin.b64
231 |           out/bin/base64 -d test/ci/😵‍💫.bin.b64 > test/ci/😵‍💫.bin
232 |           cd test/ci
233 |           sha256sum -c checksums.txt
234 | 
235 |   msys2-makefile-test:
236 |     name: msys2-makefile-${{ matrix.msystem }}
237 |     needs: makefile-analysis
238 |     runs-on: windows-2022
239 |     strategy:
240 |       fail-fast: false
241 |       matrix:
242 |         include:
243 |           - { msystem: msys, toolchain: "gcc" }
244 |           - { msystem: mingw32, env: mingw-w64-i686- }
245 |           - { msystem: mingw64, env: mingw-w64-x86_64- }
246 |           - { msystem: ucrt64, env: mingw-w64-ucrt-x86_64- }
247 |           # - { msystem: clang32, env: mingw-w64-clang-i686- }  disabled, lld does not support the "-r" option
248 |           # - { msystem: clang64, env: mingw-w64-clang-x86_64- }  disabled, lld does not support the "-r" option
249 |     env:
250 |       CC: cc.exe
251 |     steps:
252 |       - name: Checkout
253 |         uses: actions/checkout@v4
254 |       - name: Setup MSYS2 ${{matrix.msystem}}
255 |         uses: msys2/setup-msys2@v2
256 |         with:
257 |           msystem: ${{matrix.msystem}}
258 |           update: true
259 |           install: >-
260 |             make
261 |             ${{ matrix.env }}${{ matrix.toolchain || 'toolchain' }}
262 |       - name: Run tests
263 |         shell: msys2 {0}
264 |         run: ./test/ci/test.sh
265 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.o
 2 | bin/base64
 3 | lib/config.h
 4 | test/benchmark
 5 | test/test_base64
 6 | 
 7 | # visual studio symbol db, etc.
 8 | .vs/
 9 | # build directory used by CMakePresets
10 | out/
11 | # private cmake presets
12 | CMakeUserPresets.json
13 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | # Written in 2016-2017, 2021 by Henrik Steffen Gaßmann henrik@gassmann.onl
  2 | #
  3 | # To the extent possible under law, the author(s) have dedicated all
  4 | # copyright and related and neighboring rights to this software to the
  5 | # public domain worldwide. This software is distributed without any warranty.
  6 | #
  7 | # You should have received a copy of the CC0 Public Domain Dedication
  8 | # along with this software. If not, see
  9 | #
 10 | #     http://creativecommons.org/publicdomain/zero/1.0/
 11 | #
 12 | ########################################################################
 13 | cmake_minimum_required(VERSION 3.10.2)
 14 | 
 15 | # new dependent option syntax. We are already compliant
 16 | if (POLICY CMP0127)
 17 |     cmake_policy(SET CMP0127 NEW)
 18 | endif()
 19 | 
 20 | project(base64 LANGUAGES C VERSION 0.5.2)
 21 | 
 22 | include(GNUInstallDirs)
 23 | include(CMakeDependentOption)
 24 | include(CheckIncludeFile)
 25 | include(FeatureSummary)
 26 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
 27 | 
 28 | #######################################################################
 29 | # platform detection
 30 | include(TargetArch)
 31 | detect_target_architecture(_TARGET_ARCH)
 32 | 
 33 | check_include_file(getopt.h HAVE_GETOPT_H)
 34 | cmake_dependent_option(BASE64_BUILD_CLI "Build the cli for encoding and decoding" ON "HAVE_GETOPT_H" OFF)
 35 | add_feature_info(CLI BASE64_BUILD_CLI "enables the CLI executable for encoding and decoding")
 36 | 
 37 | ###################################################################
 38 | # optional/conditional dependencies
 39 | find_package(OpenMP)
 40 | set_package_properties(OpenMP PROPERTIES
 41 |     TYPE OPTIONAL
 42 |     PURPOSE "Allows to utilize OpenMP"
 43 | )
 44 | 
 45 | 
 46 | ########################################################################
 47 | # Compilation options
 48 | option(BASE64_WERROR "Treat warnings as error" ON)
 49 | option(BASE64_BUILD_TESTS "add test projects" OFF)
 50 | cmake_dependent_option(BASE64_WITH_OpenMP "use OpenMP" OFF "OpenMP_FOUND" OFF)
 51 | add_feature_info("OpenMP codec" BASE64_WITH_OpenMP "spreads codec work accross multiple threads")
 52 | cmake_dependent_option(BASE64_REGENERATE_TABLES "regenerate the codec tables" OFF "NOT CMAKE_CROSSCOMPILING" OFF)
 53 | 
 54 | set(_IS_X86 "_TARGET_ARCH_x86 OR _TARGET_ARCH_x64")
 55 | cmake_dependent_option(BASE64_WITH_SSSE3 "add SSSE 3 codepath" ON ${_IS_X86} OFF)
 56 | add_feature_info(SSSE3 BASE64_WITH_SSSE3 "add SSSE 3 codepath")
 57 | cmake_dependent_option(BASE64_WITH_SSE41 "add SSE 4.1 codepath" ON ${_IS_X86} OFF)
 58 | add_feature_info(SSE4.1 BASE64_WITH_SSE41 "add SSE 4.1 codepath")
 59 | cmake_dependent_option(BASE64_WITH_SSE42 "add SSE 4.2 codepath" ON ${_IS_X86} OFF)
 60 | add_feature_info(SSE4.2 BASE64_WITH_SSE42 "add SSE 4.2 codepath")
 61 | cmake_dependent_option(BASE64_WITH_AVX "add AVX codepath" ON ${_IS_X86} OFF)
 62 | add_feature_info(AVX BASE64_WITH_AVX "add AVX codepath")
 63 | cmake_dependent_option(BASE64_WITH_AVX2 "add AVX 2 codepath" ON ${_IS_X86} OFF)
 64 | add_feature_info(AVX2 BASE64_WITH_AVX2 "add AVX2 codepath")
 65 | cmake_dependent_option(BASE64_WITH_AVX512 "add AVX 512 codepath" ON ${_IS_X86} OFF)
 66 | add_feature_info(AVX512 BASE64_WITH_AVX512 "add AVX512 codepath")
 67 | 
 68 | cmake_dependent_option(BASE64_WITH_NEON32 "add NEON32 codepath" OFF _TARGET_ARCH_arm OFF)
 69 | add_feature_info(NEON32 BASE64_WITH_NEON32 "add NEON32 codepath")
 70 | 
 71 | cmake_dependent_option(BASE64_WITH_NEON64 "add NEON64 codepath" ON _TARGET_ARCH_arm64 OFF)
 72 | add_feature_info(NEON64 BASE64_WITH_NEON64 "add NEON64 codepath")
 73 | 
 74 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/bin")
 75 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/bin")
 76 | 
 77 | ########################################################################
 78 | # Regenerate headers
 79 | 
 80 | if (BASE64_REGENERATE_TABLES)
 81 |     # Generate tables in build folder and copy to source tree.
 82 |     # Don't add the tables in the source tree to the outputs, to avoid `make clean` removing them.
 83 |     add_executable(table_generator
 84 |         lib/tables/table_generator.c
 85 |     )
 86 | 
 87 |     add_custom_command(OUTPUT table_dec_32bit.h "${CMAKE_CURRENT_SOURCE_DIR}/lib/tables/table_dec_32bit.h"
 88 |         COMMAND table_generator > table_dec_32bit.h
 89 |         COMMAND "${CMAKE_COMMAND}" -E copy table_dec_32bit.h "${CMAKE_CURRENT_SOURCE_DIR}/lib/tables/table_dec_32bit.h"
 90 |         DEPENDS table_generator
 91 |     )
 92 |     set(Python_ADDITIONAL_VERSIONS 3)
 93 |     find_package(PythonInterp REQUIRED)
 94 |     add_custom_command(OUTPUT table_enc_12bit.h "${CMAKE_CURRENT_SOURCE_DIR}/lib/tables/table_enc_12bit.h"
 95 |         COMMAND "${PYTHON_EXECUTABLE}" "${CMAKE_CURRENT_SOURCE_DIR}/lib/tables/table_enc_12bit.py" > table_enc_12bit.h
 96 |         COMMAND "${CMAKE_COMMAND}" -E copy table_enc_12bit.h "${CMAKE_CURRENT_SOURCE_DIR}/lib/tables/table_enc_12bit.h"
 97 |         DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/lib/tables/table_enc_12bit.py"
 98 |     )
 99 | endif()
100 | 
101 | 
102 | ########################################################################
103 | # library project
104 | add_library(base64
105 |     # library files
106 |     lib/lib.c
107 |     lib/codec_choose.c
108 |     include/libbase64.h
109 | 
110 |     lib/tables/tables.c
111 |     # Add generated headers explicitly to target, to insert them in the dependency tree
112 |     lib/tables/table_dec_32bit.h
113 |     lib/tables/table_enc_12bit.h
114 | 
115 |     # codec implementations
116 |     lib/arch/generic/codec.c
117 | 
118 |     lib/arch/ssse3/codec.c
119 |     lib/arch/sse41/codec.c
120 |     lib/arch/sse42/codec.c
121 |     lib/arch/avx/codec.c
122 |     lib/arch/avx2/codec.c
123 |     lib/arch/avx512/codec.c
124 | 
125 |     lib/arch/neon32/codec.c
126 |     lib/arch/neon64/codec.c
127 | )
128 | 
129 | target_include_directories(base64
130 |     PUBLIC
131 |         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
132 |         $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
133 |     PRIVATE
134 |         "${CMAKE_CURRENT_BINARY_DIR}"
135 | )
136 | 
137 | ####################################################################
138 | # platform/compiler specific configuration
139 | set_target_properties(base64 PROPERTIES
140 |     C_STANDARD 99
141 |     C_STANDARD_REQUIRED YES
142 |     C_EXTENSIONS OFF
143 |     DEFINE_SYMBOL BASE64_EXPORTS
144 |     VERSION ${PROJECT_VERSION}
145 |     SOVERSION ${PROJECT_VERSION_MAJOR}
146 | )
147 | 
148 | #generate_export_header(base64)
149 | # the following definitions and those in libbase64.h have been
150 | # kept forward compatible in case we ever switch to generate_export_header
151 | if (BUILD_SHARED_LIBS)
152 |     set_target_properties(base64 PROPERTIES
153 |         C_VISIBILITY_PRESET hidden
154 |     )
155 | else()
156 |     target_compile_definitions(base64
157 |         PUBLIC
158 |             BASE64_STATIC_DEFINE
159 |     )
160 | endif()
161 | 
162 | target_compile_options(base64 PRIVATE
163 |   $<$<C_COMPILER_ID:MSVC>:
164 |     /W4
165 |     /we4013 # Error warning C4013: 'function' undefined; assuming extern returning int
166 |     /we4700 # Error warning C4700: uninitialized local variable
167 |     /we4715 # not all control paths return a value
168 |     /we4003 # not enough actual parameters for macro
169 |     /wd4456 # disable warning C4456: declaration of 'xxx' hides previous local declaration
170 |   >
171 |   $<$<NOT:$<C_COMPILER_ID:MSVC>>:
172 |     -Wall
173 |     -Wextra
174 |     -Wpedantic
175 |   >
176 |   $<$<BOOL:${BASE64_WERROR}>:$<IF:$<C_COMPILER_ID:MSVC>,/WX,-Werror>>
177 | )
178 | 
179 | target_compile_definitions(base64 PRIVATE
180 |   $<$<C_COMPILER_ID:MSVC>:
181 |     # remove unnecessary warnings about unchecked iterators
182 |     _SCL_SECURE_NO_WARNINGS
183 |   >
184 | )
185 | 
186 | ########################################################################
187 | # SIMD settings
188 | include(TargetSIMDInstructionSet)
189 | define_SIMD_compile_flags()
190 | 
191 | if (_TARGET_ARCH STREQUAL "x86" OR _TARGET_ARCH STREQUAL "x64")
192 |     macro(configure_codec _TYPE)
193 |         if (BASE64_WITH_${_TYPE})
194 |             string(TOLOWER "${_TYPE}" _DIR)
195 |             set_source_files_properties("lib/arch/${_DIR}/codec.c" PROPERTIES
196 |                 COMPILE_FLAGS "${COMPILE_FLAGS_${_TYPE}}"
197 |             )
198 | 
199 |             if (${ARGC} GREATER 1 AND MSVC)
200 |                 set_source_files_properties("lib/arch/${_DIR}/codec.c" PROPERTIES
201 |                     COMPILE_DEFINITIONS ${ARGV1}
202 |                 )
203 |             endif()
204 |         endif()
205 |     endmacro()
206 | 
207 |     configure_codec(SSSE3 __SSSE3__)
208 |     configure_codec(SSE41 __SSSE4_1__)
209 |     configure_codec(SSE42 __SSSE4_2__)
210 |     configure_codec(AVX)
211 |     configure_codec(AVX2)
212 |     configure_codec(AVX512)
213 | 
214 | elseif (_TARGET_ARCH STREQUAL "arm")
215 |     set(BASE64_NEON32_CFLAGS "${COMPILE_FLAGS_NEON32}" CACHE STRING "the NEON32 compile flags (for 'lib/arch/neon32/codec.c')")
216 |     mark_as_advanced(BASE64_NEON32_CFLAGS)
217 | 
218 |     if (BASE64_WITH_NEON32)
219 |         set_source_files_properties("lib/arch/neon32/codec.c" PROPERTIES
220 |             COMPILE_FLAGS "${BASE64_NEON32_CFLAGS} "
221 |         )
222 |     endif()
223 | 
224 | #elseif (_TARGET_ARCH STREQUAL "arm64" AND BASE64_WITH_NEON64)
225 | 
226 | endif()
227 | 
228 | configure_file("${CMAKE_CURRENT_LIST_DIR}/cmake/config.h.in" "${CMAKE_CURRENT_BINARY_DIR}/config.h" @ONLY)
229 | 
230 | ########################################################################
231 | # OpenMP Settings
232 | if (BASE64_WITH_OpenMP)
233 |     target_link_libraries(base64 PRIVATE OpenMP::OpenMP_C)
234 | endif()
235 | 
236 | ########################################################################
237 | if (BASE64_BUILD_TESTS)
238 |     enable_testing()
239 |     add_subdirectory(test)
240 | endif()
241 | 
242 | ########################################################################
243 | # base64
244 | if (BASE64_BUILD_CLI)
245 |     add_executable(base64-bin
246 |         bin/base64.c
247 |     )
248 |     target_link_libraries(base64-bin PRIVATE base64)
249 |     set_target_properties(base64-bin PROPERTIES
250 |         OUTPUT_NAME base64
251 |     )
252 | 
253 |     if (WIN32)
254 |       target_sources(base64-bin PRIVATE bin/base64.rc)
255 |     endif ()
256 | endif()
257 | 
258 | ########################################################################
259 | # cmake install
260 | install(DIRECTORY include/ TYPE INCLUDE)
261 | install(TARGETS base64
262 |     EXPORT base64-targets
263 |     DESTINATION ${CMAKE_INSTALL_LIBDIR}
264 |     RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
265 | )
266 | if (BASE64_BUILD_CLI)
267 |     install(TARGETS base64-bin EXPORT base64-targets DESTINATION ${CMAKE_INSTALL_BINDIR})
268 | endif()
269 | 
270 | include(CMakePackageConfigHelpers)
271 | configure_package_config_file(cmake/base64-config.cmake.in
272 |     "${CMAKE_CURRENT_BINARY_DIR}/base64-config.cmake"
273 | 
274 |     INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}"
275 | )
276 | write_basic_package_version_file(
277 |     "${CMAKE_CURRENT_BINARY_DIR}/base64-config-version.cmake"
278 |     VERSION ${BASE64_VERSION}
279 |     COMPATIBILITY SameMajorVersion
280 | )
281 | 
282 | install(FILES
283 |         "${CMAKE_CURRENT_BINARY_DIR}/base64-config.cmake"
284 |         "${CMAKE_CURRENT_BINARY_DIR}/base64-config-version.cmake"
285 |     DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}"
286 | )
287 | 
288 | install(EXPORT base64-targets
289 |     NAMESPACE aklomp::
290 |     DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}"
291 | )
292 | 
293 | ########################################################################
294 | feature_summary(WHAT PACKAGES_FOUND PACKAGES_NOT_FOUND ENABLED_FEATURES DISABLED_FEATURES)
295 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2005-2007, Nick Galbreath
 2 | Copyright (c) 2015-2018, Wojciech Muła
 3 | Copyright (c) 2016-2017, Matthieu Darbois
 4 | Copyright (c) 2013-2022, Alfred Klomp
 5 | All rights reserved.
 6 | 
 7 | Redistribution and use in source and binary forms, with or without
 8 | modification, are permitted provided that the following conditions are
 9 | met:
10 | 
11 | - Redistributions of source code must retain the above copyright notice,
12 |   this list of conditions and the following disclaimer.
13 | 
14 | - Redistributions in binary form must reproduce the above copyright
15 |   notice, this list of conditions and the following disclaimer in the
16 |   documentation and/or other materials provided with the distribution.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
19 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
21 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
24 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | CFLAGS += -std=c99 -O3 -Wall -Wextra -pedantic -DBASE64_STATIC_DEFINE
  2 | 
  3 | # Set OBJCOPY if not defined by environment:
  4 | OBJCOPY ?= objcopy
  5 | 
  6 | OBJS = \
  7 |   lib/arch/avx512/codec.o \
  8 |   lib/arch/avx2/codec.o \
  9 |   lib/arch/generic/codec.o \
 10 |   lib/arch/neon32/codec.o \
 11 |   lib/arch/neon64/codec.o \
 12 |   lib/arch/ssse3/codec.o \
 13 |   lib/arch/sse41/codec.o \
 14 |   lib/arch/sse42/codec.o \
 15 |   lib/arch/avx/codec.o \
 16 |   lib/lib.o \
 17 |   lib/codec_choose.o \
 18 |   lib/tables/tables.o
 19 | 
 20 | HAVE_AVX512 = 0
 21 | HAVE_AVX2   = 0
 22 | HAVE_NEON32 = 0
 23 | HAVE_NEON64 = 0
 24 | HAVE_SSSE3  = 0
 25 | HAVE_SSE41  = 0
 26 | HAVE_SSE42  = 0
 27 | HAVE_AVX    = 0
 28 | 
 29 | # The user should supply compiler flags for the codecs they want to build.
 30 | # Check which codecs we're going to include:
 31 | ifdef AVX512_CFLAGS
 32 |   HAVE_AVX512 = 1
 33 | endif
 34 | ifdef AVX2_CFLAGS
 35 |   HAVE_AVX2 = 1
 36 | endif
 37 | ifdef NEON32_CFLAGS
 38 |   HAVE_NEON32 = 1
 39 | endif
 40 | ifdef NEON64_CFLAGS
 41 |   HAVE_NEON64 = 1
 42 | endif
 43 | ifdef SSSE3_CFLAGS
 44 |   HAVE_SSSE3 = 1
 45 | endif
 46 | ifdef SSE41_CFLAGS
 47 |   HAVE_SSE41 = 1
 48 | endif
 49 | ifdef SSE42_CFLAGS
 50 |   HAVE_SSE42 = 1
 51 | endif
 52 | ifdef AVX_CFLAGS
 53 |   HAVE_AVX = 1
 54 | endif
 55 | ifdef OPENMP
 56 |   CFLAGS += -fopenmp
 57 | endif
 58 | 
 59 | TARGET := $(shell $(CC) -dumpmachine)
 60 | 
 61 | .PHONY: all analyze clean
 62 | 
 63 | all: bin/base64 lib/libbase64.o
 64 | 
 65 | bin/base64: bin/base64.o lib/libbase64.o
 66 | 	$(CC) $(CFLAGS) -o $@ $^
 67 | 
 68 | # Workaround: mangle exported function names on MinGW32.
 69 | lib/exports.build.txt: lib/exports.txt
 70 | ifeq (i686-w64-mingw32, $(TARGET))
 71 | 	sed -e 's/^/_/' $< > $@
 72 | else
 73 | 	cp -f $< $@
 74 | endif
 75 | 
 76 | lib/libbase64.o: lib/exports.build.txt $(OBJS)
 77 | 	$(LD) -r -o $@ $(OBJS)
 78 | 	$(OBJCOPY) --keep-global-symbols=$< $@
 79 | 
 80 | lib/config.h:
 81 | 	@echo "#define HAVE_AVX512 $(HAVE_AVX512)"  > $@
 82 | 	@echo "#define HAVE_AVX2   $(HAVE_AVX2)"   >> $@
 83 | 	@echo "#define HAVE_NEON32 $(HAVE_NEON32)" >> $@
 84 | 	@echo "#define HAVE_NEON64 $(HAVE_NEON64)" >> $@
 85 | 	@echo "#define HAVE_SSSE3  $(HAVE_SSSE3)"  >> $@
 86 | 	@echo "#define HAVE_SSE41  $(HAVE_SSE41)"  >> $@
 87 | 	@echo "#define HAVE_SSE42  $(HAVE_SSE42)"  >> $@
 88 | 	@echo "#define HAVE_AVX    $(HAVE_AVX)"    >> $@
 89 | 
 90 | $(OBJS): lib/config.h
 91 | $(OBJS): CFLAGS += -Ilib
 92 | 
 93 | lib/arch/avx512/codec.o: CFLAGS += $(AVX512_CFLAGS)
 94 | lib/arch/avx2/codec.o:   CFLAGS += $(AVX2_CFLAGS)
 95 | lib/arch/neon32/codec.o: CFLAGS += $(NEON32_CFLAGS)
 96 | lib/arch/neon64/codec.o: CFLAGS += $(NEON64_CFLAGS)
 97 | lib/arch/ssse3/codec.o:  CFLAGS += $(SSSE3_CFLAGS)
 98 | lib/arch/sse41/codec.o:  CFLAGS += $(SSE41_CFLAGS)
 99 | lib/arch/sse42/codec.o:  CFLAGS += $(SSE42_CFLAGS)
100 | lib/arch/avx/codec.o:    CFLAGS += $(AVX_CFLAGS)
101 | 
102 | %.o: %.c
103 | 	$(CC) $(CFLAGS) -o $@ -c $<
104 | 
105 | analyze: clean
106 | 	scan-build --use-analyzer=`which clang` --status-bugs make
107 | 
108 | clean:
109 | 	rm -f bin/base64 bin/base64.o lib/libbase64.o lib/config.h lib/exports.build.txt $(OBJS)
110 | 


--------------------------------------------------------------------------------
/base64-benchmarks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aklomp/base64/9e8ed65048ff0f703fad3deb03bf66ac7f78a4d7/base64-benchmarks.png


--------------------------------------------------------------------------------
/bin/base64.manifest:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
 2 | <assembly manifestVersion="1.0" xmlns="urn:schemas-microsoft-com:asm.v1">
 3 | 	<assemblyIdentity type="win32" name="base64" version="0.5.2.0" />
 4 | 	<trustInfo xmlns="urn:schemas-microsoft-com:asm.v2">
 5 | 		<security>
 6 | 			<requestedPrivileges xmlns="urn:schemas-microsoft-com:asm.v3">
 7 | 				<requestedExecutionLevel level="asInvoker" uiAccess="false" />
 8 | 			</requestedPrivileges>
 9 | 		</security>
10 | 	</trustInfo>
11 | 	<compatibility xmlns="urn:schemas-microsoft-com:compatibility.v1">
12 | 		<application>
13 | 			<!-- Windows 10 and Windows 11 -->
14 | 			<supportedOS Id="{8e0f7a12-bfb3-4fe8-b9a5-48fd50a15a9a}" />
15 | 		</application>
16 | 	</compatibility>
17 | 	<application>
18 | 		<windowsSettings>
19 | 			<activeCodePage
20 | 				xmlns="http://schemas.microsoft.com/SMI/2019/WindowsSettings">UTF-8</activeCodePage>
21 | 		</windowsSettings>
22 | 	</application>
23 | </assembly>
24 | 


--------------------------------------------------------------------------------
/bin/base64.rc:
--------------------------------------------------------------------------------
1 | #include "winuser.h"
2 | CREATEPROCESS_MANIFEST_RESOURCE_ID RT_MANIFEST "base64.manifest"
3 | 


--------------------------------------------------------------------------------
/cmake/Modules/TargetArch.cmake:
--------------------------------------------------------------------------------
 1 | # Written in 2017 by Henrik Steffen Gaßmann henrik@gassmann.onl
 2 | #
 3 | # To the extent possible under law, the author(s) have dedicated all
 4 | # copyright and related and neighboring rights to this software to the
 5 | # public domain worldwide. This software is distributed without any warranty.
 6 | #
 7 | # You should have received a copy of the CC0 Public Domain Dedication
 8 | # along with this software. If not, see
 9 | #
10 | #     http://creativecommons.org/publicdomain/zero/1.0/
11 | #
12 | ########################################################################
13 | 
14 | set(TARGET_ARCHITECTURE_TEST_FILE "${CMAKE_CURRENT_LIST_DIR}/../test-arch.c")
15 | 
16 | function(detect_target_architecture OUTPUT_VARIABLE)
17 |     message(STATUS "${CMAKE_CURRENT_LIST_DIR}")
18 |     try_compile(_IGNORED "${CMAKE_CURRENT_BINARY_DIR}"
19 |         "${TARGET_ARCHITECTURE_TEST_FILE}"
20 |         OUTPUT_VARIABLE _LOG
21 |     )
22 | 
23 |     string(REGEX MATCH "##arch=([^#]+)##" _IGNORED "${_LOG}")
24 | 
25 |     set(${OUTPUT_VARIABLE} "${CMAKE_MATCH_1}" PARENT_SCOPE)
26 |     set("${OUTPUT_VARIABLE}_${CMAKE_MATCH_1}" 1 PARENT_SCOPE)
27 |     if (CMAKE_MATCH_1 STREQUAL "unknown")
28 |         message(WARNING "could not detect the target architecture.")
29 |     endif()
30 | endfunction()
31 | 


--------------------------------------------------------------------------------
/cmake/Modules/TargetSIMDInstructionSet.cmake:
--------------------------------------------------------------------------------
 1 | # Written in 2016-2017 by Henrik Steffen Gaßmann henrik@gassmann.onl
 2 | #
 3 | # To the extent possible under law, the author(s) have dedicated all
 4 | # copyright and related and neighboring rights to this software to the
 5 | # public domain worldwide. This software is distributed without any warranty.
 6 | #
 7 | # You should have received a copy of the CC0 Public Domain Dedication
 8 | # along with this software. If not, see
 9 | #
10 | #     http://creativecommons.org/publicdomain/zero/1.0/
11 | #
12 | ########################################################################
13 | 
14 | ########################################################################
15 | # compiler flags definition
16 | macro(define_SIMD_compile_flags)
17 |     if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang" OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang")
18 |         # x86
19 |         set(COMPILE_FLAGS_SSSE3 "-mssse3")
20 |         set(COMPILE_FLAGS_SSE41 "-msse4.1")
21 |         set(COMPILE_FLAGS_SSE42 "-msse4.2")
22 |         set(COMPILE_FLAGS_AVX "-mavx")
23 |         set(COMPILE_FLAGS_AVX2 "-mavx2")
24 |         set(COMPILE_FLAGS_AVX512 "-mavx512vl -mavx512vbmi")
25 | 
26 |         #arm
27 |         set(COMPILE_FLAGS_NEON32 "-mfpu=neon")
28 |     elseif(MSVC)
29 |         set(COMPILE_FLAGS_SSSE3 " ")
30 |         set(COMPILE_FLAGS_SSE41 " ")
31 |         set(COMPILE_FLAGS_SSE42 " ")
32 |         set(COMPILE_FLAGS_AVX "/arch:AVX")
33 |         set(COMPILE_FLAGS_AVX2 "/arch:AVX2")
34 |         set(COMPILE_FLAGS_AVX512 "/arch:AVX512")
35 |     endif()
36 | endmacro(define_SIMD_compile_flags)
37 | 


--------------------------------------------------------------------------------
/cmake/base64-config.cmake.in:
--------------------------------------------------------------------------------
1 | @PACKAGE_INIT@
2 | 
3 | include("${CMAKE_CURRENT_LIST_DIR}/base64-targets.cmake")
4 | 
5 | check_required_components(base64)
6 | 


--------------------------------------------------------------------------------
/cmake/config.h.in:
--------------------------------------------------------------------------------
 1 | #ifndef BASE64_CONFIG_H
 2 | #define BASE64_CONFIG_H
 3 | 
 4 | #cmakedefine01 BASE64_WITH_SSSE3
 5 | #define HAVE_SSSE3 BASE64_WITH_SSSE3
 6 | 
 7 | #cmakedefine01 BASE64_WITH_SSE41
 8 | #define HAVE_SSE41 BASE64_WITH_SSE41
 9 | 
10 | #cmakedefine01 BASE64_WITH_SSE42
11 | #define HAVE_SSE42 BASE64_WITH_SSE42
12 | 
13 | #cmakedefine01 BASE64_WITH_AVX
14 | #define HAVE_AVX BASE64_WITH_AVX
15 | 
16 | #cmakedefine01 BASE64_WITH_AVX2
17 | #define HAVE_AVX2 BASE64_WITH_AVX2
18 | 
19 | #cmakedefine01 BASE64_WITH_AVX512
20 | #define HAVE_AVX512 BASE64_WITH_AVX512
21 | 
22 | #cmakedefine01 BASE64_WITH_NEON32
23 | #define HAVE_NEON32 BASE64_WITH_NEON32
24 | 
25 | #cmakedefine01 BASE64_WITH_NEON64
26 | #define HAVE_NEON64 BASE64_WITH_NEON64
27 | 
28 | #endif // BASE64_CONFIG_H
29 | 


--------------------------------------------------------------------------------
/cmake/test-arch.c:
--------------------------------------------------------------------------------
 1 | // Written in 2017 by Henrik Steffen Gaßmann henrik@gassmann.onl
 2 | //
 3 | // To the extent possible under law, the author(s) have dedicated all
 4 | // copyright and related and neighboring rights to this software to the
 5 | // public domain worldwide. This software is distributed without any warranty.
 6 | //
 7 | // You should have received a copy of the CC0 Public Domain Dedication
 8 | // along with this software. If not, see
 9 | //
10 | //     http://creativecommons.org/publicdomain/zero/1.0/
11 | //
12 | ////////////////////////////////////////////////////////////////////////////////
13 | 
14 | // ARM 64-Bit
15 | #if defined(__aarch64__) \
16 |     || defined(_M_ARM64)
17 | #error ##arch=arm64##
18 | 
19 | // ARM 32-Bit
20 | #elif defined(__arm__) \
21 |     || defined(_M_ARM)
22 | #error ##arch=arm##
23 | 
24 | // x86 64-Bit
25 | #elif defined(__x86_64__) \
26 |     || defined(_M_X64)
27 | #error ##arch=x64##
28 | 
29 | // x86 32-Bit
30 | #elif defined(__i386__) \
31 |     || defined(_M_IX86)
32 | #error ##arch=x86##
33 | 
34 | #else
35 | #error ##arch=unknown##
36 | #endif
37 | 


--------------------------------------------------------------------------------
/include/libbase64.h:
--------------------------------------------------------------------------------
  1 | #ifndef LIBBASE64_H
  2 | #define LIBBASE64_H
  3 | 
  4 | #include <stddef.h>	/* size_t */
  5 | 
  6 | 
  7 | #if defined(_WIN32) || defined(__CYGWIN__)
  8 | #define BASE64_SYMBOL_IMPORT __declspec(dllimport)
  9 | #define BASE64_SYMBOL_EXPORT __declspec(dllexport)
 10 | #define BASE64_SYMBOL_PRIVATE
 11 | 
 12 | #elif __GNUC__ >= 4
 13 | #define BASE64_SYMBOL_IMPORT   __attribute__ ((visibility ("default")))
 14 | #define BASE64_SYMBOL_EXPORT   __attribute__ ((visibility ("default")))
 15 | #define BASE64_SYMBOL_PRIVATE  __attribute__ ((visibility ("hidden")))
 16 | 
 17 | #else
 18 | #define BASE64_SYMBOL_IMPORT
 19 | #define BASE64_SYMBOL_EXPORT
 20 | #define BASE64_SYMBOL_PRIVATE
 21 | #endif
 22 | 
 23 | #if defined(BASE64_STATIC_DEFINE)
 24 | #define BASE64_EXPORT
 25 | #define BASE64_NO_EXPORT
 26 | 
 27 | #else
 28 | #if defined(BASE64_EXPORTS) // defined if we are building the shared library
 29 | #define BASE64_EXPORT BASE64_SYMBOL_EXPORT
 30 | 
 31 | #else
 32 | #define BASE64_EXPORT BASE64_SYMBOL_IMPORT
 33 | #endif
 34 | 
 35 | #define BASE64_NO_EXPORT BASE64_SYMBOL_PRIVATE
 36 | #endif
 37 | 
 38 | 
 39 | #ifdef __cplusplus
 40 | extern "C" {
 41 | #endif
 42 | 
 43 | /* These are the flags that can be passed in the `flags` argument. The values
 44 |  * below force the use of a given codec, even if that codec is a no-op in the
 45 |  * current build. Used in testing. Set to 0 for the default behavior, which is
 46 |  * runtime feature detection on x86, a compile-time fixed codec on ARM, and
 47 |  * the plain codec on other platforms: */
 48 | #define BASE64_FORCE_AVX2	(1 << 0)
 49 | #define BASE64_FORCE_NEON32	(1 << 1)
 50 | #define BASE64_FORCE_NEON64	(1 << 2)
 51 | #define BASE64_FORCE_PLAIN	(1 << 3)
 52 | #define BASE64_FORCE_SSSE3	(1 << 4)
 53 | #define BASE64_FORCE_SSE41	(1 << 5)
 54 | #define BASE64_FORCE_SSE42	(1 << 6)
 55 | #define BASE64_FORCE_AVX	(1 << 7)
 56 | #define BASE64_FORCE_AVX512	(1 << 8)
 57 | 
 58 | struct base64_state {
 59 | 	int eof;
 60 | 	int bytes;
 61 | 	int flags;
 62 | 	unsigned char carry;
 63 | };
 64 | 
 65 | /* Wrapper function to encode a plain string of given length. Output is written
 66 |  * to *out without trailing zero. Output length in bytes is written to *outlen.
 67 |  * The buffer in `out` has been allocated by the caller and is at least 4/3 the
 68 |  * size of the input. See above for `flags`; set to 0 for default operation: */
 69 | void BASE64_EXPORT base64_encode
 70 | 	( const char		*src
 71 | 	, size_t		 srclen
 72 | 	, char			*out
 73 | 	, size_t		*outlen
 74 | 	, int			 flags
 75 | 	) ;
 76 | 
 77 | /* Call this before calling base64_stream_encode() to init the state. See above
 78 |  * for `flags`; set to 0 for default operation: */
 79 | void BASE64_EXPORT base64_stream_encode_init
 80 | 	( struct base64_state	*state
 81 | 	, int			 flags
 82 | 	) ;
 83 | 
 84 | /* Encodes the block of data of given length at `src`, into the buffer at
 85 |  * `out`. Caller is responsible for allocating a large enough out-buffer; it
 86 |  * must be at least 4/3 the size of the in-buffer, but take some margin. Places
 87 |  * the number of new bytes written into `outlen` (which is set to zero when the
 88 |  * function starts). Does not zero-terminate or finalize the output. */
 89 | void BASE64_EXPORT base64_stream_encode
 90 | 	( struct base64_state	*state
 91 | 	, const char		*src
 92 | 	, size_t		 srclen
 93 | 	, char			*out
 94 | 	, size_t		*outlen
 95 | 	) ;
 96 | 
 97 | /* Finalizes the output begun by previous calls to `base64_stream_encode()`.
 98 |  * Adds the required end-of-stream markers if appropriate. `outlen` is modified
 99 |  * and will contain the number of new bytes written at `out` (which will quite
100 |  * often be zero). */
101 | void BASE64_EXPORT base64_stream_encode_final
102 | 	( struct base64_state	*state
103 | 	, char			*out
104 | 	, size_t		*outlen
105 | 	) ;
106 | 
107 | /* Wrapper function to decode a plain string of given length. Output is written
108 |  * to *out without trailing zero. Output length in bytes is written to *outlen.
109 |  * The buffer in `out` has been allocated by the caller and is at least 3/4 the
110 |  * size of the input. See above for `flags`, set to 0 for default operation: */
111 | int BASE64_EXPORT base64_decode
112 | 	( const char		*src
113 | 	, size_t		 srclen
114 | 	, char			*out
115 | 	, size_t		*outlen
116 | 	, int			 flags
117 | 	) ;
118 | 
119 | /* Call this before calling base64_stream_decode() to init the state. See above
120 |  * for `flags`; set to 0 for default operation: */
121 | void BASE64_EXPORT base64_stream_decode_init
122 | 	( struct base64_state	*state
123 | 	, int			 flags
124 | 	) ;
125 | 
126 | /* Decodes the block of data of given length at `src`, into the buffer at
127 |  * `out`. Caller is responsible for allocating a large enough out-buffer; it
128 |  * must be at least 3/4 the size of the in-buffer, but take some margin. Places
129 |  * the number of new bytes written into `outlen` (which is set to zero when the
130 |  * function starts). Does not zero-terminate the output. Returns 1 if all is
131 |  * well, and 0 if a decoding error was found, such as an invalid character.
132 |  * Returns -1 if the chosen codec is not included in the current build. Used by
133 |  * the test harness to check whether a codec is available for testing. */
134 | int BASE64_EXPORT base64_stream_decode
135 | 	( struct base64_state	*state
136 | 	, const char		*src
137 | 	, size_t		 srclen
138 | 	, char			*out
139 | 	, size_t		*outlen
140 | 	) ;
141 | 
142 | #ifdef __cplusplus
143 | }
144 | #endif
145 | 
146 | #endif /* LIBBASE64_H */
147 | 


--------------------------------------------------------------------------------
/lib/arch/avx/codec.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stddef.h>
 3 | #include <stdlib.h>
 4 | 
 5 | #include "../../../include/libbase64.h"
 6 | #include "../../tables/tables.h"
 7 | #include "../../codecs.h"
 8 | #include "config.h"
 9 | #include "../../env.h"
10 | 
11 | #if HAVE_AVX
12 | #include <immintrin.h>
13 | 
14 | // Only enable inline assembly on supported compilers and on 64-bit CPUs.
15 | #ifndef BASE64_AVX_USE_ASM
16 | # if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
17 | #  define BASE64_AVX_USE_ASM 1
18 | # else
19 | #  define BASE64_AVX_USE_ASM 0
20 | # endif
21 | #endif
22 | 
23 | #include "../ssse3/dec_reshuffle.c"
24 | #include "../ssse3/dec_loop.c"
25 | 
26 | #if BASE64_AVX_USE_ASM
27 | # include "enc_loop_asm.c"
28 | #else
29 | # include "../ssse3/enc_translate.c"
30 | # include "../ssse3/enc_reshuffle.c"
31 | # include "../ssse3/enc_loop.c"
32 | #endif
33 | 
34 | #endif	// HAVE_AVX
35 | 
36 | void
37 | base64_stream_encode_avx BASE64_ENC_PARAMS
38 | {
39 | #if HAVE_AVX
40 | 	#include "../generic/enc_head.c"
41 | 
42 | 	// For supported compilers, use a hand-optimized inline assembly
43 | 	// encoder. Otherwise fall back on the SSSE3 encoder, but compiled with
44 | 	// AVX flags to generate better optimized AVX code.
45 | 
46 | #if BASE64_AVX_USE_ASM
47 | 	enc_loop_avx(&s, &slen, &o, &olen);
48 | #else
49 | 	enc_loop_ssse3(&s, &slen, &o, &olen);
50 | #endif
51 | 
52 | 	#include "../generic/enc_tail.c"
53 | #else
54 | 	base64_enc_stub(state, src, srclen, out, outlen);
55 | #endif
56 | }
57 | 
58 | int
59 | base64_stream_decode_avx BASE64_DEC_PARAMS
60 | {
61 | #if HAVE_AVX
62 | 	#include "../generic/dec_head.c"
63 | 	dec_loop_ssse3(&s, &slen, &o, &olen);
64 | 	#include "../generic/dec_tail.c"
65 | #else
66 | 	return base64_dec_stub(state, src, srclen, out, outlen);
67 | #endif
68 | }
69 | 


--------------------------------------------------------------------------------
/lib/arch/avx/enc_loop_asm.c:
--------------------------------------------------------------------------------
  1 | // Apologies in advance for combining the preprocessor with inline assembly,
  2 | // two notoriously gnarly parts of C, but it was necessary to avoid a lot of
  3 | // code repetition. The preprocessor is used to template large sections of
  4 | // inline assembly that differ only in the registers used. If the code was
  5 | // written out by hand, it would become very large and hard to audit.
  6 | 
  7 | // Generate a block of inline assembly that loads register R0 from memory. The
  8 | // offset at which the register is loaded is set by the given round.
  9 | #define LOAD(R0, ROUND) \
 10 | 	"vlddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t"
 11 | 
 12 | // Generate a block of inline assembly that deinterleaves and shuffles register
 13 | // R0 using preloaded constants. Outputs in R0 and R1.
 14 | #define SHUF(R0, R1, R2) \
 15 | 	"vpshufb  %[lut0], %["R0"], %["R1"] \n\t" \
 16 | 	"vpand    %["R1"], %[msk0], %["R2"] \n\t" \
 17 | 	"vpand    %["R1"], %[msk2], %["R1"] \n\t" \
 18 | 	"vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
 19 | 	"vpmullw  %["R1"], %[msk3], %["R1"] \n\t" \
 20 | 	"vpor     %["R1"], %["R2"], %["R1"] \n\t"
 21 | 
 22 | // Generate a block of inline assembly that takes R0 and R1 and translates
 23 | // their contents to the base64 alphabet, using preloaded constants.
 24 | #define TRAN(R0, R1, R2) \
 25 | 	"vpsubusb %[n51],  %["R1"], %["R0"] \n\t" \
 26 | 	"vpcmpgtb %[n25],  %["R1"], %["R2"] \n\t" \
 27 | 	"vpsubb   %["R2"], %["R0"], %["R0"] \n\t" \
 28 | 	"vpshufb  %["R0"], %[lut1], %["R2"] \n\t" \
 29 | 	"vpaddb   %["R1"], %["R2"], %["R0"] \n\t"
 30 | 
 31 | // Generate a block of inline assembly that stores the given register R0 at an
 32 | // offset set by the given round.
 33 | #define STOR(R0, ROUND) \
 34 | 	"vmovdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t"
 35 | 
 36 | // Generate a block of inline assembly that generates a single self-contained
 37 | // encoder round: fetch the data, process it, and store the result. Then update
 38 | // the source and destination pointers.
 39 | #define ROUND() \
 40 | 	LOAD("a", 0) \
 41 | 	SHUF("a", "b", "c") \
 42 | 	TRAN("a", "b", "c") \
 43 | 	STOR("a", 0) \
 44 | 	"add $12, %[src] \n\t" \
 45 | 	"add $16, %[dst] \n\t"
 46 | 
 47 | // Define a macro that initiates a three-way interleaved encoding round by
 48 | // preloading registers a, b and c from memory.
 49 | // The register graph shows which registers are in use during each step, and
 50 | // is a visual aid for choosing registers for that step. Symbol index:
 51 | //
 52 | //  +  indicates that a register is loaded by that step.
 53 | //  |  indicates that a register is in use and must not be touched.
 54 | //  -  indicates that a register is decommissioned by that step.
 55 | //  x  indicates that a register is used as a temporary by that step.
 56 | //  V  indicates that a register is an input or output to the macro.
 57 | //
 58 | #define ROUND_3_INIT() 			/*  a b c d e f  */ \
 59 | 	LOAD("a",   0)			/*  +            */ \
 60 | 	SHUF("a", "d", "e")		/*  |     + x    */ \
 61 | 	LOAD("b",   1)			/*  | +   |      */ \
 62 | 	TRAN("a", "d", "e")		/*  | |   - x    */ \
 63 | 	LOAD("c",   2)			/*  V V V        */
 64 | 
 65 | // Define a macro that translates, shuffles and stores the input registers A, B
 66 | // and C, and preloads registers D, E and F for the next round.
 67 | // This macro can be arbitrarily daisy-chained by feeding output registers D, E
 68 | // and F back into the next round as input registers A, B and C. The macro
 69 | // carefully interleaves memory operations with data operations for optimal
 70 | // pipelined performance.
 71 | 
 72 | #define ROUND_3(ROUND, A,B,C,D,E,F) 	/*  A B C D E F  */ \
 73 | 	LOAD(D, (ROUND + 3))		/*  V V V +      */ \
 74 | 	SHUF(B, E, F)			/*  | | | | + x  */ \
 75 | 	STOR(A, (ROUND + 0))		/*  - | | | |    */ \
 76 | 	TRAN(B, E, F)			/*    | | | - x  */ \
 77 | 	LOAD(E, (ROUND + 4))		/*    | | | +    */ \
 78 | 	SHUF(C, A, F)			/*  + | | | | x  */ \
 79 | 	STOR(B, (ROUND + 1))		/*  | - | | |    */ \
 80 | 	TRAN(C, A, F)			/*  -   | | | x  */ \
 81 | 	LOAD(F, (ROUND + 5))		/*      | | | +  */ \
 82 | 	SHUF(D, A, B)			/*  + x | | | |  */ \
 83 | 	STOR(C, (ROUND + 2))		/*  |   - | | |  */ \
 84 | 	TRAN(D, A, B)			/*  - x   V V V  */
 85 | 
 86 | // Define a macro that terminates a ROUND_3 macro by taking pre-loaded
 87 | // registers D, E and F, and translating, shuffling and storing them.
 88 | #define ROUND_3_END(ROUND, A,B,C,D,E,F)	/*  A B C D E F  */ \
 89 | 	SHUF(E, A, B)			/*  + x   V V V  */ \
 90 | 	STOR(D, (ROUND + 3))		/*  |     - | |  */ \
 91 | 	TRAN(E, A, B)			/*  - x     | |  */ \
 92 | 	SHUF(F, C, D)			/*      + x | |  */ \
 93 | 	STOR(E, (ROUND + 4))		/*      |   - |  */ \
 94 | 	TRAN(F, C, D)			/*      - x   |  */ \
 95 | 	STOR(F, (ROUND + 5))		/*            -  */
 96 | 
 97 | // Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
 98 | #define ROUND_3_A(ROUND) \
 99 | 	ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
100 | 
101 | // Define a type B round. Inputs and outputs are swapped with regard to type A.
102 | #define ROUND_3_B(ROUND) \
103 | 	ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
104 | 
105 | // Terminating macro for a type A round.
106 | #define ROUND_3_A_LAST(ROUND) \
107 | 	ROUND_3_A(ROUND) \
108 | 	ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
109 | 
110 | // Terminating macro for a type B round.
111 | #define ROUND_3_B_LAST(ROUND) \
112 | 	ROUND_3_B(ROUND) \
113 | 	ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
114 | 
115 | // Suppress clang's warning that the literal string in the asm statement is
116 | // overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
117 | // compilers). It may be true, but the goal here is not C99 portability.
118 | #pragma GCC diagnostic push
119 | #pragma GCC diagnostic ignored "-Woverlength-strings"
120 | 
121 | static inline void
122 | enc_loop_avx (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
123 | {
124 | 	// For a clearer explanation of the algorithm used by this function,
125 | 	// please refer to the plain (not inline assembly) implementation. This
126 | 	// function follows the same basic logic.
127 | 
128 | 	if (*slen < 16) {
129 | 		return;
130 | 	}
131 | 
132 | 	// Process blocks of 12 bytes at a time. Input is read in blocks of 16
133 | 	// bytes, so "reserve" four bytes from the input buffer to ensure that
134 | 	// we never read beyond the end of the input buffer.
135 | 	size_t rounds = (*slen - 4) / 12;
136 | 
137 | 	*slen -= rounds * 12;   // 12 bytes consumed per round
138 | 	*olen += rounds * 16;   // 16 bytes produced per round
139 | 
140 | 	// Number of times to go through the 36x loop.
141 | 	size_t loops = rounds / 36;
142 | 
143 | 	// Number of rounds remaining after the 36x loop.
144 | 	rounds %= 36;
145 | 
146 | 	// Lookup tables.
147 | 	const __m128i lut0 = _mm_set_epi8(
148 | 		10, 11,  9, 10,  7,  8,  6,  7,  4,  5,  3,  4,  1,  2,  0,  1);
149 | 
150 | 	const __m128i lut1 = _mm_setr_epi8(
151 | 		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
152 | 
153 | 	// Temporary registers.
154 | 	__m128i a, b, c, d, e, f;
155 | 
156 | 	__asm__ volatile (
157 | 
158 | 		// If there are 36 rounds or more, enter a 36x unrolled loop of
159 | 		// interleaved encoding rounds. The rounds interleave memory
160 | 		// operations (load/store) with data operations (table lookups,
161 | 		// etc) to maximize pipeline throughput.
162 | 		"    test %[loops], %[loops] \n\t"
163 | 		"    jz   18f                \n\t"
164 | 		"    jmp  36f                \n\t"
165 | 		"                            \n\t"
166 | 		".balign 64                  \n\t"
167 | 		"36: " ROUND_3_INIT()
168 | 		"    " ROUND_3_A( 0)
169 | 		"    " ROUND_3_B( 3)
170 | 		"    " ROUND_3_A( 6)
171 | 		"    " ROUND_3_B( 9)
172 | 		"    " ROUND_3_A(12)
173 | 		"    " ROUND_3_B(15)
174 | 		"    " ROUND_3_A(18)
175 | 		"    " ROUND_3_B(21)
176 | 		"    " ROUND_3_A(24)
177 | 		"    " ROUND_3_B(27)
178 | 		"    " ROUND_3_A_LAST(30)
179 | 		"    add $(12 * 36), %[src] \n\t"
180 | 		"    add $(16 * 36), %[dst] \n\t"
181 | 		"    dec %[loops]           \n\t"
182 | 		"    jnz 36b                \n\t"
183 | 
184 | 		// Enter an 18x unrolled loop for rounds of 18 or more.
185 | 		"18: cmp $18, %[rounds] \n\t"
186 | 		"    jl  9f             \n\t"
187 | 		"    " ROUND_3_INIT()
188 | 		"    " ROUND_3_A(0)
189 | 		"    " ROUND_3_B(3)
190 | 		"    " ROUND_3_A(6)
191 | 		"    " ROUND_3_B(9)
192 | 		"    " ROUND_3_A_LAST(12)
193 | 		"    sub $18,        %[rounds] \n\t"
194 | 		"    add $(12 * 18), %[src]    \n\t"
195 | 		"    add $(16 * 18), %[dst]    \n\t"
196 | 
197 | 		// Enter a 9x unrolled loop for rounds of 9 or more.
198 | 		"9:  cmp $9, %[rounds] \n\t"
199 | 		"    jl  6f            \n\t"
200 | 		"    " ROUND_3_INIT()
201 | 		"    " ROUND_3_A(0)
202 | 		"    " ROUND_3_B_LAST(3)
203 | 		"    sub $9,        %[rounds] \n\t"
204 | 		"    add $(12 * 9), %[src]    \n\t"
205 | 		"    add $(16 * 9), %[dst]    \n\t"
206 | 
207 | 		// Enter a 6x unrolled loop for rounds of 6 or more.
208 | 		"6:  cmp $6, %[rounds] \n\t"
209 | 		"    jl  55f           \n\t"
210 | 		"    " ROUND_3_INIT()
211 | 		"    " ROUND_3_A_LAST(0)
212 | 		"    sub $6,        %[rounds] \n\t"
213 | 		"    add $(12 * 6), %[src]    \n\t"
214 | 		"    add $(16 * 6), %[dst]    \n\t"
215 | 
216 | 		// Dispatch the remaining rounds 0..5.
217 | 		"55: cmp $3, %[rounds] \n\t"
218 | 		"    jg  45f           \n\t"
219 | 		"    je  3f            \n\t"
220 | 		"    cmp $1, %[rounds] \n\t"
221 | 		"    jg  2f            \n\t"
222 | 		"    je  1f            \n\t"
223 | 		"    jmp 0f            \n\t"
224 | 
225 | 		"45: cmp $4, %[rounds] \n\t"
226 | 		"    je  4f            \n\t"
227 | 
228 | 		// Block of non-interlaced encoding rounds, which can each
229 | 		// individually be jumped to. Rounds fall through to the next.
230 | 		"5: " ROUND()
231 | 		"4: " ROUND()
232 | 		"3: " ROUND()
233 | 		"2: " ROUND()
234 | 		"1: " ROUND()
235 | 		"0: \n\t"
236 | 
237 | 		// Outputs (modified).
238 | 		: [rounds] "+r"  (rounds),
239 | 		  [loops]  "+r"  (loops),
240 | 		  [src]    "+r"  (*s),
241 | 		  [dst]    "+r"  (*o),
242 | 		  [a]      "=&x" (a),
243 | 		  [b]      "=&x" (b),
244 | 		  [c]      "=&x" (c),
245 | 		  [d]      "=&x" (d),
246 | 		  [e]      "=&x" (e),
247 | 		  [f]      "=&x" (f)
248 | 
249 | 		// Inputs (not modified).
250 | 		: [lut0] "x" (lut0),
251 | 		  [lut1] "x" (lut1),
252 | 		  [msk0] "x" (_mm_set1_epi32(0x0FC0FC00)),
253 | 		  [msk1] "x" (_mm_set1_epi32(0x04000040)),
254 | 		  [msk2] "x" (_mm_set1_epi32(0x003F03F0)),
255 | 		  [msk3] "x" (_mm_set1_epi32(0x01000010)),
256 | 		  [n51]  "x" (_mm_set1_epi8(51)),
257 | 		  [n25]  "x" (_mm_set1_epi8(25))
258 | 
259 | 		// Clobbers.
260 | 		: "cc", "memory"
261 | 	);
262 | }
263 | 
264 | #pragma GCC diagnostic pop
265 | 


--------------------------------------------------------------------------------
/lib/arch/avx2/codec.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stddef.h>
 3 | #include <stdlib.h>
 4 | 
 5 | #include "../../../include/libbase64.h"
 6 | #include "../../tables/tables.h"
 7 | #include "../../codecs.h"
 8 | #include "config.h"
 9 | #include "../../env.h"
10 | 
11 | #if HAVE_AVX2
12 | #include <immintrin.h>
13 | 
14 | // Only enable inline assembly on supported compilers and on 64-bit CPUs.
15 | #ifndef BASE64_AVX2_USE_ASM
16 | # if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
17 | #  define BASE64_AVX2_USE_ASM 1
18 | # else
19 | #  define BASE64_AVX2_USE_ASM 0
20 | # endif
21 | #endif
22 | 
23 | #include "dec_reshuffle.c"
24 | #include "dec_loop.c"
25 | 
26 | #if BASE64_AVX2_USE_ASM
27 | # include "enc_loop_asm.c"
28 | #else
29 | # include "enc_translate.c"
30 | # include "enc_reshuffle.c"
31 | # include "enc_loop.c"
32 | #endif
33 | 
34 | #endif	// HAVE_AVX2
35 | 
36 | void
37 | base64_stream_encode_avx2 BASE64_ENC_PARAMS
38 | {
39 | #if HAVE_AVX2
40 | 	#include "../generic/enc_head.c"
41 | 	enc_loop_avx2(&s, &slen, &o, &olen);
42 | 	#include "../generic/enc_tail.c"
43 | #else
44 | 	base64_enc_stub(state, src, srclen, out, outlen);
45 | #endif
46 | }
47 | 
48 | int
49 | base64_stream_decode_avx2 BASE64_DEC_PARAMS
50 | {
51 | #if HAVE_AVX2
52 | 	#include "../generic/dec_head.c"
53 | 	dec_loop_avx2(&s, &slen, &o, &olen);
54 | 	#include "../generic/dec_tail.c"
55 | #else
56 | 	return base64_dec_stub(state, src, srclen, out, outlen);
57 | #endif
58 | }
59 | 


--------------------------------------------------------------------------------
/lib/arch/avx2/dec_loop.c:
--------------------------------------------------------------------------------
  1 | static BASE64_FORCE_INLINE int
  2 | dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
  3 | {
  4 | 	const __m256i lut_lo = _mm256_setr_epi8(
  5 | 		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
  6 | 		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
  7 | 		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
  8 | 		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
  9 | 
 10 | 	const __m256i lut_hi = _mm256_setr_epi8(
 11 | 		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
 12 | 		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
 13 | 		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
 14 | 		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
 15 | 
 16 | 	const __m256i lut_roll = _mm256_setr_epi8(
 17 | 		0,  16,  19,   4, -65, -65, -71, -71,
 18 | 		0,   0,   0,   0,   0,   0,   0,   0,
 19 | 		0,  16,  19,   4, -65, -65, -71, -71,
 20 | 		0,   0,   0,   0,   0,   0,   0,   0);
 21 | 
 22 | 	const __m256i mask_2F = _mm256_set1_epi8(0x2F);
 23 | 
 24 | 	// Load input:
 25 | 	__m256i str = _mm256_loadu_si256((__m256i *) *s);
 26 | 
 27 | 	// See the SSSE3 decoder for an explanation of the algorithm.
 28 | 	const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
 29 | 	const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
 30 | 	const __m256i hi         = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
 31 | 	const __m256i lo         = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
 32 | 
 33 | 	if (!_mm256_testz_si256(lo, hi)) {
 34 | 		return 0;
 35 | 	}
 36 | 
 37 | 	const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
 38 | 	const __m256i roll  = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
 39 | 
 40 | 	// Now simply add the delta values to the input:
 41 | 	str = _mm256_add_epi8(str, roll);
 42 | 
 43 | 	// Reshuffle the input to packed 12-byte output format:
 44 | 	str = dec_reshuffle(str);
 45 | 
 46 | 	// Store the output:
 47 | 	_mm256_storeu_si256((__m256i *) *o, str);
 48 | 
 49 | 	*s += 32;
 50 | 	*o += 24;
 51 | 	*rounds -= 1;
 52 | 
 53 | 	return 1;
 54 | }
 55 | 
 56 | static inline void
 57 | dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 58 | {
 59 | 	if (*slen < 45) {
 60 | 		return;
 61 | 	}
 62 | 
 63 | 	// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
 64 | 	// written after the output, ensure that there will be at least 13
 65 | 	// bytes of input data left to cover the gap. (11 data bytes and up to
 66 | 	// two end-of-string markers.)
 67 | 	size_t rounds = (*slen - 13) / 32;
 68 | 
 69 | 	*slen -= rounds * 32;	// 32 bytes consumed per round
 70 | 	*olen += rounds * 24;	// 24 bytes produced per round
 71 | 
 72 | 	do {
 73 | 		if (rounds >= 8) {
 74 | 			if (dec_loop_avx2_inner(s, o, &rounds) &&
 75 | 			    dec_loop_avx2_inner(s, o, &rounds) &&
 76 | 			    dec_loop_avx2_inner(s, o, &rounds) &&
 77 | 			    dec_loop_avx2_inner(s, o, &rounds) &&
 78 | 			    dec_loop_avx2_inner(s, o, &rounds) &&
 79 | 			    dec_loop_avx2_inner(s, o, &rounds) &&
 80 | 			    dec_loop_avx2_inner(s, o, &rounds) &&
 81 | 			    dec_loop_avx2_inner(s, o, &rounds)) {
 82 | 				continue;
 83 | 			}
 84 | 			break;
 85 | 		}
 86 | 		if (rounds >= 4) {
 87 | 			if (dec_loop_avx2_inner(s, o, &rounds) &&
 88 | 			    dec_loop_avx2_inner(s, o, &rounds) &&
 89 | 			    dec_loop_avx2_inner(s, o, &rounds) &&
 90 | 			    dec_loop_avx2_inner(s, o, &rounds)) {
 91 | 				continue;
 92 | 			}
 93 | 			break;
 94 | 		}
 95 | 		if (rounds >= 2) {
 96 | 			if (dec_loop_avx2_inner(s, o, &rounds) &&
 97 | 			    dec_loop_avx2_inner(s, o, &rounds)) {
 98 | 				continue;
 99 | 			}
100 | 			break;
101 | 		}
102 | 		dec_loop_avx2_inner(s, o, &rounds);
103 | 		break;
104 | 
105 | 	} while (rounds > 0);
106 | 
107 | 	// Adjust for any rounds that were skipped:
108 | 	*slen += rounds * 32;
109 | 	*olen -= rounds * 24;
110 | }
111 | 


--------------------------------------------------------------------------------
/lib/arch/avx2/dec_reshuffle.c:
--------------------------------------------------------------------------------
 1 | static BASE64_FORCE_INLINE __m256i
 2 | dec_reshuffle (const __m256i in)
 3 | {
 4 | 	// in, lower lane, bits, upper case are most significant bits, lower
 5 | 	// case are least significant bits:
 6 | 	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
 7 | 	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
 8 | 	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
 9 | 	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
10 | 
11 | 	const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
12 | 	// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
13 | 	// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
14 | 	// 0000eeee FFffffff 0000DDDD DDddEEEE
15 | 	// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
16 | 
17 | 	__m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
18 | 	// 00000000 JJJJJJjj KKKKkkkk LLllllll
19 | 	// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
20 | 	// 00000000 DDDDDDdd EEEEeeee FFffffff
21 | 	// 00000000 AAAAAAaa BBBBbbbb CCcccccc
22 | 
23 | 	// Pack bytes together in each lane:
24 | 	out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
25 | 		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
26 | 		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
27 | 	// 00000000 00000000 00000000 00000000
28 | 	// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
29 | 	// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
30 | 	// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
31 | 
32 | 	// Pack lanes:
33 | 	return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
34 | }
35 | 


--------------------------------------------------------------------------------
/lib/arch/avx2/enc_loop.c:
--------------------------------------------------------------------------------
 1 | static BASE64_FORCE_INLINE void
 2 | enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
 3 | {
 4 | 	// First load is done at s - 0 to not get a segfault:
 5 | 	__m256i src = _mm256_loadu_si256((__m256i *) *s);
 6 | 
 7 | 	// Shift by 4 bytes, as required by enc_reshuffle:
 8 | 	src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
 9 | 
10 | 	// Reshuffle, translate, store:
11 | 	src = enc_reshuffle(src);
12 | 	src = enc_translate(src);
13 | 	_mm256_storeu_si256((__m256i *) *o, src);
14 | 
15 | 	// Subsequent loads will be done at s - 4, set pointer for next round:
16 | 	*s += 20;
17 | 	*o += 32;
18 | }
19 | 
20 | static BASE64_FORCE_INLINE void
21 | enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
22 | {
23 | 	// Load input:
24 | 	__m256i src = _mm256_loadu_si256((__m256i *) *s);
25 | 
26 | 	// Reshuffle, translate, store:
27 | 	src = enc_reshuffle(src);
28 | 	src = enc_translate(src);
29 | 	_mm256_storeu_si256((__m256i *) *o, src);
30 | 
31 | 	*s += 24;
32 | 	*o += 32;
33 | }
34 | 
35 | static inline void
36 | enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
37 | {
38 | 	if (*slen < 32) {
39 | 		return;
40 | 	}
41 | 
42 | 	// Process blocks of 24 bytes at a time. Because blocks are loaded 32
43 | 	// bytes at a time an offset of -4, ensure that there will be at least
44 | 	// 4 remaining bytes after the last round, so that the final read will
45 | 	// not pass beyond the bounds of the input buffer:
46 | 	size_t rounds = (*slen - 4) / 24;
47 | 
48 | 	*slen -= rounds * 24;   // 24 bytes consumed per round
49 | 	*olen += rounds * 32;   // 32 bytes produced per round
50 | 
51 | 	// The first loop iteration requires special handling to ensure that
52 | 	// the read, which is done at an offset, does not underflow the buffer:
53 | 	enc_loop_avx2_inner_first(s, o);
54 | 	rounds--;
55 | 
56 | 	while (rounds > 0) {
57 | 		if (rounds >= 8) {
58 | 			enc_loop_avx2_inner(s, o);
59 | 			enc_loop_avx2_inner(s, o);
60 | 			enc_loop_avx2_inner(s, o);
61 | 			enc_loop_avx2_inner(s, o);
62 | 			enc_loop_avx2_inner(s, o);
63 | 			enc_loop_avx2_inner(s, o);
64 | 			enc_loop_avx2_inner(s, o);
65 | 			enc_loop_avx2_inner(s, o);
66 | 			rounds -= 8;
67 | 			continue;
68 | 		}
69 | 		if (rounds >= 4) {
70 | 			enc_loop_avx2_inner(s, o);
71 | 			enc_loop_avx2_inner(s, o);
72 | 			enc_loop_avx2_inner(s, o);
73 | 			enc_loop_avx2_inner(s, o);
74 | 			rounds -= 4;
75 | 			continue;
76 | 		}
77 | 		if (rounds >= 2) {
78 | 			enc_loop_avx2_inner(s, o);
79 | 			enc_loop_avx2_inner(s, o);
80 | 			rounds -= 2;
81 | 			continue;
82 | 		}
83 | 		enc_loop_avx2_inner(s, o);
84 | 		break;
85 | 	}
86 | 
87 | 	// Add the offset back:
88 | 	*s += 4;
89 | }
90 | 


--------------------------------------------------------------------------------
/lib/arch/avx2/enc_loop_asm.c:
--------------------------------------------------------------------------------
  1 | // Apologies in advance for combining the preprocessor with inline assembly,
  2 | // two notoriously gnarly parts of C, but it was necessary to avoid a lot of
  3 | // code repetition. The preprocessor is used to template large sections of
  4 | // inline assembly that differ only in the registers used. If the code was
  5 | // written out by hand, it would become very large and hard to audit.
  6 | 
  7 | // Generate a block of inline assembly that loads register R0 from memory. The
  8 | // offset at which the register is loaded is set by the given round and a
  9 | // constant offset.
 10 | #define LOAD(R0, ROUND, OFFSET) \
 11 | 	"vlddqu ("#ROUND" * 24 + "#OFFSET")(%[src]), %["R0"] \n\t"
 12 | 
 13 | // Generate a block of inline assembly that deinterleaves and shuffles register
 14 | // R0 using preloaded constants. Outputs in R0 and R1.
 15 | #define SHUF(R0, R1, R2) \
 16 | 	"vpshufb  %[lut0], %["R0"], %["R1"] \n\t" \
 17 | 	"vpand    %["R1"], %[msk0], %["R2"] \n\t" \
 18 | 	"vpand    %["R1"], %[msk2], %["R1"] \n\t" \
 19 | 	"vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
 20 | 	"vpmullw  %["R1"], %[msk3], %["R1"] \n\t" \
 21 | 	"vpor     %["R1"], %["R2"], %["R1"] \n\t"
 22 | 
 23 | // Generate a block of inline assembly that takes R0 and R1 and translates
 24 | // their contents to the base64 alphabet, using preloaded constants.
 25 | #define TRAN(R0, R1, R2) \
 26 | 	"vpsubusb %[n51],  %["R1"], %["R0"] \n\t" \
 27 | 	"vpcmpgtb %[n25],  %["R1"], %["R2"] \n\t" \
 28 | 	"vpsubb   %["R2"], %["R0"], %["R0"] \n\t" \
 29 | 	"vpshufb  %["R0"], %[lut1], %["R2"] \n\t" \
 30 | 	"vpaddb   %["R1"], %["R2"], %["R0"] \n\t"
 31 | 
 32 | // Generate a block of inline assembly that stores the given register R0 at an
 33 | // offset set by the given round.
 34 | #define STOR(R0, ROUND) \
 35 | 	"vmovdqu %["R0"], ("#ROUND" * 32)(%[dst]) \n\t"
 36 | 
 37 | // Generate a block of inline assembly that generates a single self-contained
 38 | // encoder round: fetch the data, process it, and store the result. Then update
 39 | // the source and destination pointers.
 40 | #define ROUND() \
 41 | 	LOAD("a", 0, -4) \
 42 | 	SHUF("a", "b", "c") \
 43 | 	TRAN("a", "b", "c") \
 44 | 	STOR("a", 0) \
 45 | 	"add $24, %[src] \n\t" \
 46 | 	"add $32, %[dst] \n\t"
 47 | 
 48 | // Define a macro that initiates a three-way interleaved encoding round by
 49 | // preloading registers a, b and c from memory.
 50 | // The register graph shows which registers are in use during each step, and
 51 | // is a visual aid for choosing registers for that step. Symbol index:
 52 | //
 53 | //  +  indicates that a register is loaded by that step.
 54 | //  |  indicates that a register is in use and must not be touched.
 55 | //  -  indicates that a register is decommissioned by that step.
 56 | //  x  indicates that a register is used as a temporary by that step.
 57 | //  V  indicates that a register is an input or output to the macro.
 58 | //
 59 | #define ROUND_3_INIT() 			/*  a b c d e f  */ \
 60 | 	LOAD("a",   0,  -4)		/*  +            */ \
 61 | 	SHUF("a", "d", "e")		/*  |     + x    */ \
 62 | 	LOAD("b",   1,  -4)		/*  | +   |      */ \
 63 | 	TRAN("a", "d", "e")		/*  | |   - x    */ \
 64 | 	LOAD("c",   2,  -4)		/*  V V V        */
 65 | 
 66 | // Define a macro that translates, shuffles and stores the input registers A, B
 67 | // and C, and preloads registers D, E and F for the next round.
 68 | // This macro can be arbitrarily daisy-chained by feeding output registers D, E
 69 | // and F back into the next round as input registers A, B and C. The macro
 70 | // carefully interleaves memory operations with data operations for optimal
 71 | // pipelined performance.
 72 | 
 73 | #define ROUND_3(ROUND, A,B,C,D,E,F) 	/*  A B C D E F  */ \
 74 | 	LOAD(D, (ROUND + 3), -4)	/*  V V V +      */ \
 75 | 	SHUF(B, E, F)			/*  | | | | + x  */ \
 76 | 	STOR(A, (ROUND + 0))		/*  - | | | |    */ \
 77 | 	TRAN(B, E, F)			/*    | | | - x  */ \
 78 | 	LOAD(E, (ROUND + 4), -4)	/*    | | | +    */ \
 79 | 	SHUF(C, A, F)			/*  + | | | | x  */ \
 80 | 	STOR(B, (ROUND + 1))		/*  | - | | |    */ \
 81 | 	TRAN(C, A, F)			/*  -   | | | x  */ \
 82 | 	LOAD(F, (ROUND + 5), -4)	/*      | | | +  */ \
 83 | 	SHUF(D, A, B)			/*  + x | | | |  */ \
 84 | 	STOR(C, (ROUND + 2))		/*  |   - | | |  */ \
 85 | 	TRAN(D, A, B)			/*  - x   V V V  */
 86 | 
 87 | // Define a macro that terminates a ROUND_3 macro by taking pre-loaded
 88 | // registers D, E and F, and translating, shuffling and storing them.
 89 | #define ROUND_3_END(ROUND, A,B,C,D,E,F)	/*  A B C D E F  */ \
 90 | 	SHUF(E, A, B)			/*  + x   V V V  */ \
 91 | 	STOR(D, (ROUND + 3))		/*  |     - | |  */ \
 92 | 	TRAN(E, A, B)			/*  - x     | |  */ \
 93 | 	SHUF(F, C, D)			/*      + x | |  */ \
 94 | 	STOR(E, (ROUND + 4))		/*      |   - |  */ \
 95 | 	TRAN(F, C, D)			/*      - x   |  */ \
 96 | 	STOR(F, (ROUND + 5))		/*            -  */
 97 | 
 98 | // Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
 99 | #define ROUND_3_A(ROUND) \
100 | 	ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
101 | 
102 | // Define a type B round. Inputs and outputs are swapped with regard to type A.
103 | #define ROUND_3_B(ROUND) \
104 | 	ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
105 | 
106 | // Terminating macro for a type A round.
107 | #define ROUND_3_A_LAST(ROUND) \
108 | 	ROUND_3_A(ROUND) \
109 | 	ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
110 | 
111 | // Terminating macro for a type B round.
112 | #define ROUND_3_B_LAST(ROUND) \
113 | 	ROUND_3_B(ROUND) \
114 | 	ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
115 | 
116 | // Suppress clang's warning that the literal string in the asm statement is
117 | // overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
118 | // compilers). It may be true, but the goal here is not C99 portability.
119 | #pragma GCC diagnostic push
120 | #pragma GCC diagnostic ignored "-Woverlength-strings"
121 | 
122 | static inline void
123 | enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
124 | {
125 | 	// For a clearer explanation of the algorithm used by this function,
126 | 	// please refer to the plain (not inline assembly) implementation. This
127 | 	// function follows the same basic logic.
128 | 
129 | 	if (*slen < 32) {
130 | 		return;
131 | 	}
132 | 
133 | 	// Process blocks of 24 bytes at a time. Because blocks are loaded 32
134 | 	// bytes at a time an offset of -4, ensure that there will be at least
135 | 	// 4 remaining bytes after the last round, so that the final read will
136 | 	// not pass beyond the bounds of the input buffer.
137 | 	size_t rounds = (*slen - 4) / 24;
138 | 
139 | 	*slen -= rounds * 24;   // 24 bytes consumed per round
140 | 	*olen += rounds * 32;   // 32 bytes produced per round
141 | 
142 | 	// Pre-decrement the number of rounds to get the number of rounds
143 | 	// *after* the first round, which is handled as a special case.
144 | 	rounds--;
145 | 
146 | 	// Number of times to go through the 36x loop.
147 | 	size_t loops = rounds / 36;
148 | 
149 | 	// Number of rounds remaining after the 36x loop.
150 | 	rounds %= 36;
151 | 
152 | 	// Lookup tables.
153 | 	const __m256i lut0 = _mm256_set_epi8(
154 | 		10, 11,  9, 10,  7,  8,  6,  7,  4,  5,  3,  4,  1,  2,  0,  1,
155 | 		14, 15, 13, 14, 11, 12, 10, 11,  8,  9,  7,  8,  5,  6,  4,  5);
156 | 
157 | 	const __m256i lut1 = _mm256_setr_epi8(
158 | 		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
159 | 		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
160 | 
161 | 	// Temporary registers.
162 | 	__m256i a, b, c, d, e;
163 | 
164 | 	// Temporary register f doubles as the shift mask for the first round.
165 | 	__m256i f = _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6);
166 | 
167 | 	__asm__ volatile (
168 | 
169 | 		// The first loop iteration requires special handling to ensure
170 | 		// that the read, which is normally done at an offset of -4,
171 | 		// does not underflow the buffer. Load the buffer at an offset
172 | 		// of 0 and permute the input to achieve the same effect.
173 | 		LOAD("a", 0, 0)
174 | 		"vpermd %[a], %[f], %[a] \n\t"
175 | 
176 | 		// Perform the standard shuffling and translation steps.
177 | 		SHUF("a", "b", "c")
178 | 		TRAN("a", "b", "c")
179 | 
180 | 		// Store the result and increment the source and dest pointers.
181 | 		"vmovdqu %[a], (%[dst]) \n\t"
182 | 		"add     $24,  %[src]   \n\t"
183 | 		"add     $32,  %[dst]   \n\t"
184 | 
185 | 		// If there are 36 rounds or more, enter a 36x unrolled loop of
186 | 		// interleaved encoding rounds. The rounds interleave memory
187 | 		// operations (load/store) with data operations (table lookups,
188 | 		// etc) to maximize pipeline throughput.
189 | 		"    test %[loops], %[loops] \n\t"
190 | 		"    jz   18f                \n\t"
191 | 		"    jmp  36f                \n\t"
192 | 		"                            \n\t"
193 | 		".balign 64                  \n\t"
194 | 		"36: " ROUND_3_INIT()
195 | 		"    " ROUND_3_A( 0)
196 | 		"    " ROUND_3_B( 3)
197 | 		"    " ROUND_3_A( 6)
198 | 		"    " ROUND_3_B( 9)
199 | 		"    " ROUND_3_A(12)
200 | 		"    " ROUND_3_B(15)
201 | 		"    " ROUND_3_A(18)
202 | 		"    " ROUND_3_B(21)
203 | 		"    " ROUND_3_A(24)
204 | 		"    " ROUND_3_B(27)
205 | 		"    " ROUND_3_A_LAST(30)
206 | 		"    add $(24 * 36), %[src] \n\t"
207 | 		"    add $(32 * 36), %[dst] \n\t"
208 | 		"    dec %[loops]           \n\t"
209 | 		"    jnz 36b                \n\t"
210 | 
211 | 		// Enter an 18x unrolled loop for rounds of 18 or more.
212 | 		"18: cmp $18, %[rounds] \n\t"
213 | 		"    jl  9f             \n\t"
214 | 		"    " ROUND_3_INIT()
215 | 		"    " ROUND_3_A(0)
216 | 		"    " ROUND_3_B(3)
217 | 		"    " ROUND_3_A(6)
218 | 		"    " ROUND_3_B(9)
219 | 		"    " ROUND_3_A_LAST(12)
220 | 		"    sub $18,        %[rounds] \n\t"
221 | 		"    add $(24 * 18), %[src]    \n\t"
222 | 		"    add $(32 * 18), %[dst]    \n\t"
223 | 
224 | 		// Enter a 9x unrolled loop for rounds of 9 or more.
225 | 		"9:  cmp $9, %[rounds] \n\t"
226 | 		"    jl  6f            \n\t"
227 | 		"    " ROUND_3_INIT()
228 | 		"    " ROUND_3_A(0)
229 | 		"    " ROUND_3_B_LAST(3)
230 | 		"    sub $9,        %[rounds] \n\t"
231 | 		"    add $(24 * 9), %[src]    \n\t"
232 | 		"    add $(32 * 9), %[dst]    \n\t"
233 | 
234 | 		// Enter a 6x unrolled loop for rounds of 6 or more.
235 | 		"6:  cmp $6, %[rounds] \n\t"
236 | 		"    jl  55f           \n\t"
237 | 		"    " ROUND_3_INIT()
238 | 		"    " ROUND_3_A_LAST(0)
239 | 		"    sub $6,        %[rounds] \n\t"
240 | 		"    add $(24 * 6), %[src]    \n\t"
241 | 		"    add $(32 * 6), %[dst]    \n\t"
242 | 
243 | 		// Dispatch the remaining rounds 0..5.
244 | 		"55: cmp $3, %[rounds] \n\t"
245 | 		"    jg  45f           \n\t"
246 | 		"    je  3f            \n\t"
247 | 		"    cmp $1, %[rounds] \n\t"
248 | 		"    jg  2f            \n\t"
249 | 		"    je  1f            \n\t"
250 | 		"    jmp 0f            \n\t"
251 | 
252 | 		"45: cmp $4, %[rounds] \n\t"
253 | 		"    je  4f            \n\t"
254 | 
255 | 		// Block of non-interlaced encoding rounds, which can each
256 | 		// individually be jumped to. Rounds fall through to the next.
257 | 		"5: " ROUND()
258 | 		"4: " ROUND()
259 | 		"3: " ROUND()
260 | 		"2: " ROUND()
261 | 		"1: " ROUND()
262 | 		"0: \n\t"
263 | 
264 | 		// Outputs (modified).
265 | 		: [rounds] "+r"  (rounds),
266 | 		  [loops]  "+r"  (loops),
267 | 		  [src]    "+r"  (*s),
268 | 		  [dst]    "+r"  (*o),
269 | 		  [a]      "=&x" (a),
270 | 		  [b]      "=&x" (b),
271 | 		  [c]      "=&x" (c),
272 | 		  [d]      "=&x" (d),
273 | 		  [e]      "=&x" (e),
274 | 		  [f]      "+x"  (f)
275 | 
276 | 		// Inputs (not modified).
277 | 		: [lut0] "x" (lut0),
278 | 		  [lut1] "x" (lut1),
279 | 		  [msk0] "x" (_mm256_set1_epi32(0x0FC0FC00)),
280 | 		  [msk1] "x" (_mm256_set1_epi32(0x04000040)),
281 | 		  [msk2] "x" (_mm256_set1_epi32(0x003F03F0)),
282 | 		  [msk3] "x" (_mm256_set1_epi32(0x01000010)),
283 | 		  [n51]  "x" (_mm256_set1_epi8(51)),
284 | 		  [n25]  "x" (_mm256_set1_epi8(25))
285 | 
286 | 		// Clobbers.
287 | 		: "cc", "memory"
288 | 	);
289 | }
290 | 
291 | #pragma GCC diagnostic pop
292 | 


--------------------------------------------------------------------------------
/lib/arch/avx2/enc_reshuffle.c:
--------------------------------------------------------------------------------
 1 | static BASE64_FORCE_INLINE __m256i
 2 | enc_reshuffle (const __m256i input)
 3 | {
 4 | 	// Translation of the SSSE3 reshuffling algorithm to AVX2. This one
 5 | 	// works with shifted (4 bytes) input in order to be able to work
 6 | 	// efficiently in the two 128-bit lanes.
 7 | 
 8 | 	// Input, bytes MSB to LSB:
 9 | 	// 0 0 0 0 x w v u t s r q p o n m
10 | 	// l k j i h g f e d c b a 0 0 0 0
11 | 
12 | 	const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8(
13 | 		10, 11,  9, 10,
14 | 		 7,  8,  6,  7,
15 | 		 4,  5,  3,  4,
16 | 		 1,  2,  0,  1,
17 | 
18 | 		14, 15, 13, 14,
19 | 		11, 12, 10, 11,
20 | 		 8,  9,  7,  8,
21 | 		 5,  6,  4,  5));
22 | 	// in, bytes MSB to LSB:
23 | 	// w x v w
24 | 	// t u s t
25 | 	// q r p q
26 | 	// n o m n
27 | 	// k l j k
28 | 	// h i g h
29 | 	// e f d e
30 | 	// b c a b
31 | 
32 | 	const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0FC0FC00));
33 | 	// bits, upper case are most significant bits, lower case are least
34 | 	// significant bits.
35 | 	// 0000wwww XX000000 VVVVVV00 00000000
36 | 	// 0000tttt UU000000 SSSSSS00 00000000
37 | 	// 0000qqqq RR000000 PPPPPP00 00000000
38 | 	// 0000nnnn OO000000 MMMMMM00 00000000
39 | 	// 0000kkkk LL000000 JJJJJJ00 00000000
40 | 	// 0000hhhh II000000 GGGGGG00 00000000
41 | 	// 0000eeee FF000000 DDDDDD00 00000000
42 | 	// 0000bbbb CC000000 AAAAAA00 00000000
43 | 
44 | 	const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
45 | 	// 00000000 00wwwwXX 00000000 00VVVVVV
46 | 	// 00000000 00ttttUU 00000000 00SSSSSS
47 | 	// 00000000 00qqqqRR 00000000 00PPPPPP
48 | 	// 00000000 00nnnnOO 00000000 00MMMMMM
49 | 	// 00000000 00kkkkLL 00000000 00JJJJJJ
50 | 	// 00000000 00hhhhII 00000000 00GGGGGG
51 | 	// 00000000 00eeeeFF 00000000 00DDDDDD
52 | 	// 00000000 00bbbbCC 00000000 00AAAAAA
53 | 
54 | 	const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003F03F0));
55 | 	// 00000000 00xxxxxx 000000vv WWWW0000
56 | 	// 00000000 00uuuuuu 000000ss TTTT0000
57 | 	// 00000000 00rrrrrr 000000pp QQQQ0000
58 | 	// 00000000 00oooooo 000000mm NNNN0000
59 | 	// 00000000 00llllll 000000jj KKKK0000
60 | 	// 00000000 00iiiiii 000000gg HHHH0000
61 | 	// 00000000 00ffffff 000000dd EEEE0000
62 | 	// 00000000 00cccccc 000000aa BBBB0000
63 | 
64 | 	const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
65 | 	// 00xxxxxx 00000000 00vvWWWW 00000000
66 | 	// 00uuuuuu 00000000 00ssTTTT 00000000
67 | 	// 00rrrrrr 00000000 00ppQQQQ 00000000
68 | 	// 00oooooo 00000000 00mmNNNN 00000000
69 | 	// 00llllll 00000000 00jjKKKK 00000000
70 | 	// 00iiiiii 00000000 00ggHHHH 00000000
71 | 	// 00ffffff 00000000 00ddEEEE 00000000
72 | 	// 00cccccc 00000000 00aaBBBB 00000000
73 | 
74 | 	return _mm256_or_si256(t1, t3);
75 | 	// 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
76 | 	// 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
77 | 	// 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
78 | 	// 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
79 | 	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
80 | 	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
81 | 	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
82 | 	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
83 | }
84 | 


--------------------------------------------------------------------------------
/lib/arch/avx2/enc_translate.c:
--------------------------------------------------------------------------------
 1 | static BASE64_FORCE_INLINE __m256i
 2 | enc_translate (const __m256i in)
 3 | {
 4 | 	// A lookup table containing the absolute offsets for all ranges:
 5 | 	const __m256i lut = _mm256_setr_epi8(
 6 | 		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
 7 | 		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
 8 | 
 9 | 	// Translate values 0..63 to the Base64 alphabet. There are five sets:
10 | 	// #  From      To         Abs    Index  Characters
11 | 	// 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
12 | 	// 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
13 | 	// 2  [52..61]  [48..57]    -4  [2..11]  0123456789
14 | 	// 3  [62]      [43]       -19       12  +
15 | 	// 4  [63]      [47]       -16       13  /
16 | 
17 | 	// Create LUT indices from the input. The index for range #0 is right,
18 | 	// others are 1 less than expected:
19 | 	__m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));
20 | 
21 | 	// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
22 | 	const __m256i mask = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));
23 | 
24 | 	// Subtract -1, so add 1 to indices for range #[1..4]. All indices are
25 | 	// now correct:
26 | 	indices = _mm256_sub_epi8(indices, mask);
27 | 
28 | 	// Add offsets to input values:
29 | 	return _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));
30 | }
31 | 


--------------------------------------------------------------------------------
/lib/arch/avx512/codec.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stddef.h>
 3 | #include <stdlib.h>
 4 | 
 5 | #include "../../../include/libbase64.h"
 6 | #include "../../tables/tables.h"
 7 | #include "../../codecs.h"
 8 | #include "config.h"
 9 | #include "../../env.h"
10 | 
11 | #if HAVE_AVX512
12 | #include <immintrin.h>
13 | 
14 | #include "../avx2/dec_reshuffle.c"
15 | #include "../avx2/dec_loop.c"
16 | #include "enc_reshuffle_translate.c"
17 | #include "enc_loop.c"
18 | 
19 | #endif	// HAVE_AVX512
20 | 
21 | void
22 | base64_stream_encode_avx512 BASE64_ENC_PARAMS
23 | {
24 | #if HAVE_AVX512
25 | 	#include "../generic/enc_head.c"
26 | 	enc_loop_avx512(&s, &slen, &o, &olen);
27 | 	#include "../generic/enc_tail.c"
28 | #else
29 | 	base64_enc_stub(state, src, srclen, out, outlen);
30 | #endif
31 | }
32 | 
33 | // Reuse AVX2 decoding. Not supporting AVX512 at present
34 | int
35 | base64_stream_decode_avx512 BASE64_DEC_PARAMS
36 | {
37 | #if HAVE_AVX512
38 | 	#include "../generic/dec_head.c"
39 | 	dec_loop_avx2(&s, &slen, &o, &olen);
40 | 	#include "../generic/dec_tail.c"
41 | #else
42 | 	return base64_dec_stub(state, src, srclen, out, outlen);
43 | #endif
44 | }
45 | 


--------------------------------------------------------------------------------
/lib/arch/avx512/enc_loop.c:
--------------------------------------------------------------------------------
 1 | static BASE64_FORCE_INLINE void
 2 | enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
 3 | {
 4 | 	// Load input.
 5 | 	__m512i src = _mm512_loadu_si512((__m512i *) *s);
 6 | 
 7 | 	// Reshuffle, translate, store.
 8 | 	src = enc_reshuffle_translate(src);
 9 | 	_mm512_storeu_si512((__m512i *) *o, src);
10 | 
11 | 	*s += 48;
12 | 	*o += 64;
13 | }
14 | 
15 | static inline void
16 | enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
17 | {
18 | 	if (*slen < 64) {
19 | 		return;
20 | 	}
21 | 
22 | 	// Process blocks of 48 bytes at a time. Because blocks are loaded 64
23 | 	// bytes at a time, ensure that there will be at least 24 remaining
24 | 	// bytes after the last round, so that the final read will not pass
25 | 	// beyond the bounds of the input buffer.
26 | 	size_t rounds = (*slen - 24) / 48;
27 | 
28 | 	*slen -= rounds * 48;   // 48 bytes consumed per round
29 | 	*olen += rounds * 64;   // 64 bytes produced per round
30 | 
31 | 	while (rounds > 0) {
32 | 		if (rounds >= 8) {
33 | 			enc_loop_avx512_inner(s, o);
34 | 			enc_loop_avx512_inner(s, o);
35 | 			enc_loop_avx512_inner(s, o);
36 | 			enc_loop_avx512_inner(s, o);
37 | 			enc_loop_avx512_inner(s, o);
38 | 			enc_loop_avx512_inner(s, o);
39 | 			enc_loop_avx512_inner(s, o);
40 | 			enc_loop_avx512_inner(s, o);
41 | 			rounds -= 8;
42 | 			continue;
43 | 		}
44 | 		if (rounds >= 4) {
45 | 			enc_loop_avx512_inner(s, o);
46 | 			enc_loop_avx512_inner(s, o);
47 | 			enc_loop_avx512_inner(s, o);
48 | 			enc_loop_avx512_inner(s, o);
49 | 			rounds -= 4;
50 | 			continue;
51 | 		}
52 | 		if (rounds >= 2) {
53 | 			enc_loop_avx512_inner(s, o);
54 | 			enc_loop_avx512_inner(s, o);
55 | 			rounds -= 2;
56 | 			continue;
57 | 		}
58 | 		enc_loop_avx512_inner(s, o);
59 | 		break;
60 | 	}
61 | }
62 | 


--------------------------------------------------------------------------------
/lib/arch/avx512/enc_reshuffle_translate.c:
--------------------------------------------------------------------------------
 1 | // AVX512 algorithm is based on permutevar and multishift. The code is based on
 2 | // https://github.com/WojciechMula/base64simd which is under BSD-2 license.
 3 | 
 4 | static BASE64_FORCE_INLINE __m512i
 5 | enc_reshuffle_translate (const __m512i input)
 6 | {
 7 | 	// 32-bit input
 8 | 	// [ 0  0  0  0  0  0  0  0|c1 c0 d5 d4 d3 d2 d1 d0|
 9 | 	//  b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
10 | 	// output order  [1, 2, 0, 1]
11 | 	// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
12 | 	//  a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
13 | 
14 | 	const __m512i shuffle_input = _mm512_setr_epi32(0x01020001,
15 | 	                                                0x04050304,
16 | 	                                                0x07080607,
17 | 	                                                0x0a0b090a,
18 | 	                                                0x0d0e0c0d,
19 | 	                                                0x10110f10,
20 | 	                                                0x13141213,
21 | 	                                                0x16171516,
22 | 	                                                0x191a1819,
23 | 	                                                0x1c1d1b1c,
24 | 	                                                0x1f201e1f,
25 | 	                                                0x22232122,
26 | 	                                                0x25262425,
27 | 	                                                0x28292728,
28 | 	                                                0x2b2c2a2b,
29 | 	                                                0x2e2f2d2e);
30 | 
31 | 	// Reorder bytes
32 | 	// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
33 | 	//  a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c3 c2 c1 c0]
34 | 	const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input);
35 | 
36 | 	// After multishift a single 32-bit lane has following layout
37 | 	// [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|
38 | 	//  a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
39 | 	// (a = [10:17], b = [4:11], c = [22:27], d = [16:21])
40 | 
41 | 	// 48, 54, 36, 42, 16, 22, 4, 10
42 | 	const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu);
43 | 	__m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in);
44 | 
45 | 	// Translate immediatedly after reshuffled.
46 | 	const __m512i lookup = _mm512_loadu_si512(base64_table_enc_6bit);
47 | 
48 | 	// Translation 6-bit values to ASCII.
49 | 	return _mm512_permutexvar_epi8(shuffled_in, lookup);
50 | }
51 | 


--------------------------------------------------------------------------------
/lib/arch/generic/32/dec_loop.c:
--------------------------------------------------------------------------------
 1 | static BASE64_FORCE_INLINE int
 2 | dec_loop_generic_32_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
 3 | {
 4 | 	const uint32_t str
 5 | 		= base64_table_dec_32bit_d0[(*s)[0]]
 6 | 		| base64_table_dec_32bit_d1[(*s)[1]]
 7 | 		| base64_table_dec_32bit_d2[(*s)[2]]
 8 | 		| base64_table_dec_32bit_d3[(*s)[3]];
 9 | 
10 | #if BASE64_LITTLE_ENDIAN
11 | 
12 | 	// LUTs for little-endian set MSB in case of invalid character:
13 | 	if (str & UINT32_C(0x80000000)) {
14 | 		return 0;
15 | 	}
16 | #else
17 | 	// LUTs for big-endian set LSB in case of invalid character:
18 | 	if (str & UINT32_C(1)) {
19 | 		return 0;
20 | 	}
21 | #endif
22 | 	// Store the output:
23 | 	memcpy(*o, &str, sizeof (str));
24 | 
25 | 	*s += 4;
26 | 	*o += 3;
27 | 	*rounds -= 1;
28 | 
29 | 	return 1;
30 | }
31 | 
32 | static inline void
33 | dec_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
34 | {
35 | 	if (*slen < 8) {
36 | 		return;
37 | 	}
38 | 
39 | 	// Process blocks of 4 bytes per round. Because one extra zero byte is
40 | 	// written after the output, ensure that there will be at least 4 bytes
41 | 	// of input data left to cover the gap. (Two data bytes and up to two
42 | 	// end-of-string markers.)
43 | 	size_t rounds = (*slen - 4) / 4;
44 | 
45 | 	*slen -= rounds * 4;	// 4 bytes consumed per round
46 | 	*olen += rounds * 3;	// 3 bytes produced per round
47 | 
48 | 	do {
49 | 		if (rounds >= 8) {
50 | 			if (dec_loop_generic_32_inner(s, o, &rounds) &&
51 | 			    dec_loop_generic_32_inner(s, o, &rounds) &&
52 | 			    dec_loop_generic_32_inner(s, o, &rounds) &&
53 | 			    dec_loop_generic_32_inner(s, o, &rounds) &&
54 | 			    dec_loop_generic_32_inner(s, o, &rounds) &&
55 | 			    dec_loop_generic_32_inner(s, o, &rounds) &&
56 | 			    dec_loop_generic_32_inner(s, o, &rounds) &&
57 | 			    dec_loop_generic_32_inner(s, o, &rounds)) {
58 | 				continue;
59 | 			}
60 | 			break;
61 | 		}
62 | 		if (rounds >= 4) {
63 | 			if (dec_loop_generic_32_inner(s, o, &rounds) &&
64 | 			    dec_loop_generic_32_inner(s, o, &rounds) &&
65 | 			    dec_loop_generic_32_inner(s, o, &rounds) &&
66 | 			    dec_loop_generic_32_inner(s, o, &rounds)) {
67 | 				continue;
68 | 			}
69 | 			break;
70 | 		}
71 | 		if (rounds >= 2) {
72 | 			if (dec_loop_generic_32_inner(s, o, &rounds) &&
73 | 			    dec_loop_generic_32_inner(s, o, &rounds)) {
74 | 				continue;
75 | 			}
76 | 			break;
77 | 		}
78 | 		dec_loop_generic_32_inner(s, o, &rounds);
79 | 		break;
80 | 
81 | 	} while (rounds > 0);
82 | 
83 | 	// Adjust for any rounds that were skipped:
84 | 	*slen += rounds * 4;
85 | 	*olen -= rounds * 3;
86 | }
87 | 


--------------------------------------------------------------------------------
/lib/arch/generic/32/enc_loop.c:
--------------------------------------------------------------------------------
 1 | static BASE64_FORCE_INLINE void
 2 | enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
 3 | {
 4 | 	uint32_t src;
 5 | 
 6 | 	// Load input:
 7 | 	memcpy(&src, *s, sizeof (src));
 8 | 
 9 | 	// Reorder to 32-bit big-endian, if not already in that format. The
10 | 	// workset must be in big-endian, otherwise the shifted bits do not
11 | 	// carry over properly among adjacent bytes:
12 | 	src = BASE64_HTOBE32(src);
13 | 
14 | 	// Two indices for the 12-bit lookup table:
15 | 	const size_t index0 = (src >> 20) & 0xFFFU;
16 | 	const size_t index1 = (src >>  8) & 0xFFFU;
17 | 
18 | 	// Table lookup and store:
19 | 	memcpy(*o + 0, base64_table_enc_12bit + index0, 2);
20 | 	memcpy(*o + 2, base64_table_enc_12bit + index1, 2);
21 | 
22 | 	*s += 3;
23 | 	*o += 4;
24 | }
25 | 
26 | static inline void
27 | enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
28 | {
29 | 	if (*slen < 4) {
30 | 		return;
31 | 	}
32 | 
33 | 	// Process blocks of 3 bytes at a time. Because blocks are loaded 4
34 | 	// bytes at a time, ensure that there will be at least one remaining
35 | 	// byte after the last round, so that the final read will not pass
36 | 	// beyond the bounds of the input buffer:
37 | 	size_t rounds = (*slen - 1) / 3;
38 | 
39 | 	*slen -= rounds * 3;	// 3 bytes consumed per round
40 | 	*olen += rounds * 4;	// 4 bytes produced per round
41 | 
42 | 	do {
43 | 		if (rounds >= 8) {
44 | 			enc_loop_generic_32_inner(s, o);
45 | 			enc_loop_generic_32_inner(s, o);
46 | 			enc_loop_generic_32_inner(s, o);
47 | 			enc_loop_generic_32_inner(s, o);
48 | 			enc_loop_generic_32_inner(s, o);
49 | 			enc_loop_generic_32_inner(s, o);
50 | 			enc_loop_generic_32_inner(s, o);
51 | 			enc_loop_generic_32_inner(s, o);
52 | 			rounds -= 8;
53 | 			continue;
54 | 		}
55 | 		if (rounds >= 4) {
56 | 			enc_loop_generic_32_inner(s, o);
57 | 			enc_loop_generic_32_inner(s, o);
58 | 			enc_loop_generic_32_inner(s, o);
59 | 			enc_loop_generic_32_inner(s, o);
60 | 			rounds -= 4;
61 | 			continue;
62 | 		}
63 | 		if (rounds >= 2) {
64 | 			enc_loop_generic_32_inner(s, o);
65 | 			enc_loop_generic_32_inner(s, o);
66 | 			rounds -= 2;
67 | 			continue;
68 | 		}
69 | 		enc_loop_generic_32_inner(s, o);
70 | 		break;
71 | 
72 | 	} while (rounds > 0);
73 | }
74 | 


--------------------------------------------------------------------------------
/lib/arch/generic/64/enc_loop.c:
--------------------------------------------------------------------------------
 1 | static BASE64_FORCE_INLINE void
 2 | enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
 3 | {
 4 | 	uint64_t src;
 5 | 
 6 | 	// Load input:
 7 | 	memcpy(&src, *s, sizeof (src));
 8 | 
 9 | 	// Reorder to 64-bit big-endian, if not already in that format. The
10 | 	// workset must be in big-endian, otherwise the shifted bits do not
11 | 	// carry over properly among adjacent bytes:
12 | 	src = BASE64_HTOBE64(src);
13 | 
14 | 	// Four indices for the 12-bit lookup table:
15 | 	const size_t index0 = (src >> 52) & 0xFFFU;
16 | 	const size_t index1 = (src >> 40) & 0xFFFU;
17 | 	const size_t index2 = (src >> 28) & 0xFFFU;
18 | 	const size_t index3 = (src >> 16) & 0xFFFU;
19 | 
20 | 	// Table lookup and store:
21 | 	memcpy(*o + 0, base64_table_enc_12bit + index0, 2);
22 | 	memcpy(*o + 2, base64_table_enc_12bit + index1, 2);
23 | 	memcpy(*o + 4, base64_table_enc_12bit + index2, 2);
24 | 	memcpy(*o + 6, base64_table_enc_12bit + index3, 2);
25 | 
26 | 	*s += 6;
27 | 	*o += 8;
28 | }
29 | 
30 | static inline void
31 | enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
32 | {
33 | 	if (*slen < 8) {
34 | 		return;
35 | 	}
36 | 
37 | 	// Process blocks of 6 bytes at a time. Because blocks are loaded 8
38 | 	// bytes at a time, ensure that there will be at least 2 remaining
39 | 	// bytes after the last round, so that the final read will not pass
40 | 	// beyond the bounds of the input buffer:
41 | 	size_t rounds = (*slen - 2) / 6;
42 | 
43 | 	*slen -= rounds * 6;	// 6 bytes consumed per round
44 | 	*olen += rounds * 8;	// 8 bytes produced per round
45 | 
46 | 	do {
47 | 		if (rounds >= 8) {
48 | 			enc_loop_generic_64_inner(s, o);
49 | 			enc_loop_generic_64_inner(s, o);
50 | 			enc_loop_generic_64_inner(s, o);
51 | 			enc_loop_generic_64_inner(s, o);
52 | 			enc_loop_generic_64_inner(s, o);
53 | 			enc_loop_generic_64_inner(s, o);
54 | 			enc_loop_generic_64_inner(s, o);
55 | 			enc_loop_generic_64_inner(s, o);
56 | 			rounds -= 8;
57 | 			continue;
58 | 		}
59 | 		if (rounds >= 4) {
60 | 			enc_loop_generic_64_inner(s, o);
61 | 			enc_loop_generic_64_inner(s, o);
62 | 			enc_loop_generic_64_inner(s, o);
63 | 			enc_loop_generic_64_inner(s, o);
64 | 			rounds -= 4;
65 | 			continue;
66 | 		}
67 | 		if (rounds >= 2) {
68 | 			enc_loop_generic_64_inner(s, o);
69 | 			enc_loop_generic_64_inner(s, o);
70 | 			rounds -= 2;
71 | 			continue;
72 | 		}
73 | 		enc_loop_generic_64_inner(s, o);
74 | 		break;
75 | 
76 | 	} while (rounds > 0);
77 | }
78 | 


--------------------------------------------------------------------------------
/lib/arch/generic/codec.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stddef.h>
 3 | #include <string.h>
 4 | 
 5 | #include "../../../include/libbase64.h"
 6 | #include "../../tables/tables.h"
 7 | #include "../../codecs.h"
 8 | #include "config.h"
 9 | #include "../../env.h"
10 | 
11 | #if BASE64_WORDSIZE == 32
12 | #  include "32/enc_loop.c"
13 | #elif BASE64_WORDSIZE == 64
14 | #  include "64/enc_loop.c"
15 | #endif
16 | 
17 | #if BASE64_WORDSIZE >= 32
18 | #  include "32/dec_loop.c"
19 | #endif
20 | 
21 | void
22 | base64_stream_encode_plain BASE64_ENC_PARAMS
23 | {
24 | 	#include "enc_head.c"
25 | #if BASE64_WORDSIZE == 32
26 | 	enc_loop_generic_32(&s, &slen, &o, &olen);
27 | #elif BASE64_WORDSIZE == 64
28 | 	enc_loop_generic_64(&s, &slen, &o, &olen);
29 | #endif
30 | 	#include "enc_tail.c"
31 | }
32 | 
33 | int
34 | base64_stream_decode_plain BASE64_DEC_PARAMS
35 | {
36 | 	#include "dec_head.c"
37 | #if BASE64_WORDSIZE >= 32
38 | 	dec_loop_generic_32(&s, &slen, &o, &olen);
39 | #endif
40 | 	#include "dec_tail.c"
41 | }
42 | 


--------------------------------------------------------------------------------
/lib/arch/generic/dec_head.c:
--------------------------------------------------------------------------------
 1 | int ret = 0;
 2 | const uint8_t *s = (const uint8_t *) src;
 3 | uint8_t *o = (uint8_t *) out;
 4 | uint8_t q;
 5 | 
 6 | // Use local temporaries to avoid cache thrashing:
 7 | size_t olen = 0;
 8 | size_t slen = srclen;
 9 | struct base64_state st;
10 | st.eof = state->eof;
11 | st.bytes = state->bytes;
12 | st.carry = state->carry;
13 | 
14 | // If we previously saw an EOF or an invalid character, bail out:
15 | if (st.eof) {
16 | 	*outlen = 0;
17 | 	ret = 0;
18 | 	// If there was a trailing '=' to check, check it:
19 | 	if (slen && (st.eof == BASE64_AEOF)) {
20 | 		state->bytes = 0;
21 | 		state->eof = BASE64_EOF;
22 | 		ret = ((base64_table_dec_8bit[*s++] == 254) && (slen == 1)) ? 1 : 0;
23 | 	}
24 | 	return ret;
25 | }
26 | 
27 | // Turn four 6-bit numbers into three bytes:
28 | // out[0] = 11111122
29 | // out[1] = 22223333
30 | // out[2] = 33444444
31 | 
32 | // Duff's device again:
33 | switch (st.bytes)
34 | {
35 | 	for (;;)
36 | 	{
37 | 	case 0:
38 | 


--------------------------------------------------------------------------------
/lib/arch/generic/dec_tail.c:
--------------------------------------------------------------------------------
 1 | 		if (slen-- == 0) {
 2 | 			ret = 1;
 3 | 			break;
 4 | 		}
 5 | 		if ((q = base64_table_dec_8bit[*s++]) >= 254) {
 6 | 			st.eof = BASE64_EOF;
 7 | 			// Treat character '=' as invalid for byte 0:
 8 | 			break;
 9 | 		}
10 | 		st.carry = q << 2;
11 | 		st.bytes++;
12 | 
13 | 		// Deliberate fallthrough:
14 | 		BASE64_FALLTHROUGH
15 | 
16 | 	case 1:	if (slen-- == 0) {
17 | 			ret = 1;
18 | 			break;
19 | 		}
20 | 		if ((q = base64_table_dec_8bit[*s++]) >= 254) {
21 | 			st.eof = BASE64_EOF;
22 | 			// Treat character '=' as invalid for byte 1:
23 | 			break;
24 | 		}
25 | 		*o++ = st.carry | (q >> 4);
26 | 		st.carry = q << 4;
27 | 		st.bytes++;
28 | 		olen++;
29 | 
30 | 		// Deliberate fallthrough:
31 | 		BASE64_FALLTHROUGH
32 | 
33 | 	case 2:	if (slen-- == 0) {
34 | 			ret = 1;
35 | 			break;
36 | 		}
37 | 		if ((q = base64_table_dec_8bit[*s++]) >= 254) {
38 | 			st.bytes++;
39 | 			// When q == 254, the input char is '='.
40 | 			// Check if next byte is also '=':
41 | 			if (q == 254) {
42 | 				if (slen-- != 0) {
43 | 					st.bytes = 0;
44 | 					// EOF:
45 | 					st.eof = BASE64_EOF;
46 | 					q = base64_table_dec_8bit[*s++];
47 | 					ret = ((q == 254) && (slen == 0)) ? 1 : 0;
48 | 					break;
49 | 				}
50 | 				else {
51 | 					// Almost EOF
52 | 					st.eof = BASE64_AEOF;
53 | 					ret = 1;
54 | 					break;
55 | 				}
56 | 			}
57 | 			// If we get here, there was an error:
58 | 			break;
59 | 		}
60 | 		*o++ = st.carry | (q >> 2);
61 | 		st.carry = q << 6;
62 | 		st.bytes++;
63 | 		olen++;
64 | 
65 | 		// Deliberate fallthrough:
66 | 		BASE64_FALLTHROUGH
67 | 
68 | 	case 3:	if (slen-- == 0) {
69 | 			ret = 1;
70 | 			break;
71 | 		}
72 | 		if ((q = base64_table_dec_8bit[*s++]) >= 254) {
73 | 			st.bytes = 0;
74 | 			st.eof = BASE64_EOF;
75 | 			// When q == 254, the input char is '='. Return 1 and EOF.
76 | 			// When q == 255, the input char is invalid. Return 0 and EOF.
77 | 			ret = ((q == 254) && (slen == 0)) ? 1 : 0;
78 | 			break;
79 | 		}
80 | 		*o++ = st.carry | q;
81 | 		st.carry = 0;
82 | 		st.bytes = 0;
83 | 		olen++;
84 | 	}
85 | }
86 | 
87 | state->eof = st.eof;
88 | state->bytes = st.bytes;
89 | state->carry = st.carry;
90 | *outlen = olen;
91 | return ret;
92 | 


--------------------------------------------------------------------------------
/lib/arch/generic/enc_head.c:
--------------------------------------------------------------------------------
 1 | // Assume that *out is large enough to contain the output.
 2 | // Theoretically it should be 4/3 the length of src.
 3 | const uint8_t *s = (const uint8_t *) src;
 4 | uint8_t *o = (uint8_t *) out;
 5 | 
 6 | // Use local temporaries to avoid cache thrashing:
 7 | size_t olen = 0;
 8 | size_t slen = srclen;
 9 | struct base64_state st;
10 | st.bytes = state->bytes;
11 | st.carry = state->carry;
12 | 
13 | // Turn three bytes into four 6-bit numbers:
14 | // in[0] = 00111111
15 | // in[1] = 00112222
16 | // in[2] = 00222233
17 | // in[3] = 00333333
18 | 
19 | // Duff's device, a for() loop inside a switch() statement. Legal!
20 | switch (st.bytes)
21 | {
22 | 	for (;;)
23 | 	{
24 | 	case 0:
25 | 


--------------------------------------------------------------------------------
/lib/arch/generic/enc_tail.c:
--------------------------------------------------------------------------------
 1 | 		if (slen-- == 0) {
 2 | 			break;
 3 | 		}
 4 | 		*o++ = base64_table_enc_6bit[*s >> 2];
 5 | 		st.carry = (*s++ << 4) & 0x30;
 6 | 		st.bytes++;
 7 | 		olen += 1;
 8 | 
 9 | 		// Deliberate fallthrough:
10 | 		BASE64_FALLTHROUGH
11 | 
12 | 	case 1:	if (slen-- == 0) {
13 | 			break;
14 | 		}
15 | 		*o++ = base64_table_enc_6bit[st.carry | (*s >> 4)];
16 | 		st.carry = (*s++ << 2) & 0x3C;
17 | 		st.bytes++;
18 | 		olen += 1;
19 | 
20 | 		// Deliberate fallthrough:
21 | 		BASE64_FALLTHROUGH
22 | 
23 | 	case 2:	if (slen-- == 0) {
24 | 			break;
25 | 		}
26 | 		*o++ = base64_table_enc_6bit[st.carry | (*s >> 6)];
27 | 		*o++ = base64_table_enc_6bit[*s++ & 0x3F];
28 | 		st.bytes = 0;
29 | 		olen += 2;
30 | 	}
31 | }
32 | state->bytes = st.bytes;
33 | state->carry = st.carry;
34 | *outlen = olen;
35 | 


--------------------------------------------------------------------------------
/lib/arch/neon32/codec.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stddef.h>
 3 | #include <string.h>
 4 | 
 5 | #include "../../../include/libbase64.h"
 6 | #include "../../tables/tables.h"
 7 | #include "../../codecs.h"
 8 | #include "config.h"
 9 | #include "../../env.h"
10 | 
11 | #ifdef __arm__
12 | #  if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32
13 | #    define BASE64_USE_NEON32
14 | #  endif
15 | #endif
16 | 
17 | #ifdef BASE64_USE_NEON32
18 | #include <arm_neon.h>
19 | 
20 | // Only enable inline assembly on supported compilers.
21 | #if defined(__GNUC__) || defined(__clang__)
22 | #define BASE64_NEON32_USE_ASM
23 | #endif
24 | 
25 | static BASE64_FORCE_INLINE uint8x16_t
26 | vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
27 | {
28 | 	// NEON32 only supports 64-bit wide lookups in 128-bit tables. Emulate
29 | 	// the NEON64 `vqtbl1q_u8` intrinsic to do 128-bit wide lookups.
30 | 	uint8x8x2_t lut2;
31 | 	uint8x8x2_t result;
32 | 
33 | 	lut2.val[0] = vget_low_u8(lut);
34 | 	lut2.val[1] = vget_high_u8(lut);
35 | 
36 | 	result.val[0] = vtbl2_u8(lut2, vget_low_u8(indices));
37 | 	result.val[1] = vtbl2_u8(lut2, vget_high_u8(indices));
38 | 
39 | 	return vcombine_u8(result.val[0], result.val[1]);
40 | }
41 | 
42 | #include "../generic/32/dec_loop.c"
43 | #include "../generic/32/enc_loop.c"
44 | #include "dec_loop.c"
45 | #include "enc_reshuffle.c"
46 | #include "enc_translate.c"
47 | #include "enc_loop.c"
48 | 
49 | #endif	// BASE64_USE_NEON32
50 | 
51 | // Stride size is so large on these NEON 32-bit functions
52 | // (48 bytes encode, 32 bytes decode) that we inline the
53 | // uint32 codec to stay performant on smaller inputs.
54 | 
55 | void
56 | base64_stream_encode_neon32 BASE64_ENC_PARAMS
57 | {
58 | #ifdef BASE64_USE_NEON32
59 | 	#include "../generic/enc_head.c"
60 | 	enc_loop_neon32(&s, &slen, &o, &olen);
61 | 	enc_loop_generic_32(&s, &slen, &o, &olen);
62 | 	#include "../generic/enc_tail.c"
63 | #else
64 | 	base64_enc_stub(state, src, srclen, out, outlen);
65 | #endif
66 | }
67 | 
68 | int
69 | base64_stream_decode_neon32 BASE64_DEC_PARAMS
70 | {
71 | #ifdef BASE64_USE_NEON32
72 | 	#include "../generic/dec_head.c"
73 | 	dec_loop_neon32(&s, &slen, &o, &olen);
74 | 	dec_loop_generic_32(&s, &slen, &o, &olen);
75 | 	#include "../generic/dec_tail.c"
76 | #else
77 | 	return base64_dec_stub(state, src, srclen, out, outlen);
78 | #endif
79 | }
80 | 


--------------------------------------------------------------------------------
/lib/arch/neon32/dec_loop.c:
--------------------------------------------------------------------------------
  1 | static BASE64_FORCE_INLINE int
  2 | is_nonzero (const uint8x16_t v)
  3 | {
  4 | 	uint64_t u64;
  5 | 	const uint64x2_t v64 = vreinterpretq_u64_u8(v);
  6 | 	const uint32x2_t v32 = vqmovn_u64(v64);
  7 | 
  8 | 	vst1_u64(&u64, vreinterpret_u64_u32(v32));
  9 | 	return u64 != 0;
 10 | }
 11 | 
 12 | static BASE64_FORCE_INLINE uint8x16_t
 13 | delta_lookup (const uint8x16_t v)
 14 | {
 15 | 	const uint8x8_t lut = {
 16 | 		0, 16, 19, 4, (uint8_t) -65, (uint8_t) -65, (uint8_t) -71, (uint8_t) -71,
 17 | 	};
 18 | 
 19 | 	return vcombine_u8(
 20 | 		vtbl1_u8(lut, vget_low_u8(v)),
 21 | 		vtbl1_u8(lut, vget_high_u8(v)));
 22 | }
 23 | 
 24 | static BASE64_FORCE_INLINE uint8x16_t
 25 | dec_loop_neon32_lane (uint8x16_t *lane)
 26 | {
 27 | 	// See the SSSE3 decoder for an explanation of the algorithm.
 28 | 	const uint8x16_t lut_lo = {
 29 | 		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
 30 | 		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
 31 | 	};
 32 | 
 33 | 	const uint8x16_t lut_hi = {
 34 | 		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
 35 | 		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
 36 | 	};
 37 | 
 38 | 	const uint8x16_t mask_0F = vdupq_n_u8(0x0F);
 39 | 	const uint8x16_t mask_2F = vdupq_n_u8(0x2F);
 40 | 
 41 | 	const uint8x16_t hi_nibbles = vshrq_n_u8(*lane, 4);
 42 | 	const uint8x16_t lo_nibbles = vandq_u8(*lane, mask_0F);
 43 | 	const uint8x16_t eq_2F      = vceqq_u8(*lane, mask_2F);
 44 | 
 45 | 	const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
 46 | 	const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);
 47 | 
 48 | 	// Now simply add the delta values to the input:
 49 | 	*lane = vaddq_u8(*lane, delta_lookup(vaddq_u8(eq_2F, hi_nibbles)));
 50 | 
 51 | 	// Return the validity mask:
 52 | 	return vandq_u8(lo, hi);
 53 | }
 54 | 
 55 | static inline void
 56 | dec_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 57 | {
 58 | 	if (*slen < 64) {
 59 | 		return;
 60 | 	}
 61 | 
 62 | 	// Process blocks of 64 bytes per round. Unlike the SSE codecs, no
 63 | 	// extra trailing zero bytes are written, so it is not necessary to
 64 | 	// reserve extra input bytes:
 65 | 	size_t rounds = *slen / 64;
 66 | 
 67 | 	*slen -= rounds * 64;	// 64 bytes consumed per round
 68 | 	*olen += rounds * 48;	// 48 bytes produced per round
 69 | 
 70 | 	do {
 71 | 		uint8x16x3_t dec;
 72 | 
 73 | 		// Load 64 bytes and deinterleave:
 74 | 		uint8x16x4_t str = vld4q_u8(*s);
 75 | 
 76 | 		// Decode each lane, collect a mask of invalid inputs:
 77 | 		const uint8x16_t classified
 78 | 			= dec_loop_neon32_lane(&str.val[0])
 79 | 			| dec_loop_neon32_lane(&str.val[1])
 80 | 			| dec_loop_neon32_lane(&str.val[2])
 81 | 			| dec_loop_neon32_lane(&str.val[3]);
 82 | 
 83 | 		// Check for invalid input: if any of the delta values are
 84 | 		// zero, fall back on bytewise code to do error checking and
 85 | 		// reporting:
 86 | 		if (is_nonzero(classified)) {
 87 | 			break;
 88 | 		}
 89 | 
 90 | 		// Compress four bytes into three:
 91 | 		dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
 92 | 		dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
 93 | 		dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
 94 | 
 95 | 		// Interleave and store decoded result:
 96 | 		vst3q_u8(*o, dec);
 97 | 
 98 | 		*s += 64;
 99 | 		*o += 48;
100 | 
101 | 	} while (--rounds > 0);
102 | 
103 | 	// Adjust for any rounds that were skipped:
104 | 	*slen += rounds * 64;
105 | 	*olen -= rounds * 48;
106 | }
107 | 


--------------------------------------------------------------------------------
/lib/arch/neon32/enc_loop.c:
--------------------------------------------------------------------------------
  1 | #ifdef BASE64_NEON32_USE_ASM
  2 | static BASE64_FORCE_INLINE void
  3 | enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
  4 | {
  5 | 	// This function duplicates the functionality of enc_loop_neon32_inner,
  6 | 	// but entirely with inline assembly. This gives a significant speedup
  7 | 	// over using NEON intrinsics, which do not always generate very good
  8 | 	// code. The logic of the assembly is directly lifted from the
  9 | 	// intrinsics version, so it can be used as a guide to this code.
 10 | 
 11 | 	// Temporary registers, used as scratch space.
 12 | 	uint8x16_t tmp0, tmp1, tmp2, tmp3;
 13 | 	uint8x16_t mask0, mask1, mask2, mask3;
 14 | 
 15 | 	// A lookup table containing the absolute offsets for all ranges.
 16 | 	const uint8x16_t lut = {
 17 | 		  65U,  71U, 252U, 252U,
 18 | 		 252U, 252U, 252U, 252U,
 19 | 		 252U, 252U, 252U, 252U,
 20 | 		 237U, 240U,   0U,   0U
 21 | 	};
 22 | 
 23 | 	// Numeric constants.
 24 | 	const uint8x16_t n51 = vdupq_n_u8(51);
 25 | 	const uint8x16_t n25 = vdupq_n_u8(25);
 26 | 	const uint8x16_t n63 = vdupq_n_u8(63);
 27 | 
 28 | 	__asm__ (
 29 | 
 30 | 		// Load 48 bytes and deinterleave. The bytes are loaded to
 31 | 		// hard-coded registers q12, q13 and q14, to ensure that they
 32 | 		// are contiguous. Increment the source pointer.
 33 | 		"vld3.8 {d24, d26, d28}, [%[src]]! \n\t"
 34 | 		"vld3.8 {d25, d27, d29}, [%[src]]! \n\t"
 35 | 
 36 | 		// Reshuffle the bytes using temporaries.
 37 | 		"vshr.u8 %q[t0], q12,    #2      \n\t"
 38 | 		"vshr.u8 %q[t1], q13,    #4      \n\t"
 39 | 		"vshr.u8 %q[t2], q14,    #6      \n\t"
 40 | 		"vsli.8  %q[t1], q12,    #4      \n\t"
 41 | 		"vsli.8  %q[t2], q13,    #2      \n\t"
 42 | 		"vand.u8 %q[t1], %q[t1], %q[n63] \n\t"
 43 | 		"vand.u8 %q[t2], %q[t2], %q[n63] \n\t"
 44 | 		"vand.u8 %q[t3], q14,    %q[n63] \n\t"
 45 | 
 46 | 		// t0..t3 are the reshuffled inputs. Create LUT indices.
 47 | 		"vqsub.u8 q12, %q[t0], %q[n51] \n\t"
 48 | 		"vqsub.u8 q13, %q[t1], %q[n51] \n\t"
 49 | 		"vqsub.u8 q14, %q[t2], %q[n51] \n\t"
 50 | 		"vqsub.u8 q15, %q[t3], %q[n51] \n\t"
 51 | 
 52 | 		// Create the mask for range #0.
 53 | 		"vcgt.u8 %q[m0], %q[t0], %q[n25] \n\t"
 54 | 		"vcgt.u8 %q[m1], %q[t1], %q[n25] \n\t"
 55 | 		"vcgt.u8 %q[m2], %q[t2], %q[n25] \n\t"
 56 | 		"vcgt.u8 %q[m3], %q[t3], %q[n25] \n\t"
 57 | 
 58 | 		// Subtract -1 to correct the LUT indices.
 59 | 		"vsub.u8 q12, %q[m0] \n\t"
 60 | 		"vsub.u8 q13, %q[m1] \n\t"
 61 | 		"vsub.u8 q14, %q[m2] \n\t"
 62 | 		"vsub.u8 q15, %q[m3] \n\t"
 63 | 
 64 | 		// Lookup the delta values.
 65 | 		"vtbl.u8 d24, {%q[lut]}, d24 \n\t"
 66 | 		"vtbl.u8 d25, {%q[lut]}, d25 \n\t"
 67 | 		"vtbl.u8 d26, {%q[lut]}, d26 \n\t"
 68 | 		"vtbl.u8 d27, {%q[lut]}, d27 \n\t"
 69 | 		"vtbl.u8 d28, {%q[lut]}, d28 \n\t"
 70 | 		"vtbl.u8 d29, {%q[lut]}, d29 \n\t"
 71 | 		"vtbl.u8 d30, {%q[lut]}, d30 \n\t"
 72 | 		"vtbl.u8 d31, {%q[lut]}, d31 \n\t"
 73 | 
 74 | 		// Add the delta values.
 75 | 		"vadd.u8 q12, %q[t0] \n\t"
 76 | 		"vadd.u8 q13, %q[t1] \n\t"
 77 | 		"vadd.u8 q14, %q[t2] \n\t"
 78 | 		"vadd.u8 q15, %q[t3] \n\t"
 79 | 
 80 | 		// Store 64 bytes and interleave. Increment the dest pointer.
 81 | 		"vst4.8 {d24, d26, d28, d30}, [%[dst]]! \n\t"
 82 | 		"vst4.8 {d25, d27, d29, d31}, [%[dst]]! \n\t"
 83 | 
 84 | 		// Outputs (modified).
 85 | 		: [src] "+r"  (*s),
 86 | 		  [dst] "+r"  (*o),
 87 | 		  [t0]  "=&w" (tmp0),
 88 | 		  [t1]  "=&w" (tmp1),
 89 | 		  [t2]  "=&w" (tmp2),
 90 | 		  [t3]  "=&w" (tmp3),
 91 | 		  [m0]  "=&w" (mask0),
 92 | 		  [m1]  "=&w" (mask1),
 93 | 		  [m2]  "=&w" (mask2),
 94 | 		  [m3]  "=&w" (mask3)
 95 | 
 96 | 		// Inputs (not modified).
 97 | 		: [lut] "w" (lut),
 98 | 		  [n25] "w" (n25),
 99 | 		  [n51] "w" (n51),
100 | 		  [n63] "w" (n63)
101 | 
102 | 		// Clobbers.
103 | 		: "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
104 | 		  "cc", "memory"
105 | 	);
106 | }
107 | #endif
108 | 
109 | static BASE64_FORCE_INLINE void
110 | enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
111 | {
112 | #ifdef BASE64_NEON32_USE_ASM
113 | 	enc_loop_neon32_inner_asm(s, o);
114 | #else
115 | 	// Load 48 bytes and deinterleave:
116 | 	uint8x16x3_t src = vld3q_u8(*s);
117 | 
118 | 	// Reshuffle:
119 | 	uint8x16x4_t out = enc_reshuffle(src);
120 | 
121 | 	// Translate reshuffled bytes to the Base64 alphabet:
122 | 	out = enc_translate(out);
123 | 
124 | 	// Interleave and store output:
125 | 	vst4q_u8(*o, out);
126 | 
127 | 	*s += 48;
128 | 	*o += 64;
129 | #endif
130 | }
131 | 
132 | static inline void
133 | enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
134 | {
135 | 	size_t rounds = *slen / 48;
136 | 
137 | 	*slen -= rounds * 48;	// 48 bytes consumed per round
138 | 	*olen += rounds * 64;	// 64 bytes produced per round
139 | 
140 | 	while (rounds > 0) {
141 | 		if (rounds >= 8) {
142 | 			enc_loop_neon32_inner(s, o);
143 | 			enc_loop_neon32_inner(s, o);
144 | 			enc_loop_neon32_inner(s, o);
145 | 			enc_loop_neon32_inner(s, o);
146 | 			enc_loop_neon32_inner(s, o);
147 | 			enc_loop_neon32_inner(s, o);
148 | 			enc_loop_neon32_inner(s, o);
149 | 			enc_loop_neon32_inner(s, o);
150 | 			rounds -= 8;
151 | 			continue;
152 | 		}
153 | 		if (rounds >= 4) {
154 | 			enc_loop_neon32_inner(s, o);
155 | 			enc_loop_neon32_inner(s, o);
156 | 			enc_loop_neon32_inner(s, o);
157 | 			enc_loop_neon32_inner(s, o);
158 | 			rounds -= 4;
159 | 			continue;
160 | 		}
161 | 		if (rounds >= 2) {
162 | 			enc_loop_neon32_inner(s, o);
163 | 			enc_loop_neon32_inner(s, o);
164 | 			rounds -= 2;
165 | 			continue;
166 | 		}
167 | 		enc_loop_neon32_inner(s, o);
168 | 		break;
169 | 	}
170 | }
171 | 


--------------------------------------------------------------------------------
/lib/arch/neon32/enc_reshuffle.c:
--------------------------------------------------------------------------------
 1 | static BASE64_FORCE_INLINE uint8x16x4_t
 2 | enc_reshuffle (uint8x16x3_t in)
 3 | {
 4 | 	uint8x16x4_t out;
 5 | 
 6 | 	// Input:
 7 | 	// in[0]  = a7 a6 a5 a4 a3 a2 a1 a0
 8 | 	// in[1]  = b7 b6 b5 b4 b3 b2 b1 b0
 9 | 	// in[2]  = c7 c6 c5 c4 c3 c2 c1 c0
10 | 
11 | 	// Output:
12 | 	// out[0] = 00 00 a7 a6 a5 a4 a3 a2
13 | 	// out[1] = 00 00 a1 a0 b7 b6 b5 b4
14 | 	// out[2] = 00 00 b3 b2 b1 b0 c7 c6
15 | 	// out[3] = 00 00 c5 c4 c3 c2 c1 c0
16 | 
17 | 	// Move the input bits to where they need to be in the outputs. Except
18 | 	// for the first output, the high two bits are not cleared.
19 | 	out.val[0] = vshrq_n_u8(in.val[0], 2);
20 | 	out.val[1] = vshrq_n_u8(in.val[1], 4);
21 | 	out.val[2] = vshrq_n_u8(in.val[2], 6);
22 | 	out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 4);
23 | 	out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 2);
24 | 
25 | 	// Clear the high two bits in the second, third and fourth output.
26 | 	out.val[1] = vandq_u8(out.val[1], vdupq_n_u8(0x3F));
27 | 	out.val[2] = vandq_u8(out.val[2], vdupq_n_u8(0x3F));
28 | 	out.val[3] = vandq_u8(in.val[2],  vdupq_n_u8(0x3F));
29 | 
30 | 	return out;
31 | }
32 | 


--------------------------------------------------------------------------------
/lib/arch/neon32/enc_translate.c:
--------------------------------------------------------------------------------
 1 | static BASE64_FORCE_INLINE uint8x16x4_t
 2 | enc_translate (const uint8x16x4_t in)
 3 | {
 4 | 	// A lookup table containing the absolute offsets for all ranges:
 5 | 	const uint8x16_t lut = {
 6 | 		 65U,  71U, 252U, 252U,
 7 | 		252U, 252U, 252U, 252U,
 8 | 		252U, 252U, 252U, 252U,
 9 | 		237U, 240U,   0U,   0U
10 | 	};
11 | 
12 | 	const uint8x16_t offset = vdupq_n_u8(51);
13 | 
14 | 	uint8x16x4_t indices, mask, delta, out;
15 | 
16 | 	// Translate values 0..63 to the Base64 alphabet. There are five sets:
17 | 	// #  From      To         Abs    Index  Characters
18 | 	// 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
19 | 	// 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
20 | 	// 2  [52..61]  [48..57]    -4  [2..11]  0123456789
21 | 	// 3  [62]      [43]       -19       12  +
22 | 	// 4  [63]      [47]       -16       13  /
23 | 
24 | 	// Create LUT indices from input:
25 | 	// the index for range #0 is right, others are 1 less than expected:
26 | 	indices.val[0] = vqsubq_u8(in.val[0], offset);
27 | 	indices.val[1] = vqsubq_u8(in.val[1], offset);
28 | 	indices.val[2] = vqsubq_u8(in.val[2], offset);
29 | 	indices.val[3] = vqsubq_u8(in.val[3], offset);
30 | 
31 | 	// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
32 | 	mask.val[0] = vcgtq_u8(in.val[0], vdupq_n_u8(25));
33 | 	mask.val[1] = vcgtq_u8(in.val[1], vdupq_n_u8(25));
34 | 	mask.val[2] = vcgtq_u8(in.val[2], vdupq_n_u8(25));
35 | 	mask.val[3] = vcgtq_u8(in.val[3], vdupq_n_u8(25));
36 | 
37 | 	// Subtract -1, so add 1 to indices for range #[1..4], All indices are
38 | 	// now correct:
39 | 	indices.val[0] = vsubq_u8(indices.val[0], mask.val[0]);
40 | 	indices.val[1] = vsubq_u8(indices.val[1], mask.val[1]);
41 | 	indices.val[2] = vsubq_u8(indices.val[2], mask.val[2]);
42 | 	indices.val[3] = vsubq_u8(indices.val[3], mask.val[3]);
43 | 
44 | 	// Lookup delta values:
45 | 	delta.val[0] = vqtbl1q_u8(lut, indices.val[0]);
46 | 	delta.val[1] = vqtbl1q_u8(lut, indices.val[1]);
47 | 	delta.val[2] = vqtbl1q_u8(lut, indices.val[2]);
48 | 	delta.val[3] = vqtbl1q_u8(lut, indices.val[3]);
49 | 
50 | 	// Add delta values:
51 | 	out.val[0] = vaddq_u8(in.val[0], delta.val[0]);
52 | 	out.val[1] = vaddq_u8(in.val[1], delta.val[1]);
53 | 	out.val[2] = vaddq_u8(in.val[2], delta.val[2]);
54 | 	out.val[3] = vaddq_u8(in.val[3], delta.val[3]);
55 | 
56 | 	return out;
57 | }
58 | 


--------------------------------------------------------------------------------
/lib/arch/neon64/codec.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stddef.h>
 3 | #include <string.h>
 4 | 
 5 | #include "../../../include/libbase64.h"
 6 | #include "../../tables/tables.h"
 7 | #include "../../codecs.h"
 8 | #include "config.h"
 9 | #include "../../env.h"
10 | 
11 | #if HAVE_NEON64
12 | #include <arm_neon.h>
13 | 
14 | // Only enable inline assembly on supported compilers.
15 | #if defined(__GNUC__) || defined(__clang__)
16 | #define BASE64_NEON64_USE_ASM
17 | #endif
18 | 
19 | static BASE64_FORCE_INLINE uint8x16x4_t
20 | load_64byte_table (const uint8_t *p)
21 | {
22 | #ifdef BASE64_NEON64_USE_ASM
23 | 
24 | 	// Force the table to be loaded into contiguous registers. GCC will not
25 | 	// normally allocate contiguous registers for a `uint8x16x4_t'. These
26 | 	// registers are chosen to not conflict with the ones in the enc loop.
27 | 	register uint8x16_t t0 __asm__ ("v8");
28 | 	register uint8x16_t t1 __asm__ ("v9");
29 | 	register uint8x16_t t2 __asm__ ("v10");
30 | 	register uint8x16_t t3 __asm__ ("v11");
31 | 
32 | 	__asm__ (
33 | 		"ld1 {%[t0].16b, %[t1].16b, %[t2].16b, %[t3].16b}, [%[src]], #64 \n\t"
34 | 		: [src] "+r" (p),
35 | 		  [t0]  "=w" (t0),
36 | 		  [t1]  "=w" (t1),
37 | 		  [t2]  "=w" (t2),
38 | 		  [t3]  "=w" (t3)
39 | 	);
40 | 
41 | 	return (uint8x16x4_t) {
42 | 		.val[0] = t0,
43 | 		.val[1] = t1,
44 | 		.val[2] = t2,
45 | 		.val[3] = t3,
46 | 	};
47 | #else
48 | 	return vld1q_u8_x4(p);
49 | #endif
50 | }
51 | 
52 | #include "../generic/32/dec_loop.c"
53 | #include "../generic/64/enc_loop.c"
54 | #include "dec_loop.c"
55 | 
56 | #ifdef BASE64_NEON64_USE_ASM
57 | # include "enc_loop_asm.c"
58 | #else
59 | # include "enc_reshuffle.c"
60 | # include "enc_loop.c"
61 | #endif
62 | 
63 | #endif	// HAVE_NEON64
64 | 
65 | // Stride size is so large on these NEON 64-bit functions
66 | // (48 bytes encode, 64 bytes decode) that we inline the
67 | // uint64 codec to stay performant on smaller inputs.
68 | 
69 | void
70 | base64_stream_encode_neon64 BASE64_ENC_PARAMS
71 | {
72 | #if HAVE_NEON64
73 | 	#include "../generic/enc_head.c"
74 | 	enc_loop_neon64(&s, &slen, &o, &olen);
75 | 	enc_loop_generic_64(&s, &slen, &o, &olen);
76 | 	#include "../generic/enc_tail.c"
77 | #else
78 | 	base64_enc_stub(state, src, srclen, out, outlen);
79 | #endif
80 | }
81 | 
82 | int
83 | base64_stream_decode_neon64 BASE64_DEC_PARAMS
84 | {
85 | #if HAVE_NEON64
86 | 	#include "../generic/dec_head.c"
87 | 	dec_loop_neon64(&s, &slen, &o, &olen);
88 | 	dec_loop_generic_32(&s, &slen, &o, &olen);
89 | 	#include "../generic/dec_tail.c"
90 | #else
91 | 	return base64_dec_stub(state, src, srclen, out, outlen);
92 | #endif
93 | }
94 | 


--------------------------------------------------------------------------------
/lib/arch/neon64/dec_loop.c:
--------------------------------------------------------------------------------
  1 | // The input consists of five valid character sets in the Base64 alphabet,
  2 | // which we need to map back to the 6-bit values they represent.
  3 | // There are three ranges, two singles, and then there's the rest.
  4 | //
  5 | //   #  From       To        LUT  Characters
  6 | //   1  [0..42]    [255]      #1  invalid input
  7 | //   2  [43]       [62]       #1  +
  8 | //   3  [44..46]   [255]      #1  invalid input
  9 | //   4  [47]       [63]       #1  /
 10 | //   5  [48..57]   [52..61]   #1  0..9
 11 | //   6  [58..63]   [255]      #1  invalid input
 12 | //   7  [64]       [255]      #2  invalid input
 13 | //   8  [65..90]   [0..25]    #2  A..Z
 14 | //   9  [91..96]   [255]      #2  invalid input
 15 | //  10  [97..122]  [26..51]   #2  a..z
 16 | //  11  [123..126] [255]      #2  invalid input
 17 | // (12) Everything else => invalid input
 18 | 
 19 | // The first LUT will use the VTBL instruction (out of range indices are set to
 20 | // 0 in destination).
 21 | static const uint8_t dec_lut1[] = {
 22 | 	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
 23 | 	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
 24 | 	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,  62U, 255U, 255U, 255U,  63U,
 25 | 	 52U,  53U,  54U,  55U,  56U,  57U,  58U,  59U,  60U,  61U, 255U, 255U, 255U, 255U, 255U, 255U,
 26 | };
 27 | 
 28 | // The second LUT will use the VTBX instruction (out of range indices will be
 29 | // unchanged in destination). Input [64..126] will be mapped to index [1..63]
 30 | // in this LUT. Index 0 means that value comes from LUT #1.
 31 | static const uint8_t dec_lut2[] = {
 32 | 	  0U, 255U,   0U,   1U,   2U,   3U,   4U,   5U,   6U,   7U,   8U,   9U,  10U,  11U,  12U,  13U,
 33 | 	 14U,  15U,  16U,  17U,  18U,  19U,  20U,  21U,  22U,  23U,  24U,  25U, 255U, 255U, 255U, 255U,
 34 | 	255U, 255U,  26U,  27U,  28U,  29U,  30U,  31U,  32U,  33U,  34U,  35U,  36U,  37U,  38U,  39U,
 35 | 	 40U,  41U,  42U,  43U,  44U,  45U,  46U,  47U,  48U,  49U,  50U,  51U, 255U, 255U, 255U, 255U,
 36 | };
 37 | 
 38 | // All input values in range for the first look-up will be 0U in the second
 39 | // look-up result. All input values out of range for the first look-up will be
 40 | // 0U in the first look-up result. Thus, the two results can be ORed without
 41 | // conflicts.
 42 | //
 43 | // Invalid characters that are in the valid range for either look-up will be
 44 | // set to 255U in the combined result. Other invalid characters will just be
 45 | // passed through with the second look-up result (using the VTBX instruction).
 46 | // Since the second LUT is 64 bytes, those passed-through values are guaranteed
 47 | // to have a value greater than 63U. Therefore, valid characters will be mapped
 48 | // to the valid [0..63] range and all invalid characters will be mapped to
 49 | // values greater than 63.
 50 | 
 51 | static inline void
 52 | dec_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 53 | {
 54 | 	if (*slen < 64) {
 55 | 		return;
 56 | 	}
 57 | 
 58 | 	// Process blocks of 64 bytes per round. Unlike the SSE codecs, no
 59 | 	// extra trailing zero bytes are written, so it is not necessary to
 60 | 	// reserve extra input bytes:
 61 | 	size_t rounds = *slen / 64;
 62 | 
 63 | 	*slen -= rounds * 64;	// 64 bytes consumed per round
 64 | 	*olen += rounds * 48;	// 48 bytes produced per round
 65 | 
 66 | 	const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_lut1);
 67 | 	const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_lut2);
 68 | 
 69 | 	do {
 70 | 		const uint8x16_t offset = vdupq_n_u8(63U);
 71 | 		uint8x16x4_t dec1, dec2;
 72 | 		uint8x16x3_t dec;
 73 | 
 74 | 		// Load 64 bytes and deinterleave:
 75 | 		uint8x16x4_t str = vld4q_u8((uint8_t *) *s);
 76 | 
 77 | 		// Get indices for second LUT:
 78 | 		dec2.val[0] = vqsubq_u8(str.val[0], offset);
 79 | 		dec2.val[1] = vqsubq_u8(str.val[1], offset);
 80 | 		dec2.val[2] = vqsubq_u8(str.val[2], offset);
 81 | 		dec2.val[3] = vqsubq_u8(str.val[3], offset);
 82 | 
 83 | 		// Get values from first LUT:
 84 | 		dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]);
 85 | 		dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]);
 86 | 		dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]);
 87 | 		dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]);
 88 | 
 89 | 		// Get values from second LUT:
 90 | 		dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]);
 91 | 		dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]);
 92 | 		dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]);
 93 | 		dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]);
 94 | 
 95 | 		// Get final values:
 96 | 		str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]);
 97 | 		str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]);
 98 | 		str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]);
 99 | 		str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]);
100 | 
101 | 		// Check for invalid input, any value larger than 63:
102 | 		const uint8x16_t classified
103 | 			= vorrq_u8(
104 | 				vorrq_u8(vcgtq_u8(str.val[0], vdupq_n_u8(63)), vcgtq_u8(str.val[1], vdupq_n_u8(63))),
105 | 				vorrq_u8(vcgtq_u8(str.val[2], vdupq_n_u8(63)), vcgtq_u8(str.val[3], vdupq_n_u8(63)))
106 | 			);
107 | 
108 | 		// Check that all bits are zero:
109 | 		if (vmaxvq_u8(classified) != 0U) {
110 | 			break;
111 | 		}
112 | 
113 | 		// Compress four bytes into three:
114 | 		dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
115 | 		dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
116 | 		dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
117 | 
118 | 		// Interleave and store decoded result:
119 | 		vst3q_u8((uint8_t *) *o, dec);
120 | 
121 | 		*s += 64;
122 | 		*o += 48;
123 | 
124 | 	} while (--rounds > 0);
125 | 
126 | 	// Adjust for any rounds that were skipped:
127 | 	*slen += rounds * 64;
128 | 	*olen -= rounds * 48;
129 | }
130 | 


--------------------------------------------------------------------------------
/lib/arch/neon64/enc_loop.c:
--------------------------------------------------------------------------------
 1 | static BASE64_FORCE_INLINE void
 2 | enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
 3 | {
 4 | 	// Load 48 bytes and deinterleave:
 5 | 	uint8x16x3_t src = vld3q_u8(*s);
 6 | 
 7 | 	// Divide bits of three input bytes over four output bytes:
 8 | 	uint8x16x4_t out = enc_reshuffle(src);
 9 | 
10 | 	// The bits have now been shifted to the right locations;
11 | 	// translate their values 0..63 to the Base64 alphabet.
12 | 	// Use a 64-byte table lookup:
13 | 	out.val[0] = vqtbl4q_u8(tbl_enc, out.val[0]);
14 | 	out.val[1] = vqtbl4q_u8(tbl_enc, out.val[1]);
15 | 	out.val[2] = vqtbl4q_u8(tbl_enc, out.val[2]);
16 | 	out.val[3] = vqtbl4q_u8(tbl_enc, out.val[3]);
17 | 
18 | 	// Interleave and store output:
19 | 	vst4q_u8(*o, out);
20 | 
21 | 	*s += 48;
22 | 	*o += 64;
23 | }
24 | 
25 | static inline void
26 | enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
27 | {
28 | 	size_t rounds = *slen / 48;
29 | 
30 | 	*slen -= rounds * 48;	// 48 bytes consumed per round
31 | 	*olen += rounds * 64;	// 64 bytes produced per round
32 | 
33 | 	// Load the encoding table:
34 | 	const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);
35 | 
36 | 	while (rounds > 0) {
37 | 		if (rounds >= 8) {
38 | 			enc_loop_neon64_inner(s, o, tbl_enc);
39 | 			enc_loop_neon64_inner(s, o, tbl_enc);
40 | 			enc_loop_neon64_inner(s, o, tbl_enc);
41 | 			enc_loop_neon64_inner(s, o, tbl_enc);
42 | 			enc_loop_neon64_inner(s, o, tbl_enc);
43 | 			enc_loop_neon64_inner(s, o, tbl_enc);
44 | 			enc_loop_neon64_inner(s, o, tbl_enc);
45 | 			enc_loop_neon64_inner(s, o, tbl_enc);
46 | 			rounds -= 8;
47 | 			continue;
48 | 		}
49 | 		if (rounds >= 4) {
50 | 			enc_loop_neon64_inner(s, o, tbl_enc);
51 | 			enc_loop_neon64_inner(s, o, tbl_enc);
52 | 			enc_loop_neon64_inner(s, o, tbl_enc);
53 | 			enc_loop_neon64_inner(s, o, tbl_enc);
54 | 			rounds -= 4;
55 | 			continue;
56 | 		}
57 | 		if (rounds >= 2) {
58 | 			enc_loop_neon64_inner(s, o, tbl_enc);
59 | 			enc_loop_neon64_inner(s, o, tbl_enc);
60 | 			rounds -= 2;
61 | 			continue;
62 | 		}
63 | 		enc_loop_neon64_inner(s, o, tbl_enc);
64 | 		break;
65 | 	}
66 | }
67 | 


--------------------------------------------------------------------------------
/lib/arch/neon64/enc_loop_asm.c:
--------------------------------------------------------------------------------
  1 | // Apologies in advance for combining the preprocessor with inline assembly,
  2 | // two notoriously gnarly parts of C, but it was necessary to avoid a lot of
  3 | // code repetition. The preprocessor is used to template large sections of
  4 | // inline assembly that differ only in the registers used. If the code was
  5 | // written out by hand, it would become very large and hard to audit.
  6 | 
  7 | // Generate a block of inline assembly that loads three user-defined registers
  8 | // A, B, C from memory and deinterleaves them, post-incrementing the src
  9 | // pointer. The register set should be sequential.
 10 | #define LOAD(A, B, C) \
 11 | 	"ld3 {"A".16b, "B".16b, "C".16b}, [%[src]], #48 \n\t"
 12 | 
 13 | // Generate a block of inline assembly that takes three deinterleaved registers
 14 | // and shuffles the bytes. The output is in temporary registers t0..t3.
 15 | #define SHUF(A, B, C) \
 16 | 	"ushr %[t0].16b, "A".16b,   #2         \n\t" \
 17 | 	"ushr %[t1].16b, "B".16b,   #4         \n\t" \
 18 | 	"ushr %[t2].16b, "C".16b,   #6         \n\t" \
 19 | 	"sli  %[t1].16b, "A".16b,   #4         \n\t" \
 20 | 	"sli  %[t2].16b, "B".16b,   #2         \n\t" \
 21 | 	"and  %[t1].16b, %[t1].16b, %[n63].16b \n\t" \
 22 | 	"and  %[t2].16b, %[t2].16b, %[n63].16b \n\t" \
 23 | 	"and  %[t3].16b, "C".16b,   %[n63].16b \n\t"
 24 | 
 25 | // Generate a block of inline assembly that takes temporary registers t0..t3
 26 | // and translates them to the base64 alphabet, using a table loaded into
 27 | // v8..v11. The output is in user-defined registers A..D.
 28 | #define TRAN(A, B, C, D) \
 29 | 	"tbl "A".16b, {v8.16b-v11.16b}, %[t0].16b \n\t" \
 30 | 	"tbl "B".16b, {v8.16b-v11.16b}, %[t1].16b \n\t" \
 31 | 	"tbl "C".16b, {v8.16b-v11.16b}, %[t2].16b \n\t" \
 32 | 	"tbl "D".16b, {v8.16b-v11.16b}, %[t3].16b \n\t"
 33 | 
 34 | // Generate a block of inline assembly that interleaves four registers and
 35 | // stores them, post-incrementing the destination pointer.
 36 | #define STOR(A, B, C, D) \
 37 | 	"st4 {"A".16b, "B".16b, "C".16b, "D".16b}, [%[dst]], #64 \n\t"
 38 | 
 39 | // Generate a block of inline assembly that generates a single self-contained
 40 | // encoder round: fetch the data, process it, and store the result.
 41 | #define ROUND() \
 42 | 	LOAD("v12", "v13", "v14") \
 43 | 	SHUF("v12", "v13", "v14") \
 44 | 	TRAN("v12", "v13", "v14", "v15") \
 45 | 	STOR("v12", "v13", "v14", "v15")
 46 | 
 47 | // Generate a block of assembly that generates a type A interleaved encoder
 48 | // round. It uses registers that were loaded by the previous type B round, and
 49 | // in turn loads registers for the next type B round.
 50 | #define ROUND_A() \
 51 | 	SHUF("v2",  "v3",  "v4") \
 52 | 	LOAD("v12", "v13", "v14") \
 53 | 	TRAN("v2",  "v3",  "v4", "v5") \
 54 | 	STOR("v2",  "v3",  "v4", "v5")
 55 | 
 56 | // Type B interleaved encoder round. Same as type A, but register sets swapped.
 57 | #define ROUND_B() \
 58 | 	SHUF("v12", "v13", "v14") \
 59 | 	LOAD("v2",  "v3",  "v4") \
 60 | 	TRAN("v12", "v13", "v14", "v15") \
 61 | 	STOR("v12", "v13", "v14", "v15")
 62 | 
 63 | // The first type A round needs to load its own registers.
 64 | #define ROUND_A_FIRST() \
 65 | 	LOAD("v2", "v3", "v4") \
 66 | 	ROUND_A()
 67 | 
 68 | // The last type B round omits the load for the next step.
 69 | #define ROUND_B_LAST() \
 70 | 	SHUF("v12", "v13", "v14") \
 71 | 	TRAN("v12", "v13", "v14", "v15") \
 72 | 	STOR("v12", "v13", "v14", "v15")
 73 | 
 74 | // Suppress clang's warning that the literal string in the asm statement is
 75 | // overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
 76 | // compilers). It may be true, but the goal here is not C99 portability.
 77 | #pragma GCC diagnostic push
 78 | #pragma GCC diagnostic ignored "-Woverlength-strings"
 79 | 
 80 | static inline void
 81 | enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
 82 | {
 83 | 	size_t rounds = *slen / 48;
 84 | 
 85 | 	if (rounds == 0) {
 86 | 		return;
 87 | 	}
 88 | 
 89 | 	*slen -= rounds * 48;	// 48 bytes consumed per round.
 90 | 	*olen += rounds * 64;	// 64 bytes produced per round.
 91 | 
 92 | 	// Number of times to go through the 8x loop.
 93 | 	size_t loops = rounds / 8;
 94 | 
 95 | 	// Number of rounds remaining after the 8x loop.
 96 | 	rounds %= 8;
 97 | 
 98 | 	// Temporary registers, used as scratch space.
 99 | 	uint8x16_t tmp0, tmp1, tmp2, tmp3;
100 | 
101 | 	__asm__ volatile (
102 | 
103 | 		// Load the encoding table into v8..v11.
104 | 		"    ld1 {v8.16b-v11.16b}, [%[tbl]] \n\t"
105 | 
106 | 		// If there are eight rounds or more, enter an 8x unrolled loop
107 | 		// of interleaved encoding rounds. The rounds interleave memory
108 | 		// operations (load/store) with data operations to maximize
109 | 		// pipeline throughput.
110 | 		"    cbz %[loops], 4f \n\t"
111 | 
112 | 		// The SIMD instructions do not touch the flags.
113 | 		"88: subs %[loops], %[loops], #1 \n\t"
114 | 		"    " ROUND_A_FIRST()
115 | 		"    " ROUND_B()
116 | 		"    " ROUND_A()
117 | 		"    " ROUND_B()
118 | 		"    " ROUND_A()
119 | 		"    " ROUND_B()
120 | 		"    " ROUND_A()
121 | 		"    " ROUND_B_LAST()
122 | 		"    b.ne 88b \n\t"
123 | 
124 | 		// Enter a 4x unrolled loop for rounds of 4 or more.
125 | 		"4:  cmp  %[rounds], #4 \n\t"
126 | 		"    b.lt 30f           \n\t"
127 | 		"    " ROUND_A_FIRST()
128 | 		"    " ROUND_B()
129 | 		"    " ROUND_A()
130 | 		"    " ROUND_B_LAST()
131 | 		"    sub %[rounds], %[rounds], #4 \n\t"
132 | 
133 | 		// Dispatch the remaining rounds 0..3.
134 | 		"30: cbz  %[rounds], 0f \n\t"
135 | 		"    cmp  %[rounds], #2 \n\t"
136 | 		"    b.eq 2f            \n\t"
137 | 		"    b.lt 1f            \n\t"
138 | 
139 | 		// Block of non-interlaced encoding rounds, which can each
140 | 		// individually be jumped to. Rounds fall through to the next.
141 | 		"3:  " ROUND()
142 | 		"2:  " ROUND()
143 | 		"1:  " ROUND()
144 | 		"0:  \n\t"
145 | 
146 | 		// Outputs (modified).
147 | 		: [loops] "+r"  (loops),
148 | 		  [src]   "+r"  (*s),
149 | 		  [dst]   "+r"  (*o),
150 | 		  [t0]    "=&w" (tmp0),
151 | 		  [t1]    "=&w" (tmp1),
152 | 		  [t2]    "=&w" (tmp2),
153 | 		  [t3]    "=&w" (tmp3)
154 | 
155 | 		// Inputs (not modified).
156 | 		: [rounds] "r" (rounds),
157 | 		  [tbl]    "r" (base64_table_enc_6bit),
158 | 		  [n63]    "w" (vdupq_n_u8(63))
159 | 
160 | 		// Clobbers.
161 | 		: "v2",  "v3",  "v4",  "v5",
162 | 		  "v8",  "v9",  "v10", "v11",
163 | 		  "v12", "v13", "v14", "v15",
164 | 		  "cc", "memory"
165 | 	);
166 | }
167 | 
168 | #pragma GCC diagnostic pop
169 | 


--------------------------------------------------------------------------------
/lib/arch/neon64/enc_reshuffle.c:
--------------------------------------------------------------------------------
 1 | static BASE64_FORCE_INLINE uint8x16x4_t
 2 | enc_reshuffle (const uint8x16x3_t in)
 3 | {
 4 | 	uint8x16x4_t out;
 5 | 
 6 | 	// Input:
 7 | 	// in[0]  = a7 a6 a5 a4 a3 a2 a1 a0
 8 | 	// in[1]  = b7 b6 b5 b4 b3 b2 b1 b0
 9 | 	// in[2]  = c7 c6 c5 c4 c3 c2 c1 c0
10 | 
11 | 	// Output:
12 | 	// out[0] = 00 00 a7 a6 a5 a4 a3 a2
13 | 	// out[1] = 00 00 a1 a0 b7 b6 b5 b4
14 | 	// out[2] = 00 00 b3 b2 b1 b0 c7 c6
15 | 	// out[3] = 00 00 c5 c4 c3 c2 c1 c0
16 | 
17 | 	// Move the input bits to where they need to be in the outputs. Except
18 | 	// for the first output, the high two bits are not cleared.
19 | 	out.val[0] = vshrq_n_u8(in.val[0], 2);
20 | 	out.val[1] = vshrq_n_u8(in.val[1], 4);
21 | 	out.val[2] = vshrq_n_u8(in.val[2], 6);
22 | 	out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 4);
23 | 	out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 2);
24 | 
25 | 	// Clear the high two bits in the second, third and fourth output.
26 | 	out.val[1] = vandq_u8(out.val[1], vdupq_n_u8(0x3F));
27 | 	out.val[2] = vandq_u8(out.val[2], vdupq_n_u8(0x3F));
28 | 	out.val[3] = vandq_u8(in.val[2],  vdupq_n_u8(0x3F));
29 | 
30 | 	return out;
31 | }
32 | 


--------------------------------------------------------------------------------
/lib/arch/sse41/codec.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stddef.h>
 3 | #include <stdlib.h>
 4 | 
 5 | #include "../../../include/libbase64.h"
 6 | #include "../../tables/tables.h"
 7 | #include "../../codecs.h"
 8 | #include "config.h"
 9 | #include "../../env.h"
10 | 
11 | #if HAVE_SSE41
12 | #include <smmintrin.h>
13 | 
14 | // Only enable inline assembly on supported compilers and on 64-bit CPUs.
15 | #ifndef BASE64_SSE41_USE_ASM
16 | # if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
17 | #  define BASE64_SSE41_USE_ASM 1
18 | # else
19 | #  define BASE64_SSE41_USE_ASM 0
20 | # endif
21 | #endif
22 | 
23 | #include "../ssse3/dec_reshuffle.c"
24 | #include "../ssse3/dec_loop.c"
25 | 
26 | #if BASE64_SSE41_USE_ASM
27 | # include "../ssse3/enc_loop_asm.c"
28 | #else
29 | # include "../ssse3/enc_translate.c"
30 | # include "../ssse3/enc_reshuffle.c"
31 | # include "../ssse3/enc_loop.c"
32 | #endif
33 | 
34 | #endif	// HAVE_SSE41
35 | 
36 | void
37 | base64_stream_encode_sse41 BASE64_ENC_PARAMS
38 | {
39 | #if HAVE_SSE41
40 | 	#include "../generic/enc_head.c"
41 | 	enc_loop_ssse3(&s, &slen, &o, &olen);
42 | 	#include "../generic/enc_tail.c"
43 | #else
44 | 	base64_enc_stub(state, src, srclen, out, outlen);
45 | #endif
46 | }
47 | 
48 | int
49 | base64_stream_decode_sse41 BASE64_DEC_PARAMS
50 | {
51 | #if HAVE_SSE41
52 | 	#include "../generic/dec_head.c"
53 | 	dec_loop_ssse3(&s, &slen, &o, &olen);
54 | 	#include "../generic/dec_tail.c"
55 | #else
56 | 	return base64_dec_stub(state, src, srclen, out, outlen);
57 | #endif
58 | }
59 | 


--------------------------------------------------------------------------------
/lib/arch/sse42/codec.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stddef.h>
 3 | #include <stdlib.h>
 4 | 
 5 | #include "../../../include/libbase64.h"
 6 | #include "../../tables/tables.h"
 7 | #include "../../codecs.h"
 8 | #include "config.h"
 9 | #include "../../env.h"
10 | 
11 | #if HAVE_SSE42
12 | #include <nmmintrin.h>
13 | 
14 | // Only enable inline assembly on supported compilers and on 64-bit CPUs.
15 | #ifndef BASE64_SSE42_USE_ASM
16 | # if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
17 | #  define BASE64_SSE42_USE_ASM 1
18 | # else
19 | #  define BASE64_SSE42_USE_ASM 0
20 | # endif
21 | #endif
22 | 
23 | #include "../ssse3/dec_reshuffle.c"
24 | #include "../ssse3/dec_loop.c"
25 | 
26 | #if BASE64_SSE42_USE_ASM
27 | # include "../ssse3/enc_loop_asm.c"
28 | #else
29 | # include "../ssse3/enc_translate.c"
30 | # include "../ssse3/enc_reshuffle.c"
31 | # include "../ssse3/enc_loop.c"
32 | #endif
33 | 
34 | #endif	// HAVE_SSE42
35 | 
36 | void
37 | base64_stream_encode_sse42 BASE64_ENC_PARAMS
38 | {
39 | #if HAVE_SSE42
40 | 	#include "../generic/enc_head.c"
41 | 	enc_loop_ssse3(&s, &slen, &o, &olen);
42 | 	#include "../generic/enc_tail.c"
43 | #else
44 | 	base64_enc_stub(state, src, srclen, out, outlen);
45 | #endif
46 | }
47 | 
48 | int
49 | base64_stream_decode_sse42 BASE64_DEC_PARAMS
50 | {
51 | #if HAVE_SSE42
52 | 	#include "../generic/dec_head.c"
53 | 	dec_loop_ssse3(&s, &slen, &o, &olen);
54 | 	#include "../generic/dec_tail.c"
55 | #else
56 | 	return base64_dec_stub(state, src, srclen, out, outlen);
57 | #endif
58 | }
59 | 


--------------------------------------------------------------------------------
/lib/arch/ssse3/codec.c:
--------------------------------------------------------------------------------
 1 | #include <stdint.h>
 2 | #include <stddef.h>
 3 | #include <stdlib.h>
 4 | 
 5 | #include "../../../include/libbase64.h"
 6 | #include "../../tables/tables.h"
 7 | #include "../../codecs.h"
 8 | #include "config.h"
 9 | #include "../../env.h"
10 | 
11 | #if HAVE_SSSE3
12 | #include <tmmintrin.h>
13 | 
14 | // Only enable inline assembly on supported compilers and on 64-bit CPUs.
15 | // 32-bit CPUs with SSSE3 support, such as low-end Atoms, only have eight XMM
16 | // registers, which is not enough to run the inline assembly.
17 | #ifndef BASE64_SSSE3_USE_ASM
18 | # if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
19 | #  define BASE64_SSSE3_USE_ASM 1
20 | # else
21 | #  define BASE64_SSSE3_USE_ASM 0
22 | # endif
23 | #endif
24 | 
25 | #include "dec_reshuffle.c"
26 | #include "dec_loop.c"
27 | 
28 | #if BASE64_SSSE3_USE_ASM
29 | # include "enc_loop_asm.c"
30 | #else
31 | # include "enc_reshuffle.c"
32 | # include "enc_translate.c"
33 | # include "enc_loop.c"
34 | #endif
35 | 
36 | #endif	// HAVE_SSSE3
37 | 
38 | void
39 | base64_stream_encode_ssse3 BASE64_ENC_PARAMS
40 | {
41 | #if HAVE_SSSE3
42 | 	#include "../generic/enc_head.c"
43 | 	enc_loop_ssse3(&s, &slen, &o, &olen);
44 | 	#include "../generic/enc_tail.c"
45 | #else
46 | 	base64_enc_stub(state, src, srclen, out, outlen);
47 | #endif
48 | }
49 | 
50 | int
51 | base64_stream_decode_ssse3 BASE64_DEC_PARAMS
52 | {
53 | #if HAVE_SSSE3
54 | 	#include "../generic/dec_head.c"
55 | 	dec_loop_ssse3(&s, &slen, &o, &olen);
56 | 	#include "../generic/dec_tail.c"
57 | #else
58 | 	return base64_dec_stub(state, src, srclen, out, outlen);
59 | #endif
60 | }
61 | 


--------------------------------------------------------------------------------
/lib/arch/ssse3/dec_loop.c:
--------------------------------------------------------------------------------
  1 | // The input consists of six character sets in the Base64 alphabet, which we
  2 | // need to map back to the 6-bit values they represent. There are three ranges,
  3 | // two singles, and then there's the rest.
  4 | //
  5 | //  #  From       To        Add  Characters
  6 | //  1  [43]       [62]      +19  +
  7 | //  2  [47]       [63]      +16  /
  8 | //  3  [48..57]   [52..61]   +4  0..9
  9 | //  4  [65..90]   [0..25]   -65  A..Z
 10 | //  5  [97..122]  [26..51]  -71  a..z
 11 | // (6) Everything else => invalid input
 12 | //
 13 | // We will use lookup tables for character validation and offset computation.
 14 | // Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8, this
 15 | // allows to mask with 0x2F instead of 0x0F and thus save one constant
 16 | // declaration (register and/or memory access).
 17 | //
 18 | // For offsets:
 19 | // Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00)
 20 | // 0000 = garbage
 21 | // 0001 = /
 22 | // 0010 = +
 23 | // 0011 = 0-9
 24 | // 0100 = A-Z
 25 | // 0101 = A-Z
 26 | // 0110 = a-z
 27 | // 0111 = a-z
 28 | // 1000 >= garbage
 29 | //
 30 | // For validation, here's the table.
 31 | // A character is valid if and only if the AND of the 2 lookups equals 0:
 32 | //
 33 | // hi \ lo              0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
 34 | //      LUT             0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
 35 | //
 36 | // 0000 0x10 char        NUL  SOH  STX  ETX  EOT  ENQ  ACK  BEL   BS   HT   LF   VT   FF   CR   SO   SI
 37 | //           andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 38 | //
 39 | // 0001 0x10 char        DLE  DC1  DC2  DC3  DC4  NAK  SYN  ETB  CAN   EM  SUB  ESC   FS   GS   RS   US
 40 | //           andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 41 | //
 42 | // 0010 0x01 char               !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
 43 | //           andlut     0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
 44 | //
 45 | // 0011 0x02 char          0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
 46 | //           andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
 47 | //
 48 | // 0100 0x04 char          @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
 49 | //           andlut     0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
 50 | //
 51 | // 0101 0x08 char          P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
 52 | //           andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
 53 | //
 54 | // 0110 0x04 char          `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
 55 | //           andlut     0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
 56 | // 0111 0x08 char          p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
 57 | //           andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
 58 | //
 59 | // 1000 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 60 | // 1001 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 61 | // 1010 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 62 | // 1011 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 63 | // 1100 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 64 | // 1101 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 65 | // 1110 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 66 | // 1111 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 67 | 
 68 | static BASE64_FORCE_INLINE int
 69 | dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
 70 | {
 71 | 	const __m128i lut_lo = _mm_setr_epi8(
 72 | 		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
 73 | 		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
 74 | 
 75 | 	const __m128i lut_hi = _mm_setr_epi8(
 76 | 		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
 77 | 		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
 78 | 
 79 | 	const __m128i lut_roll = _mm_setr_epi8(
 80 | 		0,  16,  19,   4, -65, -65, -71, -71,
 81 | 		0,   0,   0,   0,   0,   0,   0,   0);
 82 | 
 83 | 	const __m128i mask_2F = _mm_set1_epi8(0x2F);
 84 | 
 85 | 	// Load input:
 86 | 	__m128i str = _mm_loadu_si128((__m128i *) *s);
 87 | 
 88 | 	// Table lookups:
 89 | 	const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F);
 90 | 	const __m128i lo_nibbles = _mm_and_si128(str, mask_2F);
 91 | 	const __m128i hi         = _mm_shuffle_epi8(lut_hi, hi_nibbles);
 92 | 	const __m128i lo         = _mm_shuffle_epi8(lut_lo, lo_nibbles);
 93 | 
 94 | 	// Check for invalid input: if any "and" values from lo and hi are not
 95 | 	// zero, fall back on bytewise code to do error checking and reporting:
 96 | 	if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) {
 97 | 		return 0;
 98 | 	}
 99 | 
100 | 	const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F);
101 | 	const __m128i roll  = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles));
102 | 
103 | 	// Now simply add the delta values to the input:
104 | 	str = _mm_add_epi8(str, roll);
105 | 
106 | 	// Reshuffle the input to packed 12-byte output format:
107 | 	str = dec_reshuffle(str);
108 | 
109 | 	// Store the output:
110 | 	_mm_storeu_si128((__m128i *) *o, str);
111 | 
112 | 	*s += 16;
113 | 	*o += 12;
114 | 	*rounds -= 1;
115 | 
116 | 	return 1;
117 | }
118 | 
119 | static inline void
120 | dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
121 | {
122 | 	if (*slen < 24) {
123 | 		return;
124 | 	}
125 | 
126 | 	// Process blocks of 16 bytes per round. Because 4 extra zero bytes are
127 | 	// written after the output, ensure that there will be at least 8 bytes
128 | 	// of input data left to cover the gap. (6 data bytes and up to two
129 | 	// end-of-string markers.)
130 | 	size_t rounds = (*slen - 8) / 16;
131 | 
132 | 	*slen -= rounds * 16;	// 16 bytes consumed per round
133 | 	*olen += rounds * 12;	// 12 bytes produced per round
134 | 
135 | 	do {
136 | 		if (rounds >= 8) {
137 | 			if (dec_loop_ssse3_inner(s, o, &rounds) &&
138 | 			    dec_loop_ssse3_inner(s, o, &rounds) &&
139 | 			    dec_loop_ssse3_inner(s, o, &rounds) &&
140 | 			    dec_loop_ssse3_inner(s, o, &rounds) &&
141 | 			    dec_loop_ssse3_inner(s, o, &rounds) &&
142 | 			    dec_loop_ssse3_inner(s, o, &rounds) &&
143 | 			    dec_loop_ssse3_inner(s, o, &rounds) &&
144 | 			    dec_loop_ssse3_inner(s, o, &rounds)) {
145 | 				continue;
146 | 			}
147 | 			break;
148 | 		}
149 | 		if (rounds >= 4) {
150 | 			if (dec_loop_ssse3_inner(s, o, &rounds) &&
151 | 			    dec_loop_ssse3_inner(s, o, &rounds) &&
152 | 			    dec_loop_ssse3_inner(s, o, &rounds) &&
153 | 			    dec_loop_ssse3_inner(s, o, &rounds)) {
154 | 				continue;
155 | 			}
156 | 			break;
157 | 		}
158 | 		if (rounds >= 2) {
159 | 			if (dec_loop_ssse3_inner(s, o, &rounds) &&
160 | 			    dec_loop_ssse3_inner(s, o, &rounds)) {
161 | 				continue;
162 | 			}
163 | 			break;
164 | 		}
165 | 		dec_loop_ssse3_inner(s, o, &rounds);
166 | 		break;
167 | 
168 | 	} while (rounds > 0);
169 | 
170 | 	// Adjust for any rounds that were skipped:
171 | 	*slen += rounds * 16;
172 | 	*olen -= rounds * 12;
173 | }
174 | 


--------------------------------------------------------------------------------
/lib/arch/ssse3/dec_reshuffle.c:
--------------------------------------------------------------------------------
 1 | static BASE64_FORCE_INLINE __m128i
 2 | dec_reshuffle (const __m128i in)
 3 | {
 4 | 	// in, bits, upper case are most significant bits, lower case are least significant bits
 5 | 	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
 6 | 	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
 7 | 	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
 8 | 	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
 9 | 
10 | 	const __m128i merge_ab_and_bc = _mm_maddubs_epi16(in, _mm_set1_epi32(0x01400140));
11 | 	// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
12 | 	// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
13 | 	// 0000eeee FFffffff 0000DDDD DDddEEEE
14 | 	// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
15 | 
16 | 	const __m128i out = _mm_madd_epi16(merge_ab_and_bc, _mm_set1_epi32(0x00011000));
17 | 	// 00000000 JJJJJJjj KKKKkkkk LLllllll
18 | 	// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
19 | 	// 00000000 DDDDDDdd EEEEeeee FFffffff
20 | 	// 00000000 AAAAAAaa BBBBbbbb CCcccccc
21 | 
22 | 	// Pack bytes together:
23 | 	return  _mm_shuffle_epi8(out, _mm_setr_epi8(
24 | 		 2,  1,  0,
25 | 		 6,  5,  4,
26 | 		10,  9,  8,
27 | 		14, 13, 12,
28 | 		-1, -1, -1, -1));
29 | 	// 00000000 00000000 00000000 00000000
30 | 	// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
31 | 	// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
32 | 	// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
33 | }
34 | 


--------------------------------------------------------------------------------
/lib/arch/ssse3/enc_loop.c:
--------------------------------------------------------------------------------
 1 | static BASE64_FORCE_INLINE void
 2 | enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o)
 3 | {
 4 | 	// Load input:
 5 | 	__m128i str = _mm_loadu_si128((__m128i *) *s);
 6 | 
 7 | 	// Reshuffle:
 8 | 	str = enc_reshuffle(str);
 9 | 
10 | 	// Translate reshuffled bytes to the Base64 alphabet:
11 | 	str = enc_translate(str);
12 | 
13 | 	// Store:
14 | 	_mm_storeu_si128((__m128i *) *o, str);
15 | 
16 | 	*s += 12;
17 | 	*o += 16;
18 | }
19 | 
20 | static inline void
21 | enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
22 | {
23 | 	if (*slen < 16) {
24 | 		return;
25 | 	}
26 | 
27 | 	// Process blocks of 12 bytes at a time. Because blocks are loaded 16
28 | 	// bytes at a time, ensure that there will be at least 4 remaining
29 | 	// bytes after the last round, so that the final read will not pass
30 | 	// beyond the bounds of the input buffer:
31 | 	size_t rounds = (*slen - 4) / 12;
32 | 
33 | 	*slen -= rounds * 12;	// 12 bytes consumed per round
34 | 	*olen += rounds * 16;	// 16 bytes produced per round
35 | 
36 | 	do {
37 | 		if (rounds >= 8) {
38 | 			enc_loop_ssse3_inner(s, o);
39 | 			enc_loop_ssse3_inner(s, o);
40 | 			enc_loop_ssse3_inner(s, o);
41 | 			enc_loop_ssse3_inner(s, o);
42 | 			enc_loop_ssse3_inner(s, o);
43 | 			enc_loop_ssse3_inner(s, o);
44 | 			enc_loop_ssse3_inner(s, o);
45 | 			enc_loop_ssse3_inner(s, o);
46 | 			rounds -= 8;
47 | 			continue;
48 | 		}
49 | 		if (rounds >= 4) {
50 | 			enc_loop_ssse3_inner(s, o);
51 | 			enc_loop_ssse3_inner(s, o);
52 | 			enc_loop_ssse3_inner(s, o);
53 | 			enc_loop_ssse3_inner(s, o);
54 | 			rounds -= 4;
55 | 			continue;
56 | 		}
57 | 		if (rounds >= 2) {
58 | 			enc_loop_ssse3_inner(s, o);
59 | 			enc_loop_ssse3_inner(s, o);
60 | 			rounds -= 2;
61 | 			continue;
62 | 		}
63 | 		enc_loop_ssse3_inner(s, o);
64 | 		break;
65 | 
66 | 	} while (rounds > 0);
67 | }
68 | 


--------------------------------------------------------------------------------
/lib/arch/ssse3/enc_loop_asm.c:
--------------------------------------------------------------------------------
  1 | // Apologies in advance for combining the preprocessor with inline assembly,
  2 | // two notoriously gnarly parts of C, but it was necessary to avoid a lot of
  3 | // code repetition. The preprocessor is used to template large sections of
  4 | // inline assembly that differ only in the registers used. If the code was
  5 | // written out by hand, it would become very large and hard to audit.
  6 | 
  7 | // Generate a block of inline assembly that loads register R0 from memory. The
  8 | // offset at which the register is loaded is set by the given round.
  9 | #define LOAD(R0, ROUND) \
 10 | 	"lddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t"
 11 | 
 12 | // Generate a block of inline assembly that deinterleaves and shuffles register
 13 | // R0 using preloaded constants. Outputs in R0 and R1.
 14 | #define SHUF(R0, R1) \
 15 | 	"pshufb  %[lut0], %["R0"] \n\t" \
 16 | 	"movdqa  %["R0"], %["R1"] \n\t" \
 17 | 	"pand    %[msk0], %["R0"] \n\t" \
 18 | 	"pand    %[msk2], %["R1"] \n\t" \
 19 | 	"pmulhuw %[msk1], %["R0"] \n\t" \
 20 | 	"pmullw  %[msk3], %["R1"] \n\t" \
 21 | 	"por     %["R1"], %["R0"] \n\t"
 22 | 
 23 | // Generate a block of inline assembly that takes R0 and R1 and translates
 24 | // their contents to the base64 alphabet, using preloaded constants.
 25 | #define TRAN(R0, R1, R2) \
 26 | 	"movdqa  %["R0"], %["R1"] \n\t" \
 27 | 	"movdqa  %["R0"], %["R2"] \n\t" \
 28 | 	"psubusb %[n51],  %["R1"] \n\t" \
 29 | 	"pcmpgtb %[n25],  %["R2"] \n\t" \
 30 | 	"psubb   %["R2"], %["R1"] \n\t" \
 31 | 	"movdqa  %[lut1], %["R2"] \n\t" \
 32 | 	"pshufb  %["R1"], %["R2"] \n\t" \
 33 | 	"paddb   %["R2"], %["R0"] \n\t"
 34 | 
 35 | // Generate a block of inline assembly that stores the given register R0 at an
 36 | // offset set by the given round.
 37 | #define STOR(R0, ROUND) \
 38 | 	"movdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t"
 39 | 
 40 | // Generate a block of inline assembly that generates a single self-contained
 41 | // encoder round: fetch the data, process it, and store the result. Then update
 42 | // the source and destination pointers.
 43 | #define ROUND() \
 44 | 	LOAD("a", 0) \
 45 | 	SHUF("a", "b") \
 46 | 	TRAN("a", "b", "c") \
 47 | 	STOR("a", 0) \
 48 | 	"add $12, %[src] \n\t" \
 49 | 	"add $16, %[dst] \n\t"
 50 | 
 51 | // Define a macro that initiates a three-way interleaved encoding round by
 52 | // preloading registers a, b and c from memory.
 53 | // The register graph shows which registers are in use during each step, and
 54 | // is a visual aid for choosing registers for that step. Symbol index:
 55 | //
 56 | //  +  indicates that a register is loaded by that step.
 57 | //  |  indicates that a register is in use and must not be touched.
 58 | //  -  indicates that a register is decommissioned by that step.
 59 | //  x  indicates that a register is used as a temporary by that step.
 60 | //  V  indicates that a register is an input or output to the macro.
 61 | //
 62 | #define ROUND_3_INIT() 			/*  a b c d e f  */ \
 63 | 	LOAD("a", 0)			/*  +            */ \
 64 | 	SHUF("a", "d")			/*  |     +      */ \
 65 | 	LOAD("b", 1)			/*  | +   |      */ \
 66 | 	TRAN("a", "d", "e")		/*  | |   - x    */ \
 67 | 	LOAD("c", 2)			/*  V V V        */
 68 | 
 69 | // Define a macro that translates, shuffles and stores the input registers A, B
 70 | // and C, and preloads registers D, E and F for the next round.
 71 | // This macro can be arbitrarily daisy-chained by feeding output registers D, E
 72 | // and F back into the next round as input registers A, B and C. The macro
 73 | // carefully interleaves memory operations with data operations for optimal
 74 | // pipelined performance.
 75 | 
 76 | #define ROUND_3(ROUND, A,B,C,D,E,F) 	/*  A B C D E F  */ \
 77 | 	LOAD(D, (ROUND + 3))		/*  V V V +      */ \
 78 | 	SHUF(B, E)			/*  | | | | +    */ \
 79 | 	STOR(A, (ROUND + 0))		/*  - | | | |    */ \
 80 | 	TRAN(B, E, F)			/*    | | | - x  */ \
 81 | 	LOAD(E, (ROUND + 4))		/*    | | | +    */ \
 82 | 	SHUF(C, A)			/*  + | | | |    */ \
 83 | 	STOR(B, (ROUND + 1))		/*  | - | | |    */ \
 84 | 	TRAN(C, A, F)			/*  -   | | | x  */ \
 85 | 	LOAD(F, (ROUND + 5))		/*      | | | +  */ \
 86 | 	SHUF(D, A)			/*  +   | | | |  */ \
 87 | 	STOR(C, (ROUND + 2))		/*  |   - | | |  */ \
 88 | 	TRAN(D, A, B)			/*  - x   V V V  */
 89 | 
 90 | // Define a macro that terminates a ROUND_3 macro by taking pre-loaded
 91 | // registers D, E and F, and translating, shuffling and storing them.
 92 | #define ROUND_3_END(ROUND, A,B,C,D,E,F)	/*  A B C D E F  */ \
 93 | 	SHUF(E, A)			/*  +     V V V  */ \
 94 | 	STOR(D, (ROUND + 3))		/*  |     - | |  */ \
 95 | 	TRAN(E, A, B)			/*  - x     | |  */ \
 96 | 	SHUF(F, C)			/*      +   | |  */ \
 97 | 	STOR(E, (ROUND + 4))		/*      |   - |  */ \
 98 | 	TRAN(F, C, D)			/*      - x   |  */ \
 99 | 	STOR(F, (ROUND + 5))		/*            -  */
100 | 
101 | // Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
102 | #define ROUND_3_A(ROUND) \
103 | 	ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
104 | 
105 | // Define a type B round. Inputs and outputs are swapped with regard to type A.
106 | #define ROUND_3_B(ROUND) \
107 | 	ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
108 | 
109 | // Terminating macro for a type A round.
110 | #define ROUND_3_A_LAST(ROUND) \
111 | 	ROUND_3_A(ROUND) \
112 | 	ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
113 | 
114 | // Terminating macro for a type B round.
115 | #define ROUND_3_B_LAST(ROUND) \
116 | 	ROUND_3_B(ROUND) \
117 | 	ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
118 | 
119 | // Suppress clang's warning that the literal string in the asm statement is
120 | // overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
121 | // compilers). It may be true, but the goal here is not C99 portability.
122 | #pragma GCC diagnostic push
123 | #pragma GCC diagnostic ignored "-Woverlength-strings"
124 | 
125 | static inline void
126 | enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
127 | {
128 | 	// For a clearer explanation of the algorithm used by this function,
129 | 	// please refer to the plain (not inline assembly) implementation. This
130 | 	// function follows the same basic logic.
131 | 
132 | 	if (*slen < 16) {
133 | 		return;
134 | 	}
135 | 
136 | 	// Process blocks of 12 bytes at a time. Input is read in blocks of 16
137 | 	// bytes, so "reserve" four bytes from the input buffer to ensure that
138 | 	// we never read beyond the end of the input buffer.
139 | 	size_t rounds = (*slen - 4) / 12;
140 | 
141 | 	*slen -= rounds * 12;   // 12 bytes consumed per round
142 | 	*olen += rounds * 16;   // 16 bytes produced per round
143 | 
144 | 	// Number of times to go through the 36x loop.
145 | 	size_t loops = rounds / 36;
146 | 
147 | 	// Number of rounds remaining after the 36x loop.
148 | 	rounds %= 36;
149 | 
150 | 	// Lookup tables.
151 | 	const __m128i lut0 = _mm_set_epi8(
152 | 		10, 11,  9, 10,  7,  8,  6,  7,  4,  5,  3,  4,  1,  2,  0,  1);
153 | 
154 | 	const __m128i lut1 = _mm_setr_epi8(
155 | 		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
156 | 
157 | 	// Temporary registers.
158 | 	__m128i a, b, c, d, e, f;
159 | 
160 | 	__asm__ volatile (
161 | 
162 | 		// If there are 36 rounds or more, enter a 36x unrolled loop of
163 | 		// interleaved encoding rounds. The rounds interleave memory
164 | 		// operations (load/store) with data operations (table lookups,
165 | 		// etc) to maximize pipeline throughput.
166 | 		"    test %[loops], %[loops] \n\t"
167 | 		"    jz   18f                \n\t"
168 | 		"    jmp  36f                \n\t"
169 | 		"                            \n\t"
170 | 		".balign 64                  \n\t"
171 | 		"36: " ROUND_3_INIT()
172 | 		"    " ROUND_3_A( 0)
173 | 		"    " ROUND_3_B( 3)
174 | 		"    " ROUND_3_A( 6)
175 | 		"    " ROUND_3_B( 9)
176 | 		"    " ROUND_3_A(12)
177 | 		"    " ROUND_3_B(15)
178 | 		"    " ROUND_3_A(18)
179 | 		"    " ROUND_3_B(21)
180 | 		"    " ROUND_3_A(24)
181 | 		"    " ROUND_3_B(27)
182 | 		"    " ROUND_3_A_LAST(30)
183 | 		"    add $(12 * 36), %[src] \n\t"
184 | 		"    add $(16 * 36), %[dst] \n\t"
185 | 		"    dec %[loops]           \n\t"
186 | 		"    jnz 36b                \n\t"
187 | 
188 | 		// Enter an 18x unrolled loop for rounds of 18 or more.
189 | 		"18: cmp $18, %[rounds] \n\t"
190 | 		"    jl  9f             \n\t"
191 | 		"    " ROUND_3_INIT()
192 | 		"    " ROUND_3_A(0)
193 | 		"    " ROUND_3_B(3)
194 | 		"    " ROUND_3_A(6)
195 | 		"    " ROUND_3_B(9)
196 | 		"    " ROUND_3_A_LAST(12)
197 | 		"    sub $18,        %[rounds] \n\t"
198 | 		"    add $(12 * 18), %[src]    \n\t"
199 | 		"    add $(16 * 18), %[dst]    \n\t"
200 | 
201 | 		// Enter a 9x unrolled loop for rounds of 9 or more.
202 | 		"9:  cmp $9, %[rounds] \n\t"
203 | 		"    jl  6f            \n\t"
204 | 		"    " ROUND_3_INIT()
205 | 		"    " ROUND_3_A(0)
206 | 		"    " ROUND_3_B_LAST(3)
207 | 		"    sub $9,        %[rounds] \n\t"
208 | 		"    add $(12 * 9), %[src]    \n\t"
209 | 		"    add $(16 * 9), %[dst]    \n\t"
210 | 
211 | 		// Enter a 6x unrolled loop for rounds of 6 or more.
212 | 		"6:  cmp $6, %[rounds] \n\t"
213 | 		"    jl  55f           \n\t"
214 | 		"    " ROUND_3_INIT()
215 | 		"    " ROUND_3_A_LAST(0)
216 | 		"    sub $6,        %[rounds] \n\t"
217 | 		"    add $(12 * 6), %[src]    \n\t"
218 | 		"    add $(16 * 6), %[dst]    \n\t"
219 | 
220 | 		// Dispatch the remaining rounds 0..5.
221 | 		"55: cmp $3, %[rounds] \n\t"
222 | 		"    jg  45f           \n\t"
223 | 		"    je  3f            \n\t"
224 | 		"    cmp $1, %[rounds] \n\t"
225 | 		"    jg  2f            \n\t"
226 | 		"    je  1f            \n\t"
227 | 		"    jmp 0f            \n\t"
228 | 
229 | 		"45: cmp $4, %[rounds] \n\t"
230 | 		"    je  4f            \n\t"
231 | 
232 | 		// Block of non-interlaced encoding rounds, which can each
233 | 		// individually be jumped to. Rounds fall through to the next.
234 | 		"5: " ROUND()
235 | 		"4: " ROUND()
236 | 		"3: " ROUND()
237 | 		"2: " ROUND()
238 | 		"1: " ROUND()
239 | 		"0: \n\t"
240 | 
241 | 		// Outputs (modified).
242 | 		: [rounds] "+r"  (rounds),
243 | 		  [loops]  "+r"  (loops),
244 | 		  [src]    "+r"  (*s),
245 | 		  [dst]    "+r"  (*o),
246 | 		  [a]      "=&x" (a),
247 | 		  [b]      "=&x" (b),
248 | 		  [c]      "=&x" (c),
249 | 		  [d]      "=&x" (d),
250 | 		  [e]      "=&x" (e),
251 | 		  [f]      "=&x" (f)
252 | 
253 | 		// Inputs (not modified).
254 | 		: [lut0] "x" (lut0),
255 | 		  [lut1] "x" (lut1),
256 | 		  [msk0] "x" (_mm_set1_epi32(0x0FC0FC00)),
257 | 		  [msk1] "x" (_mm_set1_epi32(0x04000040)),
258 | 		  [msk2] "x" (_mm_set1_epi32(0x003F03F0)),
259 | 		  [msk3] "x" (_mm_set1_epi32(0x01000010)),
260 | 		  [n51]  "x" (_mm_set1_epi8(51)),
261 | 		  [n25]  "x" (_mm_set1_epi8(25))
262 | 
263 | 		// Clobbers.
264 | 		: "cc", "memory"
265 | 	);
266 | }
267 | 
268 | #pragma GCC diagnostic pop
269 | 


--------------------------------------------------------------------------------
/lib/arch/ssse3/enc_reshuffle.c:
--------------------------------------------------------------------------------
 1 | static BASE64_FORCE_INLINE __m128i
 2 | enc_reshuffle (__m128i in)
 3 | {
 4 | 	// Input, bytes MSB to LSB:
 5 | 	// 0 0 0 0 l k j i h g f e d c b a
 6 | 
 7 | 	in = _mm_shuffle_epi8(in, _mm_set_epi8(
 8 | 		10, 11,  9, 10,
 9 | 		 7,  8,  6,  7,
10 | 		 4,  5,  3,  4,
11 | 		 1,  2,  0,  1));
12 | 	// in, bytes MSB to LSB:
13 | 	// k l j k
14 | 	// h i g h
15 | 	// e f d e
16 | 	// b c a b
17 | 
18 | 	const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0FC0FC00));
19 | 	// bits, upper case are most significant bits, lower case are least significant bits
20 | 	// 0000kkkk LL000000 JJJJJJ00 00000000
21 | 	// 0000hhhh II000000 GGGGGG00 00000000
22 | 	// 0000eeee FF000000 DDDDDD00 00000000
23 | 	// 0000bbbb CC000000 AAAAAA00 00000000
24 | 
25 | 	const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));
26 | 	// 00000000 00kkkkLL 00000000 00JJJJJJ
27 | 	// 00000000 00hhhhII 00000000 00GGGGGG
28 | 	// 00000000 00eeeeFF 00000000 00DDDDDD
29 | 	// 00000000 00bbbbCC 00000000 00AAAAAA
30 | 
31 | 	const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003F03F0));
32 | 	// 00000000 00llllll 000000jj KKKK0000
33 | 	// 00000000 00iiiiii 000000gg HHHH0000
34 | 	// 00000000 00ffffff 000000dd EEEE0000
35 | 	// 00000000 00cccccc 000000aa BBBB0000
36 | 
37 | 	const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));
38 | 	// 00llllll 00000000 00jjKKKK 00000000
39 | 	// 00iiiiii 00000000 00ggHHHH 00000000
40 | 	// 00ffffff 00000000 00ddEEEE 00000000
41 | 	// 00cccccc 00000000 00aaBBBB 00000000
42 | 
43 | 	return _mm_or_si128(t1, t3);
44 | 	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
45 | 	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
46 | 	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
47 | 	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
48 | }
49 | 


--------------------------------------------------------------------------------
/lib/arch/ssse3/enc_translate.c:
--------------------------------------------------------------------------------
 1 | static BASE64_FORCE_INLINE __m128i
 2 | enc_translate (const __m128i in)
 3 | {
 4 | 	// A lookup table containing the absolute offsets for all ranges:
 5 | 	const __m128i lut = _mm_setr_epi8(
 6 | 		 65,  71, -4, -4,
 7 | 		 -4,  -4, -4, -4,
 8 | 		 -4,  -4, -4, -4,
 9 | 		-19, -16,  0,  0
10 | 	);
11 | 
12 | 	// Translate values 0..63 to the Base64 alphabet. There are five sets:
13 | 	// #  From      To         Abs    Index  Characters
14 | 	// 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
15 | 	// 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
16 | 	// 2  [52..61]  [48..57]    -4  [2..11]  0123456789
17 | 	// 3  [62]      [43]       -19       12  +
18 | 	// 4  [63]      [47]       -16       13  /
19 | 
20 | 	// Create LUT indices from the input. The index for range #0 is right,
21 | 	// others are 1 less than expected:
22 | 	__m128i indices = _mm_subs_epu8(in, _mm_set1_epi8(51));
23 | 
24 | 	// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
25 | 	__m128i mask = _mm_cmpgt_epi8(in, _mm_set1_epi8(25));
26 | 
27 | 	// Subtract -1, so add 1 to indices for range #[1..4]. All indices are
28 | 	// now correct:
29 | 	indices = _mm_sub_epi8(indices, mask);
30 | 
31 | 	// Add offsets to input values:
32 | 	return _mm_add_epi8(in, _mm_shuffle_epi8(lut, indices));
33 | }
34 | 


--------------------------------------------------------------------------------
/lib/codec_choose.c:
--------------------------------------------------------------------------------
  1 | #include <stdbool.h>
  2 | #include <stdint.h>
  3 | #include <stddef.h>
  4 | #include <stdint.h>
  5 | #include <stdio.h>
  6 | 
  7 | #include "../include/libbase64.h"
  8 | #include "codecs.h"
  9 | #include "config.h"
 10 | #include "env.h"
 11 | 
 12 | #if (__x86_64__ || __i386__ || _M_X86 || _M_X64)
 13 |   #define BASE64_X86
 14 |   #if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2 || HAVE_AVX512)
 15 |     #define BASE64_X86_SIMD
 16 |   #endif
 17 | #endif
 18 | 
 19 | #ifdef BASE64_X86
 20 | #ifdef _MSC_VER
 21 | 	#include <intrin.h>
 22 | 	#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
 23 | 	{						\
 24 | 		int info[4];				\
 25 | 		__cpuidex(info, __level, __count);	\
 26 | 		__eax = info[0];			\
 27 | 		__ebx = info[1];			\
 28 | 		__ecx = info[2];			\
 29 | 		__edx = info[3];			\
 30 | 	}
 31 | 	#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
 32 | 		__cpuid_count(__level, 0, __eax, __ebx, __ecx, __edx)
 33 | #else
 34 | 	#include <cpuid.h>
 35 | 	#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
 36 | 		#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3))
 37 | 			static inline uint64_t _xgetbv (uint32_t index)
 38 | 			{
 39 | 				uint32_t eax, edx;
 40 | 				__asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
 41 | 				return ((uint64_t)edx << 32) | eax;
 42 | 			}
 43 | 		#else
 44 | 			#error "Platform not supported"
 45 | 		#endif
 46 | 	#endif
 47 | #endif
 48 | 
 49 | #ifndef bit_AVX512vl
 50 | #define bit_AVX512vl (1 << 31)
 51 | #endif
 52 | #ifndef bit_AVX512vbmi
 53 | #define bit_AVX512vbmi (1 << 1)
 54 | #endif
 55 | #ifndef bit_AVX2
 56 | #define bit_AVX2 (1 << 5)
 57 | #endif
 58 | #ifndef bit_SSSE3
 59 | #define bit_SSSE3 (1 << 9)
 60 | #endif
 61 | #ifndef bit_SSE41
 62 | #define bit_SSE41 (1 << 19)
 63 | #endif
 64 | #ifndef bit_SSE42
 65 | #define bit_SSE42 (1 << 20)
 66 | #endif
 67 | #ifndef bit_AVX
 68 | #define bit_AVX (1 << 28)
 69 | #endif
 70 | 
 71 | #define bit_XSAVE_XRSTORE (1 << 27)
 72 | 
 73 | #ifndef _XCR_XFEATURE_ENABLED_MASK
 74 | #define _XCR_XFEATURE_ENABLED_MASK 0
 75 | #endif
 76 | 
 77 | #define bit_XMM      (1 << 1)
 78 | #define bit_YMM      (1 << 2)
 79 | #define bit_OPMASK   (1 << 5)
 80 | #define bit_ZMM      (1 << 6)
 81 | #define bit_HIGH_ZMM (1 << 7)
 82 | 
 83 | #define _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS (bit_XMM | bit_YMM)
 84 | 
 85 | #define _AVX_512_ENABLED_BY_OS (bit_XMM | bit_YMM | bit_OPMASK | bit_ZMM | bit_HIGH_ZMM)
 86 | 
 87 | #endif
 88 | 
 89 | // Function declarations:
 90 | #define BASE64_CODEC_FUNCS(arch)					\
 91 | 	extern void base64_stream_encode_ ## arch BASE64_ENC_PARAMS;	\
 92 | 	extern int  base64_stream_decode_ ## arch BASE64_DEC_PARAMS;
 93 | 
 94 | BASE64_CODEC_FUNCS(avx512)
 95 | BASE64_CODEC_FUNCS(avx2)
 96 | BASE64_CODEC_FUNCS(neon32)
 97 | BASE64_CODEC_FUNCS(neon64)
 98 | BASE64_CODEC_FUNCS(plain)
 99 | BASE64_CODEC_FUNCS(ssse3)
100 | BASE64_CODEC_FUNCS(sse41)
101 | BASE64_CODEC_FUNCS(sse42)
102 | BASE64_CODEC_FUNCS(avx)
103 | 
104 | static bool
105 | codec_choose_forced (struct codec *codec, int flags)
106 | {
107 | 	// If the user wants to use a certain codec,
108 | 	// always allow it, even if the codec is a no-op.
109 | 	// For testing purposes.
110 | 
111 | 	if (!(flags & 0xFFFF)) {
112 | 		return false;
113 | 	}
114 | 
115 | 	if (flags & BASE64_FORCE_AVX2) {
116 | 		codec->enc = base64_stream_encode_avx2;
117 | 		codec->dec = base64_stream_decode_avx2;
118 | 		return true;
119 | 	}
120 | 	if (flags & BASE64_FORCE_NEON32) {
121 | 		codec->enc = base64_stream_encode_neon32;
122 | 		codec->dec = base64_stream_decode_neon32;
123 | 		return true;
124 | 	}
125 | 	if (flags & BASE64_FORCE_NEON64) {
126 | 		codec->enc = base64_stream_encode_neon64;
127 | 		codec->dec = base64_stream_decode_neon64;
128 | 		return true;
129 | 	}
130 | 	if (flags & BASE64_FORCE_PLAIN) {
131 | 		codec->enc = base64_stream_encode_plain;
132 | 		codec->dec = base64_stream_decode_plain;
133 | 		return true;
134 | 	}
135 | 	if (flags & BASE64_FORCE_SSSE3) {
136 | 		codec->enc = base64_stream_encode_ssse3;
137 | 		codec->dec = base64_stream_decode_ssse3;
138 | 		return true;
139 | 	}
140 | 	if (flags & BASE64_FORCE_SSE41) {
141 | 		codec->enc = base64_stream_encode_sse41;
142 | 		codec->dec = base64_stream_decode_sse41;
143 | 		return true;
144 | 	}
145 | 	if (flags & BASE64_FORCE_SSE42) {
146 | 		codec->enc = base64_stream_encode_sse42;
147 | 		codec->dec = base64_stream_decode_sse42;
148 | 		return true;
149 | 	}
150 | 	if (flags & BASE64_FORCE_AVX) {
151 | 		codec->enc = base64_stream_encode_avx;
152 | 		codec->dec = base64_stream_decode_avx;
153 | 		return true;
154 | 	}
155 | 	if (flags & BASE64_FORCE_AVX512) {
156 | 		codec->enc = base64_stream_encode_avx512;
157 | 		codec->dec = base64_stream_decode_avx512;
158 | 		return true;
159 | 	}
160 | 	return false;
161 | }
162 | 
163 | static bool
164 | codec_choose_arm (struct codec *codec)
165 | {
166 | #if HAVE_NEON64 || ((defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32)
167 | 
168 | 	// Unfortunately there is no portable way to check for NEON
169 | 	// support at runtime from userland in the same way that x86
170 | 	// has cpuid, so just stick to the compile-time configuration:
171 | 
172 | 	#if HAVE_NEON64
173 | 	codec->enc = base64_stream_encode_neon64;
174 | 	codec->dec = base64_stream_decode_neon64;
175 | 	#else
176 | 	codec->enc = base64_stream_encode_neon32;
177 | 	codec->dec = base64_stream_decode_neon32;
178 | 	#endif
179 | 
180 | 	return true;
181 | 
182 | #else
183 | 	(void)codec;
184 | 	return false;
185 | #endif
186 | }
187 | 
188 | static bool
189 | codec_choose_x86 (struct codec *codec)
190 | {
191 | #ifdef BASE64_X86_SIMD
192 | 
193 | 	unsigned int eax, ebx = 0, ecx = 0, edx;
194 | 	unsigned int max_level;
195 | 
196 | 	#ifdef _MSC_VER
197 | 	int info[4];
198 | 	__cpuidex(info, 0, 0);
199 | 	max_level = info[0];
200 | 	#else
201 | 	max_level = __get_cpuid_max(0, NULL);
202 | 	#endif
203 | 
204 | 	#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
205 | 	// Check for AVX/AVX2/AVX512 support:
206 | 	// Checking for AVX requires 3 things:
207 | 	// 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions
208 | 	//    (allowing saving YMM registers on context switch)
209 | 	// 2) CPUID indicates support for AVX
210 | 	// 3) XGETBV indicates the AVX registers will be saved and restored on
211 | 	//    context switch
212 | 	//
213 | 	// Note that XGETBV is only available on 686 or later CPUs, so the
214 | 	// instruction needs to be conditionally run.
215 | 	if (max_level >= 1) {
216 | 		__cpuid_count(1, 0, eax, ebx, ecx, edx);
217 | 		if (ecx & bit_XSAVE_XRSTORE) {
218 | 			uint64_t xcr_mask;
219 | 			xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
220 | 			if ((xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) == _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) { // check multiple bits at once
221 | 				#if HAVE_AVX512
222 | 				if (max_level >= 7 && ((xcr_mask & _AVX_512_ENABLED_BY_OS) == _AVX_512_ENABLED_BY_OS)) {
223 | 					__cpuid_count(7, 0, eax, ebx, ecx, edx);
224 | 					if ((ebx & bit_AVX512vl) && (ecx & bit_AVX512vbmi)) {
225 | 						codec->enc = base64_stream_encode_avx512;
226 | 						codec->dec = base64_stream_decode_avx512;
227 | 						return true;
228 | 					}
229 | 				}
230 | 				#endif
231 | 				#if HAVE_AVX2
232 | 				if (max_level >= 7) {
233 | 					__cpuid_count(7, 0, eax, ebx, ecx, edx);
234 | 					if (ebx & bit_AVX2) {
235 | 						codec->enc = base64_stream_encode_avx2;
236 | 						codec->dec = base64_stream_decode_avx2;
237 | 						return true;
238 | 					}
239 | 				}
240 | 				#endif
241 | 				#if HAVE_AVX
242 | 				__cpuid_count(1, 0, eax, ebx, ecx, edx);
243 | 				if (ecx & bit_AVX) {
244 | 					codec->enc = base64_stream_encode_avx;
245 | 					codec->dec = base64_stream_decode_avx;
246 | 					return true;
247 | 				}
248 | 				#endif
249 | 			}
250 | 		}
251 | 	}
252 | 	#endif
253 | 
254 | 	#if HAVE_SSE42
255 | 	// Check for SSE42 support:
256 | 	if (max_level >= 1) {
257 | 		__cpuid(1, eax, ebx, ecx, edx);
258 | 		if (ecx & bit_SSE42) {
259 | 			codec->enc = base64_stream_encode_sse42;
260 | 			codec->dec = base64_stream_decode_sse42;
261 | 			return true;
262 | 		}
263 | 	}
264 | 	#endif
265 | 
266 | 	#if HAVE_SSE41
267 | 	// Check for SSE41 support:
268 | 	if (max_level >= 1) {
269 | 		__cpuid(1, eax, ebx, ecx, edx);
270 | 		if (ecx & bit_SSE41) {
271 | 			codec->enc = base64_stream_encode_sse41;
272 | 			codec->dec = base64_stream_decode_sse41;
273 | 			return true;
274 | 		}
275 | 	}
276 | 	#endif
277 | 
278 | 	#if HAVE_SSSE3
279 | 	// Check for SSSE3 support:
280 | 	if (max_level >= 1) {
281 | 		__cpuid(1, eax, ebx, ecx, edx);
282 | 		if (ecx & bit_SSSE3) {
283 | 			codec->enc = base64_stream_encode_ssse3;
284 | 			codec->dec = base64_stream_decode_ssse3;
285 | 			return true;
286 | 		}
287 | 	}
288 | 	#endif
289 | 
290 | #else
291 | 	(void)codec;
292 | #endif
293 | 
294 | 	return false;
295 | }
296 | 
297 | void
298 | codec_choose (struct codec *codec, int flags)
299 | {
300 | 	// User forced a codec:
301 | 	if (codec_choose_forced(codec, flags)) {
302 | 		return;
303 | 	}
304 | 
305 | 	// Runtime feature detection:
306 | 	if (codec_choose_arm(codec)) {
307 | 		return;
308 | 	}
309 | 	if (codec_choose_x86(codec)) {
310 | 		return;
311 | 	}
312 | 	codec->enc = base64_stream_encode_plain;
313 | 	codec->dec = base64_stream_decode_plain;
314 | }
315 | 


--------------------------------------------------------------------------------
/lib/codecs.h:
--------------------------------------------------------------------------------
 1 | #include "../include/libbase64.h"
 2 | 
 3 | // Function parameters for encoding functions:
 4 | #define BASE64_ENC_PARAMS			\
 5 | 	( struct base64_state	*state		\
 6 | 	, const char		*src		\
 7 | 	, size_t		 srclen		\
 8 | 	, char			*out		\
 9 | 	, size_t		*outlen		\
10 | 	)
11 | 
12 | // Function parameters for decoding functions:
13 | #define BASE64_DEC_PARAMS			\
14 | 	( struct base64_state	*state		\
15 | 	, const char		*src		\
16 | 	, size_t		 srclen		\
17 | 	, char			*out		\
18 | 	, size_t		*outlen		\
19 | 	)
20 | 
21 | // This function is used as a stub when a certain encoder is not compiled in.
22 | // It discards the inputs and returns zero output bytes.
23 | static inline void
24 | base64_enc_stub BASE64_ENC_PARAMS
25 | {
26 | 	(void) state;
27 | 	(void) src;
28 | 	(void) srclen;
29 | 	(void) out;
30 | 
31 | 	*outlen = 0;
32 | }
33 | 
34 | // This function is used as a stub when a certain decoder is not compiled in.
35 | // It discards the inputs and returns an invalid decoding result.
36 | static inline int
37 | base64_dec_stub BASE64_DEC_PARAMS
38 | {
39 | 	(void) state;
40 | 	(void) src;
41 | 	(void) srclen;
42 | 	(void) out;
43 | 	(void) outlen;
44 | 
45 | 	return -1;
46 | }
47 | 
48 | typedef void (* base64_enc_fn) BASE64_ENC_PARAMS;
49 | typedef int  (* base64_dec_fn) BASE64_DEC_PARAMS;
50 | 
51 | struct codec
52 | {
53 | 	base64_enc_fn enc;
54 | 	base64_dec_fn dec;
55 | };
56 | 
57 | extern void codec_choose (struct codec *, int flags);
58 | 


--------------------------------------------------------------------------------
/lib/env.h:
--------------------------------------------------------------------------------
 1 | #ifndef BASE64_ENV_H
 2 | #define BASE64_ENV_H
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | // This header file contains macro definitions that describe certain aspects of
 7 | // the compile-time environment. Compatibility and portability macros go here.
 8 | 
 9 | // Define machine endianness. This is for GCC:
10 | #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
11 | #  define BASE64_LITTLE_ENDIAN 1
12 | #else
13 | #  define BASE64_LITTLE_ENDIAN 0
14 | #endif
15 | 
16 | // This is for Clang:
17 | #ifdef __LITTLE_ENDIAN__
18 | #  define BASE64_LITTLE_ENDIAN 1
19 | #endif
20 | 
21 | #ifdef __BIG_ENDIAN__
22 | #  define BASE64_LITTLE_ENDIAN 0
23 | #endif
24 | 
25 | // MSVC++ needs intrin.h for _byteswap_uint64 (issue #68):
26 | #if BASE64_LITTLE_ENDIAN && defined(_MSC_VER)
27 | #  include <intrin.h>
28 | #endif
29 | 
30 | // Endian conversion functions:
31 | #if BASE64_LITTLE_ENDIAN
32 | #  ifdef _MSC_VER
33 | //   Microsoft Visual C++:
34 | #    define BASE64_HTOBE32(x)	_byteswap_ulong(x)
35 | #    define BASE64_HTOBE64(x)	_byteswap_uint64(x)
36 | #  else
37 | //   GCC and Clang:
38 | #    define BASE64_HTOBE32(x)	__builtin_bswap32(x)
39 | #    define BASE64_HTOBE64(x)	__builtin_bswap64(x)
40 | #  endif
41 | #else
42 | // No conversion needed:
43 | #  define BASE64_HTOBE32(x)	(x)
44 | #  define BASE64_HTOBE64(x)	(x)
45 | #endif
46 | 
47 | // Detect word size:
48 | #if defined (__x86_64__)
49 | // This also works for the x32 ABI, which has a 64-bit word size.
50 | #  define BASE64_WORDSIZE 64
51 | #elif SIZE_MAX == UINT32_MAX
52 | #  define BASE64_WORDSIZE 32
53 | #elif SIZE_MAX == UINT64_MAX
54 | #  define BASE64_WORDSIZE 64
55 | #else
56 | #  error BASE64_WORDSIZE_NOT_DEFINED
57 | #endif
58 | 
59 | // End-of-file definitions.
60 | // Almost end-of-file when waiting for the last '=' character:
61 | #define BASE64_AEOF 1
62 | // End-of-file when stream end has been reached or invalid input provided:
63 | #define BASE64_EOF 2
64 | 
65 | // GCC 7 defaults to issuing a warning for fallthrough in switch statements,
66 | // unless the fallthrough cases are marked with an attribute. As we use
67 | // fallthrough deliberately, define an alias for the attribute:
68 | #if __GNUC__ >= 7
69 | #  define BASE64_FALLTHROUGH  __attribute__((fallthrough));
70 | #else
71 | #  define BASE64_FALLTHROUGH
72 | #endif
73 | 
74 | // Declare macros to ensure that functions that are intended to be inlined, are
75 | // actually inlined, even when no optimization is applied. A lot of inner loop
76 | // code is factored into separate functions for reasons of readability, but
77 | // that code should always be inlined (and optimized) in the main loop.
78 | #ifdef _MSC_VER
79 | #  define BASE64_FORCE_INLINE	__forceinline
80 | #else
81 | #  define BASE64_FORCE_INLINE  inline __attribute__((always_inline))
82 | #endif
83 | 
84 | #endif	// BASE64_ENV_H
85 | 


--------------------------------------------------------------------------------
/lib/exports.txt:
--------------------------------------------------------------------------------
1 | base64_encode
2 | base64_stream_encode
3 | base64_stream_encode_init
4 | base64_stream_encode_final
5 | base64_decode
6 | base64_stream_decode
7 | base64_stream_decode_init
8 | 


--------------------------------------------------------------------------------
/lib/lib.c:
--------------------------------------------------------------------------------
  1 | #include <stdint.h>
  2 | #include <stddef.h>
  3 | #ifdef _OPENMP
  4 | #include <omp.h>
  5 | #endif
  6 | 
  7 | #include "../include/libbase64.h"
  8 | #include "tables/tables.h"
  9 | #include "codecs.h"
 10 | #include "env.h"
 11 | 
 12 | // These static function pointers are initialized once when the library is
 13 | // first used, and remain in use for the remaining lifetime of the program.
 14 | // The idea being that CPU features don't change at runtime.
 15 | static struct codec codec = { NULL, NULL };
 16 | 
 17 | void
 18 | base64_stream_encode_init (struct base64_state *state, int flags)
 19 | {
 20 | 	// If any of the codec flags are set, redo choice:
 21 | 	if (codec.enc == NULL || flags & 0xFF) {
 22 | 		codec_choose(&codec, flags);
 23 | 	}
 24 | 	state->eof = 0;
 25 | 	state->bytes = 0;
 26 | 	state->carry = 0;
 27 | 	state->flags = flags;
 28 | }
 29 | 
 30 | void
 31 | base64_stream_encode
 32 | 	( struct base64_state	*state
 33 | 	, const char		*src
 34 | 	, size_t		 srclen
 35 | 	, char			*out
 36 | 	, size_t		*outlen
 37 | 	)
 38 | {
 39 | 	codec.enc(state, src, srclen, out, outlen);
 40 | }
 41 | 
 42 | void
 43 | base64_stream_encode_final
 44 | 	( struct base64_state	*state
 45 | 	, char			*out
 46 | 	, size_t		*outlen
 47 | 	)
 48 | {
 49 | 	uint8_t *o = (uint8_t *)out;
 50 | 
 51 | 	if (state->bytes == 1) {
 52 | 		*o++ = base64_table_enc_6bit[state->carry];
 53 | 		*o++ = '=';
 54 | 		*o++ = '=';
 55 | 		*outlen = 3;
 56 | 		return;
 57 | 	}
 58 | 	if (state->bytes == 2) {
 59 | 		*o++ = base64_table_enc_6bit[state->carry];
 60 | 		*o++ = '=';
 61 | 		*outlen = 2;
 62 | 		return;
 63 | 	}
 64 | 	*outlen = 0;
 65 | }
 66 | 
 67 | void
 68 | base64_stream_decode_init (struct base64_state *state, int flags)
 69 | {
 70 | 	// If any of the codec flags are set, redo choice:
 71 | 	if (codec.dec == NULL || flags & 0xFFFF) {
 72 | 		codec_choose(&codec, flags);
 73 | 	}
 74 | 	state->eof = 0;
 75 | 	state->bytes = 0;
 76 | 	state->carry = 0;
 77 | 	state->flags = flags;
 78 | }
 79 | 
 80 | int
 81 | base64_stream_decode
 82 | 	( struct base64_state	*state
 83 | 	, const char		*src
 84 | 	, size_t		 srclen
 85 | 	, char			*out
 86 | 	, size_t		*outlen
 87 | 	)
 88 | {
 89 | 	return codec.dec(state, src, srclen, out, outlen);
 90 | }
 91 | 
 92 | #ifdef _OPENMP
 93 | 
 94 | 	// Due to the overhead of initializing OpenMP and creating a team of
 95 | 	// threads, we require the data length to be larger than a threshold:
 96 | 	#define OMP_THRESHOLD 20000
 97 | 
 98 | 	// Conditionally include OpenMP-accelerated codec implementations:
 99 | 	#include "lib_openmp.c"
100 | #endif
101 | 
102 | void
103 | base64_encode
104 | 	( const char	*src
105 | 	, size_t	 srclen
106 | 	, char		*out
107 | 	, size_t	*outlen
108 | 	, int		 flags
109 | 	)
110 | {
111 | 	size_t s;
112 | 	size_t t;
113 | 	struct base64_state state;
114 | 
115 | 	#ifdef _OPENMP
116 | 	if (srclen >= OMP_THRESHOLD) {
117 | 		base64_encode_openmp(src, srclen, out, outlen, flags);
118 | 		return;
119 | 	}
120 | 	#endif
121 | 
122 | 	// Init the stream reader:
123 | 	base64_stream_encode_init(&state, flags);
124 | 
125 | 	// Feed the whole string to the stream reader:
126 | 	base64_stream_encode(&state, src, srclen, out, &s);
127 | 
128 | 	// Finalize the stream by writing trailer if any:
129 | 	base64_stream_encode_final(&state, out + s, &t);
130 | 
131 | 	// Final output length is stream length plus tail:
132 | 	*outlen = s + t;
133 | }
134 | 
135 | int
136 | base64_decode
137 | 	( const char	*src
138 | 	, size_t	 srclen
139 | 	, char		*out
140 | 	, size_t	*outlen
141 | 	, int		 flags
142 | 	)
143 | {
144 | 	int ret;
145 | 	struct base64_state state;
146 | 
147 | 	#ifdef _OPENMP
148 | 	if (srclen >= OMP_THRESHOLD) {
149 | 		return base64_decode_openmp(src, srclen, out, outlen, flags);
150 | 	}
151 | 	#endif
152 | 
153 | 	// Init the stream reader:
154 | 	base64_stream_decode_init(&state, flags);
155 | 
156 | 	// Feed the whole string to the stream reader:
157 | 	ret = base64_stream_decode(&state, src, srclen, out, outlen);
158 | 
159 | 	// If when decoding a whole block, we're still waiting for input then fail:
160 | 	if (ret && (state.bytes == 0)) {
161 | 		return ret;
162 | 	}
163 | 	return 0;
164 | }
165 | 


--------------------------------------------------------------------------------
/lib/lib_openmp.c:
--------------------------------------------------------------------------------
  1 | // This code makes some assumptions on the implementation of
  2 | // base64_stream_encode_init(), base64_stream_encode() and base64_stream_decode().
  3 | // Basically these assumptions boil down to that when breaking the src into
  4 | // parts, out parts can be written without side effects.
  5 | // This is met when:
  6 | // 1) base64_stream_encode() and base64_stream_decode() don't use globals;
  7 | // 2) the shared variables src and out are not read or written outside of the
  8 | //    bounds of their parts, i.e.  when base64_stream_encode() reads a multiple
  9 | //    of 3 bytes, it must write no more then a multiple of 4 bytes, not even
 10 | //    temporarily;
 11 | // 3) the state flag can be discarded after base64_stream_encode() and
 12 | //    base64_stream_decode() on the parts.
 13 | 
 14 | static inline void
 15 | base64_encode_openmp
 16 | 	( const char	*src
 17 | 	, size_t	 srclen
 18 | 	, char		*out
 19 | 	, size_t	*outlen
 20 | 	, int		 flags
 21 | 	)
 22 | {
 23 | 	size_t s;
 24 | 	size_t t;
 25 | 	size_t sum = 0, len, last_len;
 26 | 	struct base64_state state, initial_state;
 27 | 	int num_threads, i;
 28 | 
 29 | 	// Request a number of threads but not necessarily get them:
 30 | 	#pragma omp parallel
 31 | 	{
 32 | 		// Get the number of threads used from one thread only,
 33 | 		// as num_threads is a shared var:
 34 | 		#pragma omp single
 35 | 		{
 36 | 			num_threads = omp_get_num_threads();
 37 | 
 38 | 			// Split the input string into num_threads parts, each
 39 | 			// part a multiple of 3 bytes. The remaining bytes will
 40 | 			// be done later:
 41 | 			len = srclen / (num_threads * 3);
 42 | 			len *= 3;
 43 | 			last_len = srclen - num_threads * len;
 44 | 
 45 | 			// Init the stream reader:
 46 | 			base64_stream_encode_init(&state, flags);
 47 | 			initial_state = state;
 48 | 		}
 49 | 
 50 | 		// Single has an implicit barrier for all threads to wait here
 51 | 		// for the above to complete:
 52 | 		#pragma omp for firstprivate(state) private(s) reduction(+:sum) schedule(static,1)
 53 | 		for (i = 0; i < num_threads; i++)
 54 | 		{
 55 | 			// Feed each part of the string to the stream reader:
 56 | 			base64_stream_encode(&state, src + i * len, len, out + i * len * 4 / 3, &s);
 57 | 			sum += s;
 58 | 		}
 59 | 	}
 60 | 
 61 | 	// As encoding should never fail and we encode an exact multiple
 62 | 	// of 3 bytes, we can discard state:
 63 | 	state = initial_state;
 64 | 
 65 | 	// Encode the remaining bytes:
 66 | 	base64_stream_encode(&state, src + num_threads * len, last_len, out + num_threads * len * 4 / 3, &s);
 67 | 
 68 | 	// Finalize the stream by writing trailer if any:
 69 | 	base64_stream_encode_final(&state, out + num_threads * len * 4 / 3 + s, &t);
 70 | 
 71 | 	// Final output length is stream length plus tail:
 72 | 	sum += s + t;
 73 | 	*outlen = sum;
 74 | }
 75 | 
 76 | static inline int
 77 | base64_decode_openmp
 78 | 	( const char	*src
 79 | 	, size_t	 srclen
 80 | 	, char		*out
 81 | 	, size_t	*outlen
 82 | 	, int		 flags
 83 | 	)
 84 | {
 85 | 	int num_threads, result = 0, i;
 86 | 	size_t sum = 0, len, last_len, s;
 87 | 	struct base64_state state, initial_state;
 88 | 
 89 | 	// Request a number of threads but not necessarily get them:
 90 | 	#pragma omp parallel
 91 | 	{
 92 | 		// Get the number of threads used from one thread only,
 93 | 		// as num_threads is a shared var:
 94 | 		#pragma omp single
 95 | 		{
 96 | 			num_threads = omp_get_num_threads();
 97 | 
 98 | 			// Split the input string into num_threads parts, each
 99 | 			// part a multiple of 4 bytes. The remaining bytes will
100 | 			// be done later:
101 | 			len = srclen / (num_threads * 4);
102 | 			len *= 4;
103 | 			last_len = srclen - num_threads * len;
104 | 
105 | 			// Init the stream reader:
106 | 			base64_stream_decode_init(&state, flags);
107 | 
108 | 			initial_state = state;
109 | 		}
110 | 
111 | 		// Single has an implicit barrier to wait here for the above to
112 | 		// complete:
113 | 		#pragma omp for firstprivate(state) private(s) reduction(+:sum, result) schedule(static,1)
114 | 		for (i = 0; i < num_threads; i++)
115 | 		{
116 | 			int this_result;
117 | 
118 | 			// Feed each part of the string to the stream reader:
119 | 			this_result = base64_stream_decode(&state, src + i * len, len, out + i * len * 3 / 4, &s);
120 | 			sum += s;
121 | 			result += this_result;
122 | 		}
123 | 	}
124 | 
125 | 	// If `result' equals `-num_threads', then all threads returned -1,
126 | 	// indicating that the requested codec is not available:
127 | 	if (result == -num_threads) {
128 | 		return -1;
129 | 	}
130 | 
131 | 	// If `result' does not equal `num_threads', then at least one of the
132 | 	// threads hit a decode error:
133 | 	if (result != num_threads) {
134 | 		return 0;
135 | 	}
136 | 
137 | 	// So far so good, now decode whatever remains in the buffer. Reuse the
138 | 	// initial state, since we are at a 4-byte boundary:
139 | 	state = initial_state;
140 | 	result = base64_stream_decode(&state, src + num_threads * len, last_len, out + num_threads * len * 3 / 4, &s);
141 | 	sum += s;
142 | 	*outlen = sum;
143 | 
144 | 	// If when decoding a whole block, we're still waiting for input then fail:
145 | 	if (result && (state.bytes == 0)) {
146 | 		return result;
147 | 	}
148 | 	return 0;
149 | }
150 | 


--------------------------------------------------------------------------------
/lib/tables/.gitignore:
--------------------------------------------------------------------------------
1 | table_generator
2 | 


--------------------------------------------------------------------------------
/lib/tables/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: all clean
 2 | 
 3 | TARGETS := table_dec_32bit.h table_enc_12bit.h table_generator
 4 | 
 5 | all: $(TARGETS)
 6 | 
 7 | clean:
 8 | 	$(RM) $(TARGETS)
 9 | 
10 | table_dec_32bit.h: table_generator
11 | 	./$^ > $@
12 | 
13 | table_enc_12bit.h: table_enc_12bit.py
14 | 	./$^ > $@
15 | 
16 | table_generator: table_generator.c
17 | 	$(CC) $(CFLAGS) -o $@ $^
18 | 


--------------------------------------------------------------------------------
/lib/tables/table_enc_12bit.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | 
 3 | def tr(x):
 4 |     """Translate a 6-bit value to the Base64 alphabet."""
 5 |     s = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' \
 6 |       + 'abcdefghijklmnopqrstuvwxyz' \
 7 |       + '0123456789' \
 8 |       + '+/'
 9 |     return ord(s[x])
10 | 
11 | def table(fn):
12 |     """Generate a 12-bit lookup table."""
13 |     ret = []
14 |     for n in range(0, 2**12):
15 |         pre = "\n\t" if n % 8 == 0 else " "
16 |         pre = "\t" if n == 0 else pre
17 |         ret.append("{}0x{:04X}U,".format(pre, fn(n)))
18 |     return "".join(ret)
19 | 
20 | def table_be():
21 |     """Generate a 12-bit big-endian lookup table."""
22 |     return table(lambda n: (tr(n & 0x3F) << 0) | (tr(n >> 6) << 8))
23 | 
24 | def table_le():
25 |     """Generate a 12-bit little-endian lookup table."""
26 |     return table(lambda n: (tr(n >> 6) << 0) | (tr(n & 0x3F) << 8))
27 | 
28 | def main():
29 |     """Entry point."""
30 |     lines = [
31 |         "#include <stdint.h>",
32 |         "",
33 |         "const uint16_t base64_table_enc_12bit[] = {",
34 |         "#if BASE64_LITTLE_ENDIAN",
35 |         table_le(),
36 |         "#else",
37 |         table_be(),
38 |         "#endif",
39 |         "};"
40 |     ]
41 |     for line in lines:
42 |         print(line)
43 | 
44 | if __name__ == "__main__":
45 |     main()
46 | 


--------------------------------------------------------------------------------
/lib/tables/table_generator.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  *
  3 |  * Copyright 2005, 2006 Nick Galbreath -- nickg [at] modp [dot] com
  4 |  * Copyright 2017 Matthieu Darbois
  5 |  * All rights reserved.
  6 |  *
  7 |  * http://modp.com/release/base64
  8 |  *
  9 |  * Redistribution and use in source and binary forms, with or without
 10 |  * modification, are permitted provided that the following conditions are
 11 |  * met:
 12 |  *
 13 |  * - Redistributions of source code must retain the above copyright notice,
 14 |  *  this list of conditions and the following disclaimer.
 15 |  *
 16 |  * - Redistributions in binary form must reproduce the above copyright
 17 |  *   notice, this list of conditions and the following disclaimer in the
 18 |  *   documentation and/or other materials provided with the distribution.
 19 |  *
 20 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 21 |  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 22 |  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 23 |  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 24 |  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 25 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 26 |  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 27 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 28 |  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 29 |  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 30 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 31 |  *
 32 |  */
 33 | 
 34 | /****************************/
 35 | 
 36 | #include <stdint.h>
 37 | #include <stdio.h>
 38 | #include <stdlib.h>
 39 | #include <string.h>
 40 | #include <inttypes.h>
 41 | 
 42 | static uint8_t b64chars[64] = {
 43 | 	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 44 | 	'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 45 | 	'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 46 | 	'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 47 | 	'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 48 | };
 49 | 
 50 | static uint8_t padchar = '=';
 51 | 
 52 | static void printStart(void)
 53 | {
 54 | 	printf("#include <stdint.h>\n");
 55 | 	printf("#define CHAR62 '%c'\n", b64chars[62]);
 56 | 	printf("#define CHAR63 '%c'\n", b64chars[63]);
 57 | 	printf("#define CHARPAD '%c'\n", padchar);
 58 | }
 59 | 
 60 | static void clearDecodeTable(uint32_t* ary)
 61 | {
 62 | 	int i = 0;
 63 | 	for (i = 0; i < 256; ++i) {
 64 | 		ary[i] = 0xFFFFFFFF;
 65 | 	}
 66 | }
 67 | 
 68 | /* dump uint32_t as hex digits */
 69 | void uint32_array_to_c_hex(const uint32_t* ary, size_t sz, const char* name)
 70 | {
 71 | 	size_t i = 0;
 72 | 
 73 | 	printf("const uint32_t %s[%d] = {\n", name, (int)sz);
 74 | 	for (;;) {
 75 | 		printf("0x%08" PRIx32, ary[i]);
 76 | 		++i;
 77 | 		if (i == sz)
 78 | 			break;
 79 | 		if (i % 6 == 0) {
 80 | 			printf(",\n");
 81 | 		} else {
 82 | 			printf(", ");
 83 | 		}
 84 | 	}
 85 | 	printf("\n};\n");
 86 | }
 87 | 
 88 | int main(int argc, char** argv)
 89 | {
 90 | 	uint32_t x;
 91 | 	uint32_t i = 0;
 92 | 	uint32_t ary[256];
 93 | 
 94 | 	/*  over-ride standard alphabet */
 95 | 	if (argc == 2) {
 96 | 		uint8_t* replacements = (uint8_t*)argv[1];
 97 | 		if (strlen((char*)replacements) != 3) {
 98 | 			fprintf(stderr, "input must be a string of 3 characters '-', '.' or '_'\n");
 99 | 			exit(1);
100 | 		}
101 | 		fprintf(stderr, "fusing '%s' as replacements in base64 encoding\n", replacements);
102 | 		b64chars[62] = replacements[0];
103 | 		b64chars[63] = replacements[1];
104 | 		padchar = replacements[2];
105 | 	}
106 | 
107 | 	printStart();
108 | 
109 | 	printf("\n\n#if BASE64_LITTLE_ENDIAN\n");
110 | 
111 | 	printf("\n\n/* SPECIAL DECODE TABLES FOR LITTLE ENDIAN (INTEL) CPUS */\n\n");
112 | 
113 | 	clearDecodeTable(ary);
114 | 	for (i = 0; i < 64; ++i) {
115 | 		x = b64chars[i];
116 | 		ary[x] = i << 2;
117 | 	}
118 | 	uint32_array_to_c_hex(ary, sizeof(ary) / sizeof(uint32_t), "base64_table_dec_32bit_d0");
119 | 	printf("\n\n");
120 | 
121 | 	clearDecodeTable(ary);
122 | 	for (i = 0; i < 64; ++i) {
123 | 		x = b64chars[i];
124 | 		ary[x] = ((i & 0x30) >> 4) | ((i & 0x0F) << 12);
125 | 	}
126 | 	uint32_array_to_c_hex(ary, sizeof(ary) / sizeof(uint32_t), "base64_table_dec_32bit_d1");
127 | 	printf("\n\n");
128 | 
129 | 	clearDecodeTable(ary);
130 | 	for (i = 0; i < 64; ++i) {
131 | 		x = b64chars[i];
132 | 		ary[x] = ((i & 0x03) << 22) | ((i & 0x3c) << 6);
133 | 	}
134 | 	uint32_array_to_c_hex(ary, sizeof(ary) / sizeof(uint32_t), "base64_table_dec_32bit_d2");
135 | 	printf("\n\n");
136 | 
137 | 	clearDecodeTable(ary);
138 | 	for (i = 0; i < 64; ++i) {
139 | 		x = b64chars[i];
140 | 		ary[x] = i << 16;
141 | 	}
142 | 	uint32_array_to_c_hex(ary, sizeof(ary) / sizeof(uint32_t), "base64_table_dec_32bit_d3");
143 | 	printf("\n\n");
144 | 
145 | 	printf("#else\n");
146 | 
147 | 	printf("\n\n/* SPECIAL DECODE TABLES FOR BIG ENDIAN (IBM/MOTOROLA/SUN) CPUS */\n\n");
148 | 
149 | 	clearDecodeTable(ary);
150 | 	for (i = 0; i < 64; ++i) {
151 | 		x = b64chars[i];
152 | 		ary[x] = i << 26;
153 | 	}
154 | 	uint32_array_to_c_hex(ary, sizeof(ary) / sizeof(uint32_t), "base64_table_dec_32bit_d0");
155 | 	printf("\n\n");
156 | 
157 | 	clearDecodeTable(ary);
158 | 	for (i = 0; i < 64; ++i) {
159 | 		x = b64chars[i];
160 | 		ary[x] = i << 20;
161 | 	}
162 | 	uint32_array_to_c_hex(ary, sizeof(ary) / sizeof(uint32_t), "base64_table_dec_32bit_d1");
163 | 	printf("\n\n");
164 | 
165 | 	clearDecodeTable(ary);
166 | 	for (i = 0; i < 64; ++i) {
167 | 		x = b64chars[i];
168 | 		ary[x] = i << 14;
169 | 	}
170 | 	uint32_array_to_c_hex(ary, sizeof(ary) / sizeof(uint32_t), "base64_table_dec_32bit_d2");
171 | 	printf("\n\n");
172 | 
173 | 	clearDecodeTable(ary);
174 | 	for (i = 0; i < 64; ++i) {
175 | 		x = b64chars[i];
176 | 		ary[x] = i << 8;
177 | 	}
178 | 	uint32_array_to_c_hex(ary, sizeof(ary) / sizeof(uint32_t), "base64_table_dec_32bit_d3");
179 | 	printf("\n\n");
180 | 
181 | 	printf("#endif\n");
182 | 
183 | 	return 0;
184 | }
185 | 


--------------------------------------------------------------------------------
/lib/tables/tables.c:
--------------------------------------------------------------------------------
 1 | #include "tables.h"
 2 | 
 3 | const uint8_t
 4 | base64_table_enc_6bit[] =
 5 | 	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 6 | 	"abcdefghijklmnopqrstuvwxyz"
 7 | 	"0123456789"
 8 | 	"+/";
 9 | 
10 | // In the lookup table below, note that the value for '=' (character 61) is
11 | // 254, not 255. This character is used for in-band signaling of the end of
12 | // the datastream, and we will use that later. The characters A-Z, a-z, 0-9
13 | // and + / are mapped to their "decoded" values. The other bytes all map to
14 | // the value 255, which flags them as "invalid input".
15 | 
16 | const uint8_t
17 | base64_table_dec_8bit[] =
18 | {
19 | 	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,		//   0..15
20 | 	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,		//  16..31
21 | 	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  62, 255, 255, 255,  63,		//  32..47
22 | 	 52,  53,  54,  55,  56,  57,  58,  59,  60,  61, 255, 255, 255, 254, 255, 255,		//  48..63
23 | 	255,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,		//  64..79
24 | 	 15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25, 255, 255, 255, 255, 255,		//  80..95
25 | 	255,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,		//  96..111
26 | 	 41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51, 255, 255, 255, 255, 255,		// 112..127
27 | 	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,		// 128..143
28 | 	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
29 | 	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
30 | 	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
31 | 	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
32 | 	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
33 | 	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
34 | 	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
35 | };
36 | 
37 | #if BASE64_WORDSIZE >= 32
38 | #  include "table_dec_32bit.h"
39 | #  include "table_enc_12bit.h"
40 | #endif
41 | 


--------------------------------------------------------------------------------
/lib/tables/tables.h:
--------------------------------------------------------------------------------
 1 | #ifndef BASE64_TABLES_H
 2 | #define BASE64_TABLES_H
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | #include "../env.h"
 7 | 
 8 | // These tables are used by all codecs for fallback plain encoding/decoding:
 9 | extern const uint8_t base64_table_enc_6bit[];
10 | extern const uint8_t base64_table_dec_8bit[];
11 | 
12 | // These tables are used for the 32-bit and 64-bit generic decoders:
13 | #if BASE64_WORDSIZE >= 32
14 | extern const uint32_t base64_table_dec_32bit_d0[];
15 | extern const uint32_t base64_table_dec_32bit_d1[];
16 | extern const uint32_t base64_table_dec_32bit_d2[];
17 | extern const uint32_t base64_table_dec_32bit_d3[];
18 | 
19 | // This table is used by the 32 and 64-bit generic encoders:
20 | extern const uint16_t base64_table_enc_12bit[];
21 | #endif
22 | 
23 | #endif	// BASE64_TABLES_H
24 | 


--------------------------------------------------------------------------------
/test/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Written in 2016 by Henrik Steffen Gaßmann henrik@gassmann.onl
 2 | #
 3 | # To the extent possible under law, the author(s) have dedicated all
 4 | # copyright and related and neighboring rights to this software to the
 5 | # public domain worldwide. This software is distributed without any warranty.
 6 | #
 7 | # You should have received a copy of the CC0 Public Domain Dedication
 8 | # along with this software. If not, see
 9 | #
10 | #     http://creativecommons.org/publicdomain/zero/1.0/
11 | #
12 | ########################################################################
13 | 
14 | function(add_base64_test TEST_NAME)
15 |     unset(SRC_FILE)
16 |     foreach(SRC_FILE ${ARGN})
17 |         list(APPEND SRC_FILES "${SRC_FILE}")
18 |     endforeach()
19 | 
20 |     add_executable(${TEST_NAME} ${SRC_FILES})
21 |     target_link_libraries(${TEST_NAME} PRIVATE base64)
22 | 
23 |     add_test(NAME ${TEST_NAME}
24 |         COMMAND ${TEST_NAME}
25 |     )
26 |     install(TARGETS ${TEST_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR})
27 | endfunction()
28 | 
29 | 
30 | add_base64_test(test_base64
31 |     codec_supported.c
32 |     test_base64.c
33 | )
34 | 
35 | add_base64_test(benchmark
36 |     codec_supported.c
37 |     benchmark.c
38 | )
39 | 
40 | if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
41 |     target_link_libraries(benchmark PRIVATE rt)
42 | endif()
43 | 
44 | 


--------------------------------------------------------------------------------
/test/Makefile:
--------------------------------------------------------------------------------
 1 | CFLAGS += -std=c99 -O3 -Wall -Wextra -pedantic -DBASE64_STATIC_DEFINE
 2 | ifdef OPENMP
 3 |   CFLAGS += -fopenmp
 4 | endif
 5 | 
 6 | TARGET := $(shell $(CC) -dumpmachine)
 7 | ifneq (, $(findstring darwin, $(TARGET)))
 8 |   BENCH_LDFLAGS=
 9 | else ifneq (, $(findstring mingw, $(TARGET)))
10 |   BENCH_LDFLAGS=
11 | else
12 |   # default to linux, -lrt needed
13 |   BENCH_LDFLAGS=-lrt
14 | endif
15 | 
16 | .PHONY: clean test valgrind
17 | 
18 | test: clean test_base64 benchmark
19 | 	./test_base64
20 | 	./benchmark
21 | 
22 | valgrind: clean test_base64
23 | 	valgrind --error-exitcode=2 ./test_base64
24 | 
25 | test_base64: test_base64.c codec_supported.o ../lib/libbase64.o
26 | 	$(CC) $(CFLAGS) -o $@ $^
27 | 
28 | benchmark: benchmark.c codec_supported.o ../lib/libbase64.o
29 | 	$(CC) $(CFLAGS) -o $@ $^ $(BENCH_LDFLAGS)
30 | 
31 | ../%:
32 | 	make -C .. $*
33 | 
34 | %.o: %.c
35 | 	$(CC) $(CFLAGS) -o $@ -c $<
36 | 
37 | clean:
38 | 	rm -f benchmark test_base64 *.o
39 | 


--------------------------------------------------------------------------------
/test/benchmark.c:
--------------------------------------------------------------------------------
  1 | // For clock_gettime(2):
  2 | #ifndef _POSIX_C_SOURCE
  3 | #define _POSIX_C_SOURCE 199309L
  4 | #endif
  5 | 
  6 | // For CLOCK_REALTIME on FreeBSD:
  7 | #ifndef _XOPEN_SOURCE
  8 | #define _XOPEN_SOURCE   600
  9 | #endif
 10 | 
 11 | // Standard cross-platform includes.
 12 | #include <stdbool.h>
 13 | #include <stdlib.h>
 14 | #include <stdio.h>
 15 | 
 16 | // Platform-specific includes.
 17 | #if defined(_WIN32) || defined(_WIN64)
 18 | #  include <windows.h>
 19 | #  include <wincrypt.h>
 20 | #else
 21 | #  include <sys/types.h>
 22 | #  include <sys/stat.h>
 23 | #  include <fcntl.h>
 24 | #  include <unistd.h>
 25 | #  include <time.h>
 26 | #endif
 27 | 
 28 | #if defined(__MACH__)
 29 | #  include <mach/mach_time.h>
 30 | #endif
 31 | 
 32 | #include "../include/libbase64.h"
 33 | #include "codec_supported.h"
 34 | 
 35 | #define KB	1024
 36 | #define MB	(1024 * KB)
 37 | 
 38 | #define RANDOMDEV  "/dev/urandom"
 39 | 
 40 | struct buffers {
 41 | 	char *reg;
 42 | 	char *enc;
 43 | 	size_t regsz;
 44 | 	size_t encsz;
 45 | };
 46 | 
 47 | // Define buffer sizes to test with:
 48 | static struct bufsize {
 49 | 	char	*label;
 50 | 	size_t	 len;
 51 | 	int	 repeat;
 52 | 	int	 batch;
 53 | }
 54 | sizes[] = {
 55 | 	{ "10 MB",	MB * 10,	10,	1	},
 56 | 	{ "1 MB",	MB * 1,		10,	10	},
 57 | 	{ "100 KB",	KB * 100,	10,	100	},
 58 | 	{ "10 KB",	KB * 10,	100,	100	},
 59 | 	{ "1 KB",	KB * 1,		100,	1000	},
 60 | };
 61 | 
 62 | static inline float
 63 | bytes_to_mb (size_t bytes)
 64 | {
 65 | 	return bytes / (float) MB;
 66 | }
 67 | 
 68 | static bool
 69 | get_random_data (struct buffers *b, char **errmsg)
 70 | {
 71 | #if defined(_WIN32) || defined(_WIN64)
 72 | 	HCRYPTPROV hProvider = 0;
 73 | 
 74 | 	if (!CryptAcquireContext(&hProvider, 0, 0, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT | CRYPT_SILENT)) {
 75 | 		*errmsg = "Error: CryptAcquireContext";
 76 | 		return false;
 77 | 	}
 78 | 
 79 | 	if (!CryptGenRandom(hProvider, b->regsz, b->reg)) {
 80 | 		CryptReleaseContext(hProvider, 0);
 81 | 		*errmsg = "Error: CryptGenRandom";
 82 | 		return false;
 83 | 	}
 84 | 
 85 | 	if (!CryptReleaseContext(hProvider, 0)) {
 86 | 		*errmsg = "Error: CryptReleaseContext";
 87 | 		return false;
 88 | 	}
 89 | 
 90 | 	return true;
 91 | #else
 92 | 	int fd;
 93 | 	ssize_t nread;
 94 | 	size_t total_read = 0;
 95 | 
 96 | 	// Open random device for semi-random data:
 97 | 	if ((fd = open(RANDOMDEV, O_RDONLY)) < 0) {
 98 | 		*errmsg = "Cannot open " RANDOMDEV;
 99 | 		return false;
100 | 	}
101 | 
102 | 	printf("Filling buffer with %.1f MB of random data...\n", bytes_to_mb(b->regsz));
103 | 
104 | 	while (total_read < b->regsz) {
105 | 		if ((nread = read(fd, b->reg + total_read, b->regsz - total_read)) < 0) {
106 | 			*errmsg = "Read error";
107 | 			close(fd);
108 | 			return false;
109 | 		}
110 | 		total_read += nread;
111 | 	}
112 | 
113 | 	close(fd);
114 | 	return true;
115 | #endif
116 | }
117 | 
118 | #if defined(__MACH__)
119 | typedef uint64_t base64_timespec;
120 | 
121 | static void
122 | base64_gettime (base64_timespec *t)
123 | {
124 | 	*t = mach_absolute_time();
125 | }
126 | 
127 | static float
128 | timediff_sec (base64_timespec *start, base64_timespec *end)
129 | {
130 | 	uint64_t diff = *end - *start;
131 | 	mach_timebase_info_data_t tb = { 0, 0 };
132 | 	mach_timebase_info(&tb);
133 | 
134 | 	return (float)((diff * tb.numer) / tb.denom) / 1e9f;
135 | }
136 | #elif defined(_WIN32) || defined(_WIN64)
137 | typedef ULARGE_INTEGER base64_timespec;
138 | 
139 | static void
140 | base64_gettime (base64_timespec *t)
141 | {
142 | 	FILETIME current_time_ft;
143 | 
144 | 	GetSystemTimePreciseAsFileTime(&current_time_ft);
145 | 
146 | 	t->LowPart  = current_time_ft.dwLowDateTime;
147 | 	t->HighPart = current_time_ft.dwHighDateTime;
148 | }
149 | 
150 | static float
151 | timediff_sec (base64_timespec *start, base64_timespec *end)
152 | {
153 | 	// Timer resolution is 100 nanoseconds (10^-7 sec).
154 | 	return (end->QuadPart - start->QuadPart) / 1e7f;
155 | }
156 | #else
157 | typedef struct timespec base64_timespec;
158 | 
159 | static void
160 | base64_gettime (base64_timespec *t)
161 | {
162 | 	clock_gettime(CLOCK_REALTIME, t);
163 | }
164 | 
165 | static float
166 | timediff_sec (base64_timespec *start, base64_timespec *end)
167 | {
168 | 	return (end->tv_sec - start->tv_sec) + (end->tv_nsec - start->tv_nsec) / 1e9f;
169 | }
170 | #endif
171 | 
172 | static void
173 | codec_bench_enc (struct buffers *b, const struct bufsize *bs, const char *name, unsigned int flags)
174 | {
175 | 	float timediff, fastest = -1.0f;
176 | 	base64_timespec start, end;
177 | 
178 | 	// Reset buffer size:
179 | 	b->regsz = bs->len;
180 | 
181 | 	// Repeat benchmark a number of times for a fair test:
182 | 	for (int i = bs->repeat; i; i--) {
183 | 
184 | 		// Timing loop, use batches to increase timer resolution:
185 | 		base64_gettime(&start);
186 | 		for (int j = bs->batch; j; j--)
187 | 			base64_encode(b->reg, b->regsz, b->enc, &b->encsz, flags);
188 | 		base64_gettime(&end);
189 | 
190 | 		// Calculate average time of batch:
191 | 		timediff = timediff_sec(&start, &end) / bs->batch;
192 | 
193 | 		// Update fastest time seen:
194 | 		if (fastest < 0.0f || timediff < fastest)
195 | 			fastest = timediff;
196 | 	}
197 | 
198 | 	printf("%s\tencode\t%.02f MB/sec\n", name, bytes_to_mb(b->regsz) / fastest);
199 | }
200 | 
201 | static void
202 | codec_bench_dec (struct buffers *b, const struct bufsize *bs, const char *name, unsigned int flags)
203 | {
204 | 	float timediff, fastest = -1.0f;
205 | 	base64_timespec start, end;
206 | 
207 | 	// Reset buffer size:
208 | 	b->encsz = bs->len;
209 | 
210 | 	// Repeat benchmark a number of times for a fair test:
211 | 	for (int i = bs->repeat; i; i--) {
212 | 
213 | 		// Timing loop, use batches to increase timer resolution:
214 | 		base64_gettime(&start);
215 | 		for (int j = bs->batch; j; j--)
216 | 			base64_decode(b->enc, b->encsz, b->reg, &b->regsz, flags);
217 | 		base64_gettime(&end);
218 | 
219 | 		// Calculate average time of batch:
220 | 		timediff = timediff_sec(&start, &end) / bs->batch;
221 | 
222 | 		// Update fastest time seen:
223 | 		if (fastest < 0.0f || timediff < fastest)
224 | 			fastest = timediff;
225 | 	}
226 | 
227 | 	printf("%s\tdecode\t%.02f MB/sec\n", name, bytes_to_mb(b->encsz) / fastest);
228 | }
229 | 
230 | static void
231 | codec_bench (struct buffers *b, const struct bufsize *bs, const char *name, unsigned int flags)
232 | {
233 | 	codec_bench_enc(b, bs, name, flags);
234 | 	codec_bench_dec(b, bs, name, flags);
235 | }
236 | 
237 | int
238 | main ()
239 | {
240 | 	int ret = 0;
241 | 	char *errmsg = NULL;
242 | 	struct buffers b;
243 | 
244 | 	// Set buffer sizes to largest buffer length:
245 | 	b.regsz = sizes[0].len;
246 | 	b.encsz = sizes[0].len * 5 / 3;
247 | 
248 | 	// Allocate space for megabytes of random data:
249 | 	if ((b.reg = malloc(b.regsz)) == NULL) {
250 | 		errmsg = "Out of memory";
251 | 		ret = 1;
252 | 		goto err0;
253 | 	}
254 | 
255 | 	// Allocate space for encoded output:
256 | 	if ((b.enc = malloc(b.encsz)) == NULL) {
257 | 		errmsg = "Out of memory";
258 | 		ret = 1;
259 | 		goto err1;
260 | 	}
261 | 
262 | 	// Fill buffer with random data:
263 | 	if (get_random_data(&b, &errmsg) == false) {
264 | 		ret = 1;
265 | 		goto err2;
266 | 	}
267 | 
268 | 	// Loop over all buffer sizes:
269 | 	for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
270 | 		printf("Testing with buffer size %s, fastest of %d * %d\n",
271 | 			sizes[i].label, sizes[i].repeat, sizes[i].batch);
272 | 
273 | 		// Loop over all codecs:
274 | 		for (size_t j = 0; codecs[j]; j++) {
275 | 			int flags = codec_supported(j);
276 | 			if (flags)
277 | 				codec_bench(&b, &sizes[i], codecs[j], flags);
278 | 		}
279 | 	}
280 | 
281 | 	// Free memory:
282 | err2:	free(b.enc);
283 | err1:	free(b.reg);
284 | err0:	if (errmsg)
285 | 		fputs(errmsg, stderr);
286 | 
287 | 	return ret;
288 | }
289 | 


--------------------------------------------------------------------------------
/test/ci/.gitattributes:
--------------------------------------------------------------------------------
1 | 😵‍💫.bin binary -text
2 | checksums.txt text eol=lf
3 | 


--------------------------------------------------------------------------------
/test/ci/analysis.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -ve
 3 | 
 4 | MACHINE=$(uname -m)
 5 | export CC=gcc
 6 | 
 7 | uname -a
 8 | clang --version  # make analyse
 9 | ${CC} --version  # make -C test valgrind
10 | 
11 | for USE_ASSEMBLY in 0 1; do
12 | 	if [ "${MACHINE}" == "x86_64" ]; then
13 | 		export SSSE3_CFLAGS="-mssse3 -DBASE64_SSSE3_USE_ASM=${USE_ASSEMBLY}"
14 | 		export SSE41_CFLAGS="-msse4.1 -DBASE64_SSE41_USE_ASM=${USE_ASSEMBLY}"
15 | 		export SSE42_CFLAGS="-msse4.2 -DBASE64_SSE42_USE_ASM=${USE_ASSEMBLY}"
16 | 		export AVX_CFLAGS="-mavx -DBASE64_AVX_USE_ASM=${USE_ASSEMBLY}"
17 | 		export AVX2_CFLAGS="-mavx2 -DBASE64_AVX2_USE_ASM=${USE_ASSEMBLY}"
18 | 		export AVX512_CFLAGS="-mavx512vl -mavx512vbmi"
19 | 		# Temporarily disable AVX512; it is not available in CI yet.
20 | 		export BASE64_TEST_SKIP_AVX512=1
21 | 	elif [ "${MACHINE}" == "aarch64" ]; then
22 | 		export NEON64_CFLAGS="-march=armv8-a"
23 | 	elif [ "${MACHINE}" == "armv7l" ]; then
24 | 		export NEON32_CFLAGS="-march=armv7-a -mfloat-abi=hard -mfpu=neon"
25 | 	fi
26 | 
27 | 	if [ ${USE_ASSEMBLY} -eq 0 ]; then
28 | 		echo "::group::analyze"
29 | 		make analyze
30 | 		echo "::endgroup::"
31 | 	fi
32 | 
33 | 	echo "::group::valgrind (USE_ASSEMBLY=${USE_ASSEMBLY})"
34 | 	make clean
35 | 	make
36 | 	make -C test valgrind
37 | 	echo "::endgroup::"
38 | done
39 | 


--------------------------------------------------------------------------------
/test/ci/checksums.txt:
--------------------------------------------------------------------------------
1 | 4cd842ba8dce30216f77d2d5bf8a648e2dba6d95be5b12884f81e55bfab7b3cc *😵‍💫.bin
2 | 


--------------------------------------------------------------------------------
/test/ci/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -ve
 3 | 
 4 | MACHINE=$(uname -m)
 5 | if [ "${MACHINE}" == "x86_64" ]; then
 6 | 	export SSSE3_CFLAGS=-mssse3
 7 | 	export SSE41_CFLAGS=-msse4.1
 8 | 	export SSE42_CFLAGS=-msse4.2
 9 | 	export AVX_CFLAGS=-mavx
10 | 	export AVX2_CFLAGS=-mavx2
11 | 	export AVX512_CFLAGS="-mavx512vl -mavx512vbmi"
12 | 	# Temporarily disable AVX512; it is not available in CI yet.
13 | 	export BASE64_TEST_SKIP_AVX512=1
14 | elif [ "${MACHINE}" == "aarch64" ]; then
15 | 	export NEON64_CFLAGS="-march=armv8-a"
16 | elif [ "${MACHINE}" == "armv7l" ]; then
17 | 	export NEON32_CFLAGS="-march=armv7-a -mfloat-abi=hard -mfpu=neon"
18 | fi
19 | 
20 | if [ "${OPENMP:-}" == "0" ]; then
21 | 	unset OPENMP
22 | fi
23 | 
24 | uname -a
25 | ${CC} --version
26 | 
27 | make
28 | make -C test
29 | 


--------------------------------------------------------------------------------
/test/ci/😵‍💫.bin:
--------------------------------------------------------------------------------
1 | Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
2 | 😵‍💫
3 | 


--------------------------------------------------------------------------------
/test/codec_supported.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | 
 5 | #include "../include/libbase64.h"
 6 | 
 7 | static char *_codecs[] =
 8 | { "AVX2"
 9 | , "NEON32"
10 | , "NEON64"
11 | , "plain"
12 | , "SSSE3"
13 | , "SSE41"
14 | , "SSE42"
15 | , "AVX"
16 | , "AVX512"
17 | , NULL
18 | } ;
19 | 
20 | char **codecs = _codecs;
21 | 
22 | int
23 | codec_supported (size_t index)
24 | {
25 | 	if (index >= (sizeof(_codecs) / sizeof(_codecs[0])) - 1) {
26 | 		return 0;
27 | 	}
28 | 	// Check if given codec is supported by trying to decode a test string:
29 | 	char *a = "aGVsbG8=";
30 | 	char b[10];
31 | 	size_t outlen;
32 | 	char envVariable[32];
33 | 	sprintf(envVariable, "BASE64_TEST_SKIP_%s", _codecs[index]);
34 | 	const char* envOverride = getenv(envVariable);
35 | 	if ((envOverride != NULL) && (strcmp(envOverride, "1") == 0)) {
36 | 		return 0;
37 | 	}
38 | 	int flags = 1 << index;
39 | 	return (base64_decode(a, strlen(a), b, &outlen, flags) != -1) ? flags : 0;
40 | }
41 | 


--------------------------------------------------------------------------------
/test/codec_supported.h:
--------------------------------------------------------------------------------
1 | extern char **codecs;
2 | 
3 | int codec_supported (size_t index);
4 | 


--------------------------------------------------------------------------------
/test/moby_dick.h:
--------------------------------------------------------------------------------
 1 | static const char *moby_dick_plain =
 2 | 	"Call me Ishmael. Some years ago--never mind how long precisely--having\n"
 3 | 	"little or no money in my purse, and nothing particular to interest me on\n"
 4 | 	"shore, I thought I would sail about a little and see the watery part of\n"
 5 | 	"the world. It is a way I have of driving off the spleen and regulating\n"
 6 | 	"the circulation. Whenever I find myself growing grim about the mouth;\n"
 7 | 	"whenever it is a damp, drizzly November in my soul; whenever I find\n"
 8 | 	"myself involuntarily pausing before coffin warehouses, and bringing up\n"
 9 | 	"the rear of every funeral I meet; and especially whenever my hypos get\n"
10 | 	"such an upper hand of me, that it requires a strong moral principle to\n"
11 | 	"prevent me from deliberately stepping into the street, and methodically\n"
12 | 	"knocking people's hats off--then, I account it high time to get to sea\n"
13 | 	"as soon as I can. This is my substitute for pistol and ball. With a\n"
14 | 	"philosophical flourish Cato throws himself upon his sword; I quietly\n"
15 | 	"take to the ship. There is nothing surprising in this. If they but knew\n"
16 | 	"it, almost all men in their degree, some time or other, cherish very\n"
17 | 	"nearly the same feelings towards the ocean with me.\n";
18 | 
19 | static const char *moby_dick_base64 =
20 | 	"Q2FsbCBtZSBJc2htYWVsLiBTb21lIHllYXJzIGFnby0tbmV2ZXIgbWluZCBob3cgbG9uZ"
21 | 	"yBwcmVjaXNlbHktLWhhdmluZwpsaXR0bGUgb3Igbm8gbW9uZXkgaW4gbXkgcHVyc2UsIG"
22 | 	"FuZCBub3RoaW5nIHBhcnRpY3VsYXIgdG8gaW50ZXJlc3QgbWUgb24Kc2hvcmUsIEkgdGh"
23 | 	"vdWdodCBJIHdvdWxkIHNhaWwgYWJvdXQgYSBsaXR0bGUgYW5kIHNlZSB0aGUgd2F0ZXJ5"
24 | 	"IHBhcnQgb2YKdGhlIHdvcmxkLiBJdCBpcyBhIHdheSBJIGhhdmUgb2YgZHJpdmluZyBvZ"
25 | 	"mYgdGhlIHNwbGVlbiBhbmQgcmVndWxhdGluZwp0aGUgY2lyY3VsYXRpb24uIFdoZW5ldm"
26 | 	"VyIEkgZmluZCBteXNlbGYgZ3Jvd2luZyBncmltIGFib3V0IHRoZSBtb3V0aDsKd2hlbmV"
27 | 	"2ZXIgaXQgaXMgYSBkYW1wLCBkcml6emx5IE5vdmVtYmVyIGluIG15IHNvdWw7IHdoZW5l"
28 | 	"dmVyIEkgZmluZApteXNlbGYgaW52b2x1bnRhcmlseSBwYXVzaW5nIGJlZm9yZSBjb2Zma"
29 | 	"W4gd2FyZWhvdXNlcywgYW5kIGJyaW5naW5nIHVwCnRoZSByZWFyIG9mIGV2ZXJ5IGZ1bm"
30 | 	"VyYWwgSSBtZWV0OyBhbmQgZXNwZWNpYWxseSB3aGVuZXZlciBteSBoeXBvcyBnZXQKc3V"
31 | 	"jaCBhbiB1cHBlciBoYW5kIG9mIG1lLCB0aGF0IGl0IHJlcXVpcmVzIGEgc3Ryb25nIG1v"
32 | 	"cmFsIHByaW5jaXBsZSB0bwpwcmV2ZW50IG1lIGZyb20gZGVsaWJlcmF0ZWx5IHN0ZXBwa"
33 | 	"W5nIGludG8gdGhlIHN0cmVldCwgYW5kIG1ldGhvZGljYWxseQprbm9ja2luZyBwZW9wbG"
34 | 	"UncyBoYXRzIG9mZi0tdGhlbiwgSSBhY2NvdW50IGl0IGhpZ2ggdGltZSB0byBnZXQgdG8"
35 | 	"gc2VhCmFzIHNvb24gYXMgSSBjYW4uIFRoaXMgaXMgbXkgc3Vic3RpdHV0ZSBmb3IgcGlz"
36 | 	"dG9sIGFuZCBiYWxsLiBXaXRoIGEKcGhpbG9zb3BoaWNhbCBmbG91cmlzaCBDYXRvIHRoc"
37 | 	"m93cyBoaW1zZWxmIHVwb24gaGlzIHN3b3JkOyBJIHF1aWV0bHkKdGFrZSB0byB0aGUgc2"
38 | 	"hpcC4gVGhlcmUgaXMgbm90aGluZyBzdXJwcmlzaW5nIGluIHRoaXMuIElmIHRoZXkgYnV"
39 | 	"0IGtuZXcKaXQsIGFsbW9zdCBhbGwgbWVuIGluIHRoZWlyIGRlZ3JlZSwgc29tZSB0aW1l"
40 | 	"IG9yIG90aGVyLCBjaGVyaXNoIHZlcnkKbmVhcmx5IHRoZSBzYW1lIGZlZWxpbmdzIHRvd"
41 | 	"2FyZHMgdGhlIG9jZWFuIHdpdGggbWUuCg==";
42 | 


--------------------------------------------------------------------------------
/test/moby_dick_base64.txt:
--------------------------------------------------------------------------------
1 | Q2FsbCBtZSBJc2htYWVsLiBTb21lIHllYXJzIGFnby0tbmV2ZXIgbWluZCBob3cgbG9uZyBwcmVjaXNlbHktLWhhdmluZwpsaXR0bGUgb3Igbm8gbW9uZXkgaW4gbXkgcHVyc2UsIGFuZCBub3RoaW5nIHBhcnRpY3VsYXIgdG8gaW50ZXJlc3QgbWUgb24Kc2hvcmUsIEkgdGhvdWdodCBJIHdvdWxkIHNhaWwgYWJvdXQgYSBsaXR0bGUgYW5kIHNlZSB0aGUgd2F0ZXJ5IHBhcnQgb2YKdGhlIHdvcmxkLiBJdCBpcyBhIHdheSBJIGhhdmUgb2YgZHJpdmluZyBvZmYgdGhlIHNwbGVlbiBhbmQgcmVndWxhdGluZwp0aGUgY2lyY3VsYXRpb24uIFdoZW5ldmVyIEkgZmluZCBteXNlbGYgZ3Jvd2luZyBncmltIGFib3V0IHRoZSBtb3V0aDsKd2hlbmV2ZXIgaXQgaXMgYSBkYW1wLCBkcml6emx5IE5vdmVtYmVyIGluIG15IHNvdWw7IHdoZW5ldmVyIEkgZmluZApteXNlbGYgaW52b2x1bnRhcmlseSBwYXVzaW5nIGJlZm9yZSBjb2ZmaW4gd2FyZWhvdXNlcywgYW5kIGJyaW5naW5nIHVwCnRoZSByZWFyIG9mIGV2ZXJ5IGZ1bmVyYWwgSSBtZWV0OyBhbmQgZXNwZWNpYWxseSB3aGVuZXZlciBteSBoeXBvcyBnZXQKc3VjaCBhbiB1cHBlciBoYW5kIG9mIG1lLCB0aGF0IGl0IHJlcXVpcmVzIGEgc3Ryb25nIG1vcmFsIHByaW5jaXBsZSB0bwpwcmV2ZW50IG1lIGZyb20gZGVsaWJlcmF0ZWx5IHN0ZXBwaW5nIGludG8gdGhlIHN0cmVldCwgYW5kIG1ldGhvZGljYWxseQprbm9ja2luZyBwZW9wbGUncyBoYXRzIG9mZi0tdGhlbiwgSSBhY2NvdW50IGl0IGhpZ2ggdGltZSB0byBnZXQgdG8gc2VhCmFzIHNvb24gYXMgSSBjYW4uIFRoaXMgaXMgbXkgc3Vic3RpdHV0ZSBmb3IgcGlzdG9sIGFuZCBiYWxsLiBXaXRoIGEKcGhpbG9zb3BoaWNhbCBmbG91cmlzaCBDYXRvIHRocm93cyBoaW1zZWxmIHVwb24gaGlzIHN3b3JkOyBJIHF1aWV0bHkKdGFrZSB0byB0aGUgc2hpcC4gVGhlcmUgaXMgbm90aGluZyBzdXJwcmlzaW5nIGluIHRoaXMuIElmIHRoZXkgYnV0IGtuZXcKaXQsIGFsbW9zdCBhbGwgbWVuIGluIHRoZWlyIGRlZ3JlZSwgc29tZSB0aW1lIG9yIG90aGVyLCBjaGVyaXNoIHZlcnkKbmVhcmx5IHRoZSBzYW1lIGZlZWxpbmdzIHRvd2FyZHMgdGhlIG9jZWFuIHdpdGggbWUuCg==


--------------------------------------------------------------------------------
/test/moby_dick_plain.txt:
--------------------------------------------------------------------------------
 1 | Call me Ishmael. Some years ago--never mind how long precisely--having
 2 | little or no money in my purse, and nothing particular to interest me on
 3 | shore, I thought I would sail about a little and see the watery part of
 4 | the world. It is a way I have of driving off the spleen and regulating
 5 | the circulation. Whenever I find myself growing grim about the mouth;
 6 | whenever it is a damp, drizzly November in my soul; whenever I find
 7 | myself involuntarily pausing before coffin warehouses, and bringing up
 8 | the rear of every funeral I meet; and especially whenever my hypos get
 9 | such an upper hand of me, that it requires a strong moral principle to
10 | prevent me from deliberately stepping into the street, and methodically
11 | knocking people's hats off--then, I account it high time to get to sea
12 | as soon as I can. This is my substitute for pistol and ball. With a
13 | philosophical flourish Cato throws himself upon his sword; I quietly
14 | take to the ship. There is nothing surprising in this. If they but knew
15 | it, almost all men in their degree, some time or other, cherish very
16 | nearly the same feelings towards the ocean with me.
17 | 


--------------------------------------------------------------------------------
/test/test_base64.c:
--------------------------------------------------------------------------------
  1 | #include <stdbool.h>
  2 | #include <string.h>
  3 | #include <stdio.h>
  4 | #include <stdlib.h>
  5 | #include "../include/libbase64.h"
  6 | #include "codec_supported.h"
  7 | #include "moby_dick.h"
  8 | 
  9 | static char out[2000];
 10 | static size_t outlen;
 11 | 
 12 | static bool
 13 | assert_enc (int flags, const char *src, const char *dst)
 14 | {
 15 | 	size_t srclen = strlen(src);
 16 | 	size_t dstlen = strlen(dst);
 17 | 
 18 | 	base64_encode(src, srclen, out, &outlen, flags);
 19 | 
 20 | 	if (outlen != dstlen) {
 21 | 		printf("FAIL: encoding of '%s': length expected %lu, got %lu\n", src,
 22 | 			(unsigned long)dstlen,
 23 | 			(unsigned long)outlen
 24 | 		);
 25 | 		return true;
 26 | 	}
 27 | 	if (strncmp(dst, out, outlen) != 0) {
 28 | 		out[outlen] = '\0';
 29 | 		printf("FAIL: encoding of '%s': expected output '%s', got '%s'\n", src, dst, out);
 30 | 		return true;
 31 | 	}
 32 | 	return false;
 33 | }
 34 | 
 35 | static bool
 36 | assert_dec (int flags, const char *src, const char *dst)
 37 | {
 38 | 	size_t srclen = strlen(src);
 39 | 	size_t dstlen = strlen(dst);
 40 | 
 41 | 	if (!base64_decode(src, srclen, out, &outlen, flags)) {
 42 | 		printf("FAIL: decoding of '%s': decoding error\n", src);
 43 | 		return true;
 44 | 	}
 45 | 	if (outlen != dstlen) {
 46 | 		printf("FAIL: encoding of '%s': "
 47 | 			"length expected %lu, got %lu\n", src,
 48 | 			(unsigned long)dstlen,
 49 | 			(unsigned long)outlen
 50 | 		);
 51 | 		return true;
 52 | 	}
 53 | 	if (strncmp(dst, out, outlen) != 0) {
 54 | 		out[outlen] = '\0';
 55 | 		printf("FAIL: decoding of '%s': expected output '%s', got '%s'\n", src, dst, out);
 56 | 		return true;
 57 | 	}
 58 | 	return false;
 59 | }
 60 | 
 61 | static int
 62 | assert_roundtrip (int flags, const char *src)
 63 | {
 64 | 	char tmp[1500];
 65 | 	size_t tmplen;
 66 | 	size_t srclen = strlen(src);
 67 | 
 68 | 	// Encode the input into global buffer:
 69 | 	base64_encode(src, srclen, out, &outlen, flags);
 70 | 
 71 | 	// Decode the global buffer into local temp buffer:
 72 | 	if (!base64_decode(out, outlen, tmp, &tmplen, flags)) {
 73 | 		printf("FAIL: decoding of '%s': decoding error\n", out);
 74 | 		return true;
 75 | 	}
 76 | 
 77 | 	// Check that 'src' is identical to 'tmp':
 78 | 	if (srclen != tmplen) {
 79 | 		printf("FAIL: roundtrip of '%s': "
 80 | 			"length expected %lu, got %lu\n", src,
 81 | 			(unsigned long)srclen,
 82 | 			(unsigned long)tmplen
 83 | 		);
 84 | 		return true;
 85 | 	}
 86 | 	if (strncmp(src, tmp, tmplen) != 0) {
 87 | 		tmp[tmplen] = '\0';
 88 | 		printf("FAIL: roundtrip of '%s': got '%s'\n", src, tmp);
 89 | 		return true;
 90 | 	}
 91 | 
 92 | 	return false;
 93 | }
 94 | 
 95 | static int
 96 | test_char_table (int flags, bool use_malloc)
 97 | {
 98 | 	bool fail = false;
 99 | 	char chr[256];
100 | 	char enc[400], dec[400];
101 | 	size_t enclen, declen;
102 | 
103 | 	// Fill array with all characters 0..255:
104 | 	for (int i = 0; i < 256; i++)
105 | 		chr[i] = (unsigned char)i;
106 | 
107 | 	// Loop, using each char as a starting position to increase test coverage:
108 | 	for (int i = 0; i < 256; i++) {
109 | 
110 | 		size_t chrlen = 256 - i;
111 | 		char* src = &chr[i];
112 | 		if (use_malloc) {
113 | 			src = malloc(chrlen); /* malloc/copy this so valgrind can find out-of-bound access */
114 | 			if (src == NULL) {
115 | 				printf(
116 | 					"FAIL: encoding @ %d: allocation of %lu bytes failed\n",
117 | 					i, (unsigned long)chrlen
118 | 				);
119 | 				fail = true;
120 | 				continue;
121 | 			}
122 | 			memcpy(src, &chr[i], chrlen);
123 | 		}
124 | 
125 | 		base64_encode(src, chrlen, enc, &enclen, flags);
126 | 		if (use_malloc) {
127 | 			free(src);
128 | 		}
129 | 
130 | 		if (!base64_decode(enc, enclen, dec, &declen, flags)) {
131 | 			printf("FAIL: decoding @ %d: decoding error\n", i);
132 | 			fail = true;
133 | 			continue;
134 | 		}
135 | 		if (declen != chrlen) {
136 | 			printf("FAIL: roundtrip @ %d: "
137 | 				"length expected %lu, got %lu\n", i,
138 | 				(unsigned long)chrlen,
139 | 				(unsigned long)declen
140 | 			);
141 | 			fail = true;
142 | 			continue;
143 | 		}
144 | 		if (strncmp(&chr[i], dec, declen) != 0) {
145 | 			printf("FAIL: roundtrip @ %d: decoded output not same as input\n", i);
146 | 			fail = true;
147 | 		}
148 | 	}
149 | 
150 | 	return fail;
151 | }
152 | 
153 | static int
154 | test_streaming (int flags)
155 | {
156 | 	bool fail = false;
157 | 	char chr[256];
158 | 	char ref[400], enc[400];
159 | 	size_t reflen;
160 | 	struct base64_state state;
161 | 
162 | 	// Fill array with all characters 0..255:
163 | 	for (int i = 0; i < 256; i++)
164 | 		chr[i] = (unsigned char)i;
165 | 
166 | 	// Create reference base64 encoding:
167 | 	base64_encode(chr, 256, ref, &reflen, BASE64_FORCE_PLAIN);
168 | 
169 | 	// Encode the table with various block sizes and compare to reference:
170 | 	for (size_t bs = 1; bs < 255; bs++)
171 | 	{
172 | 		size_t inpos   = 0;
173 | 		size_t partlen = 0;
174 | 		size_t enclen  = 0;
175 | 
176 | 		base64_stream_encode_init(&state, flags);
177 | 		memset(enc, 0, 400);
178 | 		for (;;) {
179 | 			base64_stream_encode(&state, &chr[inpos], (inpos + bs > 256) ? 256 - inpos : bs, &enc[enclen], &partlen);
180 | 			enclen += partlen;
181 | 			if (inpos + bs > 256) {
182 | 				break;
183 | 			}
184 | 			inpos += bs;
185 | 		}
186 | 		base64_stream_encode_final(&state, &enc[enclen], &partlen);
187 | 		enclen += partlen;
188 | 
189 | 		if (enclen != reflen) {
190 | 			printf("FAIL: stream encoding gave incorrect size: "
191 | 				"%lu instead of %lu\n",
192 | 				(unsigned long)enclen,
193 | 				(unsigned long)reflen
194 | 			);
195 | 			fail = true;
196 | 		}
197 | 		if (strncmp(ref, enc, reflen) != 0) {
198 | 			printf("FAIL: stream encoding with blocksize %lu failed\n",
199 | 				(unsigned long)bs
200 | 			);
201 | 			fail = true;
202 | 		}
203 | 	}
204 | 
205 | 	// Decode the reference encoding with various block sizes and
206 | 	// compare to input char table:
207 | 	for (size_t bs = 1; bs < 255; bs++)
208 | 	{
209 | 		size_t inpos   = 0;
210 | 		size_t partlen = 0;
211 | 		size_t enclen  = 0;
212 | 
213 | 		base64_stream_decode_init(&state, flags);
214 | 		memset(enc, 0, 400);
215 | 		while (base64_stream_decode(&state, &ref[inpos], (inpos + bs > reflen) ? reflen - inpos : bs, &enc[enclen], &partlen)) {
216 | 			enclen += partlen;
217 | 			inpos += bs;
218 | 
219 | 			// Has the entire buffer been consumed?
220 | 			if (inpos >= 400) {
221 | 				break;
222 | 			}
223 | 		}
224 | 		if (enclen != 256) {
225 | 			printf("FAIL: stream decoding gave incorrect size: "
226 | 				"%lu instead of 255\n",
227 | 				(unsigned long)enclen
228 | 			);
229 | 			fail = true;
230 | 		}
231 | 		if (strncmp(chr, enc, 256) != 0) {
232 | 			printf("FAIL: stream decoding with blocksize %lu failed\n",
233 | 				(unsigned long)bs
234 | 			);
235 | 			fail = true;
236 | 		}
237 | 	}
238 | 
239 | 	return fail;
240 | }
241 | 
242 | static int
243 | test_invalid_dec_input (int flags)
244 | {
245 | 	// Subset of invalid characters to cover all ranges
246 | 	static const char invalid_set[] = { '\0', -1, '!', '-', ';', '_', '|' };
247 | 	static const char* invalid_strings[] = {
248 | 		"Zm9vYg=",
249 | 		"Zm9vYg",
250 | 		"Zm9vY",
251 | 		"Zm9vYmF=Zm9v"
252 | 	};
253 | 
254 | 	bool fail = false;
255 | 	char chr[256];
256 | 	char enc[400], dec[400];
257 | 	size_t enclen, declen;
258 | 
259 | 	// Fill array with all characters 0..255:
260 | 	for (int i = 0; i < 256; i++)
261 | 		chr[i] = (unsigned char)i;
262 | 
263 | 	// Create reference base64 encoding:
264 | 	base64_encode(chr, 256, enc, &enclen, BASE64_FORCE_PLAIN);
265 | 
266 | 	// Test invalid strings returns error.
267 | 	for (size_t i = 0U; i < sizeof(invalid_strings) / sizeof(invalid_strings[0]); ++i) {
268 | 		if (base64_decode(invalid_strings[i], strlen(invalid_strings[i]), dec, &declen, flags)) {
269 | 			printf("FAIL: decoding invalid input \"%s\": no decoding error\n", invalid_strings[i]);
270 | 			fail = true;
271 | 		}
272 | 	}
273 | 
274 | 	// Loop, corrupting each char to increase test coverage:
275 | 	for (size_t c = 0U; c < sizeof(invalid_set); ++c) {
276 | 		for (size_t i = 0U; i < enclen; i++) {
277 | 			char backup = enc[i];
278 | 
279 | 			enc[i] = invalid_set[c];
280 | 
281 | 			if (base64_decode(enc, enclen, dec, &declen, flags)) {
282 | 				printf("FAIL: decoding invalid input @ %d: no decoding error\n", (int)i);
283 | 				fail = true;
284 | 				enc[i] = backup;
285 | 				continue;
286 | 			}
287 | 			enc[i] = backup;
288 | 		}
289 | 	}
290 | 
291 | 	// Loop, corrupting two chars to increase test coverage:
292 | 	for (size_t c = 0U; c < sizeof(invalid_set); ++c) {
293 | 		for (size_t i = 0U; i < enclen - 2U; i++) {
294 | 			char backup  = enc[i+0];
295 | 			char backup2 = enc[i+2];
296 | 
297 | 			enc[i+0] = invalid_set[c];
298 | 			enc[i+2] = invalid_set[c];
299 | 
300 | 			if (base64_decode(enc, enclen, dec, &declen, flags)) {
301 | 				printf("FAIL: decoding invalid input @ %d: no decoding error\n", (int)i);
302 | 				fail = true;
303 | 				enc[i+0] = backup;
304 | 				enc[i+2] = backup2;
305 | 				continue;
306 | 			}
307 | 			enc[i+0] = backup;
308 | 			enc[i+2] = backup2;
309 | 		}
310 | 	}
311 | 
312 | 	return fail;
313 | }
314 | 
315 | static int
316 | test_one_codec (size_t codec_index)
317 | {
318 | 	bool fail = false;
319 | 	const char *codec = codecs[codec_index];
320 | 
321 | 	printf("Codec %s:\n", codec);
322 | 
323 | 	// Skip if this codec is not supported:
324 | 	int flags = codec_supported(codec_index);
325 | 	if (flags == 0) {
326 | 		puts("  skipping");
327 | 		return false;
328 | 	}
329 | 
330 | 	// Test vectors:
331 | 	struct {
332 | 		const char *in;
333 | 		const char *out;
334 | 	} vec[] = {
335 | 
336 | 		// These are the test vectors from RFC4648:
337 | 		{ "",		""         },
338 | 		{ "f",		"Zg=="     },
339 | 		{ "fo",		"Zm8="     },
340 | 		{ "foo",	"Zm9v"     },
341 | 		{ "foob",	"Zm9vYg==" },
342 | 		{ "fooba",	"Zm9vYmE=" },
343 | 		{ "foobar",	"Zm9vYmFy" },
344 | 
345 | 		// The first paragraph from Moby Dick,
346 | 		// to test the SIMD codecs with larger blocksize:
347 | 		{ moby_dick_plain, moby_dick_base64 },
348 | 	};
349 | 
350 | 	for (size_t i = 0; i < sizeof(vec) / sizeof(vec[0]); i++) {
351 | 
352 | 		// Encode plain string, check against output:
353 | 		fail |= assert_enc(flags, vec[i].in, vec[i].out);
354 | 
355 | 		// Decode the output string, check if we get the input:
356 | 		fail |= assert_dec(flags, vec[i].out, vec[i].in);
357 | 
358 | 		// Do a roundtrip on the inputs and the outputs:
359 | 		fail |= assert_roundtrip(flags, vec[i].in);
360 | 		fail |= assert_roundtrip(flags, vec[i].out);
361 | 	}
362 | 
363 | 	fail |= test_char_table(flags, false); /* test with unaligned input buffer */
364 | 	fail |= test_char_table(flags, true); /* test for out-of-bound input read */
365 | 	fail |= test_streaming(flags);
366 | 	fail |= test_invalid_dec_input(flags);
367 | 
368 | 	if (!fail)
369 | 		puts("  all tests passed.");
370 | 
371 | 	return fail;
372 | }
373 | 
374 | int
375 | main ()
376 | {
377 | 	bool fail = false;
378 | 
379 | 	// Loop over all codecs:
380 | 	for (size_t i = 0; codecs[i]; i++) {
381 | 		// Test this codec, merge the results:
382 | 		fail |= test_one_codec(i);
383 | 	}
384 | 
385 | 	return (fail) ? 1 : 0;
386 | }
387 | 


--------------------------------------------------------------------------------