├── .appveyor.yml ├── .gitignore ├── CMakeLists.txt ├── INSTALL ├── LICENSE ├── README ├── examples ├── c-example1.c ├── c-example2.c ├── cpp-example3.cc ├── cpp-rcnb-cli.cc └── python3-example.py ├── include └── rcnb │ ├── cdecode.h │ ├── cencode.h │ ├── decode.h │ ├── encode.h │ └── rcnb.h ├── librcnb-config-version.cmake.in ├── librcnb-config.cmake.in └── src ├── cdecode.c ├── cencode.c ├── rcnb.c ├── rcnb_arm64.c └── rcnb_x86.c /.appveyor.yml: -------------------------------------------------------------------------------- 1 | image: 2 | - Ubuntu1604 3 | - Ubuntu1804 4 | - Visual Studio 2015 5 | - Visual Studio 2017 6 | - Visual Studio 2019 7 | 8 | configuration: 9 | - Release 10 | 11 | before_build: 12 | - |- 13 | mkdir build 14 | cd build 15 | cmake --version 16 | - sh: cmake .. 17 | - cmd: cmake .. -D CMAKE_BUILD_TYPE=%CONFIGURATION% 18 | 19 | build_script: 20 | - sh: cmake --build . --config $CONFIGURATION 21 | - cmd: cmake --build . --config %CONFIGURATION% 22 | 23 | test_script: 24 | - echo "The Quick Brown RC Jumps Over the NB Dog." 
> in.txt 25 | - cmd: |- 26 | .\\%CONFIGURATION%\\rcnb -e in.txt out.rcnb 27 | .\\%CONFIGURATION%\\rcnb -d out.rcnb out.txt 28 | fc /b in.txt out.txt && exit /b 0 || exit /b 1 29 | - sh: |- 30 | ./rcnb -e in.txt out.rcnb 31 | ./rcnb -d out.rcnb out.txt 32 | cmp in.txt out.txt 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | cmake-build-debug 3 | .vscode 4 | .vs 5 | build 6 | *.o 7 | *.a 8 | *.so 9 | *.dll 10 | out 11 | -------------------------------------------------------------------------------- /CMakeLists.txt: --------------------------------------------------------------------------------
###
### CMake settings
###
# 3.1 stays the minimum (first release with the CXX_STANDARD target property);
# the upper bound opts in to the policies of releases up to 3.14.
cmake_minimum_required(VERSION 3.1...3.14)

###
### Project settings
###
# project(VERSION) defines PROJECT_VERSION and PROJECT_VERSION_{MAJOR,MINOR,PATCH},
# which the version template and the library properties below rely on.
project(librcnb VERSION 1.0.0 LANGUAGES C CXX)

# Provides CMAKE_INSTALL_{BINDIR,LIBDIR,INCLUDEDIR} for the usage requirements
# and the install rules.
include(GNUInstallDirs)

# Optional hand-optimized code paths. AVX2 takes precedence over SSSE3.
option(ENABLE_AVX2 "Enable AVX2 optimized code." OFF)
option(ENABLE_SSSE3 "Enable SSSE3 optimized code." OFF)
option(ENABLE_NEON "Enable NEON optimized code." OFF)
# Adds -march=native on non-MSVC compilers; the result is not portable.
option(NATIVE_ASM "Allow compiler use best instruction set on current environment." OFF)

###
### Sources, headers, directories and libs
###
set(RCNB_SOURCES
  src/cencode.c
  src/cdecode.c
  src/rcnb.c
)

set(RCNB_PUBLIC_HEADERS
  include/rcnb/cencode.h
  include/rcnb/cdecode.h
  include/rcnb/encode.h
  include/rcnb/decode.h
)

# Options for every C target (the C sources contain non-ASCII literals).
set(RCNB_C_OPTIONS "")
# Options/definitions selecting the optimized code paths of the library.
set(RCNB_ISA_OPTIONS "")
set(RCNB_ISA_DEFINITIONS "")

if(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
  set(RCNB_USING_MSVC TRUE)
  list(APPEND RCNB_C_OPTIONS /utf-8)
else()
  set(RCNB_USING_MSVC FALSE)
endif()

if(ENABLE_AVX2)
  if(RCNB_USING_MSVC)
    list(APPEND RCNB_ISA_OPTIONS /arch:AVX2)
  else()
    list(APPEND RCNB_ISA_OPTIONS -mavx2)
  endif()
  list(APPEND RCNB_ISA_DEFINITIONS ENABLE_AVX2)
  list(APPEND RCNB_SOURCES src/rcnb_x86.c)
elseif(ENABLE_SSSE3)
  if(NOT RCNB_USING_MSVC)
    list(APPEND RCNB_ISA_OPTIONS -mssse3)
  endif()
  list(APPEND RCNB_ISA_DEFINITIONS ENABLE_SSSE3)
  list(APPEND RCNB_SOURCES src/rcnb_x86.c)
endif()

if(ENABLE_NEON)
  list(APPEND RCNB_ISA_DEFINITIONS ENABLE_NEON)
  list(APPEND RCNB_SOURCES src/rcnb_arm64.c)
endif()

if(NATIVE_ASM AND NOT RCNB_USING_MSVC)
  list(APPEND RCNB_ISA_OPTIONS -march=native)
endif()

add_library(rcnb SHARED ${RCNB_SOURCES})
add_library(rcnb-static STATIC ${RCNB_SOURCES})

# Both library flavours share the same properties and usage requirements.
foreach(rcnb_lib rcnb rcnb-static)
  set_target_properties(${rcnb_lib} PROPERTIES
    VERSION ${PROJECT_VERSION}
    SOVERSION ${PROJECT_VERSION_MAJOR}
    PUBLIC_HEADER "${RCNB_PUBLIC_HEADERS}")
  # The build tree uses the source headers, installed consumers the install dir.
  target_include_directories(${rcnb_lib} PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
  if(RCNB_C_OPTIONS OR RCNB_ISA_OPTIONS)
    target_compile_options(${rcnb_lib} PRIVATE ${RCNB_C_OPTIONS} ${RCNB_ISA_OPTIONS})
  endif()
  if(RCNB_ISA_DEFINITIONS)
    target_compile_definitions(${rcnb_lib} PRIVATE ${RCNB_ISA_DEFINITIONS})
  endif()
endforeach()

# Example programs; all of them link the static library and inherit its
# include directory through the PUBLIC usage requirement above.
add_executable(example1 examples/c-example1.c)
add_executable(example2 examples/c-example2.c)
add_executable(example3 examples/cpp-example3.cc)
add_executable(rcnb-cli examples/cpp-rcnb-cli.cc)

foreach(rcnb_example example1 example2 example3 rcnb-cli)
  target_link_libraries(${rcnb_example} PRIVATE rcnb-static)
endforeach()

# The C examples need the same source-charset option as the library.
if(RCNB_C_OPTIONS)
  foreach(rcnb_c_example example1 example2)
    target_compile_options(${rcnb_c_example} PRIVATE ${RCNB_C_OPTIONS})
  endforeach()
endif()

set_target_properties(example3
  PROPERTIES CXX_STANDARD 11)
# The command line tool is built as "rcnb".
set_target_properties(rcnb-cli
  PROPERTIES OUTPUT_NAME rcnb
  CXX_STANDARD 11)

###
### General compilation settings
###
# Default to an optimized build, but only for single-configuration generators.
if(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

###
### General install settings
###
# Build-tree package: exports the static library for use without installing.
export(
  TARGETS rcnb-static
  FILE "${PROJECT_BINARY_DIR}/${PROJECT_NAME}-targets.cmake")
export(PACKAGE ${PROJECT_NAME})
set(EXPORT_TARGETS rcnb-static CACHE INTERNAL "export targets")

set(CONFIG_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/include")
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}-config.cmake.in
  "${PROJECT_BINARY_DIR}/${PROJECT_NAME}-config.cmake" @ONLY)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}-config-version.cmake.in
  "${PROJECT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake" @ONLY)

# Installed package: the shared library plus its public headers.
# ARCHIVE covers the import library produced on DLL platforms.
install(TARGETS rcnb EXPORT ${PROJECT_NAME}-config
  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rcnb)
install(EXPORT ${PROJECT_NAME}-config DESTINATION share/${PROJECT_NAME}/cmake)
-------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | librcnb: RCNB Encoding/Decoding Routines 2 | ====================================== 3 | 4 | Compiling: 5 | --------- 6 | ``` 7 | cmake . 
8 | make 9 | ``` 10 | 11 | Installing: 12 | ---------- 13 | ``` 14 | sudo make install 15 | ``` 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 rikakomoe 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | librcnb: RCNB Encoding/Decoding Routines 2 | ====================================== 3 | 4 | Overview: 5 | -------- 6 | librcnb is a library of ANSI C routines for fast encoding/decoding data into 7 | and from a rcnb-encoded format. C++ wrappers are included, as well as the 8 | source code for standalone encoding and decoding executables. 
9 | 10 | References: 11 | ---------- 12 | * RCNB.js: 13 | https://github.com/rcnbapp/RCNB.js 14 | * RCNB.php, another implementation of the rcnb encoding: 15 | https://github.com/rcnbapp/RCNB.php 16 | 17 | Commandline Use: 18 | --------------- 19 | There is an executable available; it is simply called rcnb. 20 | It can encode and decode files, as instructed by the user. 21 | 22 | To encode a file: 23 | $ ./rcnb -e filea fileb 24 | fileb will now be the rcnb-encoded version of filea. 25 | 26 | To decode a file: 27 | $ ./rcnb -d fileb filec 28 | filec will now be identical to filea. 29 | 30 | Programming: 31 | ----------- 32 | Some C++ wrappers are provided as well, so you don't have to get your hands 33 | dirty. Encoding from standard input to standard output is as simple as 34 | 35 | #include <rcnb/encode.h> 36 | #include <iostream> 37 | int main() 38 | { 39 | rcnb::encoder E; 40 | setlocale(LC_ALL, ""); 41 | E.encode(std::cin, std::wcout); 42 | return 0; 43 | } 44 | 45 | Both standalone executables and a static library are provided in the package. 46 | 47 | Example code: 48 | ------------ 49 | The 'examples' directory contains some simple example code that demonstrates 50 | how to use the interface of the library. 51 | 52 | More information: 53 | ---------------- 54 | Go to https://github.com/rcnbapp/librcnb/wiki to find out more information 55 | about librcnb. 56 | -------------------------------------------------------------------------------- /examples/c-example1.c: -------------------------------------------------------------------------------- 1 | /* 2 | c-example1.c - librcnb example code 3 | 4 | This is part of the librcnb project, and has been placed in the public domain. 5 | For details, see https://github.com/rikakomoe/librcnb 6 | 7 | This is a short example of how to use librcnb's C interface to encode 8 | and decode a string directly. 
9 | */ 10 | 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | /* arbitrary buffer size */ 22 | #define SIZE 256 23 | 24 | int main() 25 | { 26 | const char* input = "The Quick Brown RC Jumps Over the NB Dog."; 27 | const wchar_t* rcnb_contrast = L"ȐčnÞȒċƝÞȐĈnƁȒȼǹþȓĆǹƃřČŇbȓƇńƄȓċȵƀȐĉņþŕƇNƅɌĉŇBȓƈȠßŕƇŃBɌċnþȓȼǸƅɌćÑbȒċƝÞƦȻƝƃŕƇNbȓƇNþŕC"; 28 | wchar_t* encoded = malloc(SIZE * sizeof(wchar_t)); 29 | char* decoded = malloc(SIZE); 30 | 31 | setlocale(LC_ALL, ""); 32 | 33 | /* encode the data */ 34 | rcnb_encode(input, strlen(input), encoded); 35 | wprintf(L"encoded: %ls\n", encoded); 36 | 37 | /* decode the data */ 38 | ptrdiff_t res = rcnb_decode(encoded, wcslen(encoded), decoded); 39 | if (res < 0) 40 | wprintf(L"decode failed\n"); 41 | printf("decoded: %s\n", decoded); 42 | 43 | /* compare the original and decoded data */ 44 | assert(strcmp(input, decoded) == 0); 45 | assert(wcscmp(encoded, rcnb_contrast) == 0); 46 | 47 | free(encoded); 48 | free(decoded); 49 | return 0; 50 | } -------------------------------------------------------------------------------- /examples/c-example2.c: -------------------------------------------------------------------------------- 1 | /* 2 | c-example2.c - librcnb example code 3 | 4 | This is part of the librcnb project, and has been placed in the public domain. 5 | For details, see https://github.com/rikakomoe/librcnb 6 | 7 | This is a short example of how to use librcnb's C interface to encode 8 | and decode a file directly. 9 | 10 | The main work is done between the START/STOP ENCODING and DECODING lines. 11 | The main difference between this code and c-example1.c is that we do not 12 | know the size of the input file before hand, and so we use to iterate over 13 | encoding and decoding the data. 
14 | */ 15 | 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | /* arbitrary buffer size */ 26 | #define SIZE 100 27 | 28 | void encode(FILE* inputFile, FILE* outputFile) 29 | { 30 | /* set up a destination buffer large enough to hold the encoded data */ 31 | int size = SIZE; 32 | char* input = (char*)malloc(size); 33 | wchar_t* encoded = (wchar_t*)malloc(3 * size * sizeof(wchar_t)); /* ~2 x input */ 34 | /* we need an encoder and decoder state */ 35 | rcnb_encodestate es; 36 | /* store the number of bytes encoded by a single call */ 37 | size_t cnt = 0; 38 | 39 | /*---------- START ENCODING ----------*/ 40 | /* initialise the encoder state */ 41 | rcnb_init_encodestate(&es); 42 | /* gather data from the input and send it to the output */ 43 | while (true) 44 | { 45 | cnt = fread(input, sizeof(char), size, inputFile); 46 | if (cnt == 0) break; 47 | rcnb_encode_block(input, cnt, encoded, &es); 48 | /* output the encoded bytes to the output file */ 49 | fputws(encoded, outputFile); 50 | } 51 | /* since we have reached the end of the input file, we know that 52 | there is no more input data; finalise the encoding */ 53 | rcnb_encode_blockend(encoded, &es); 54 | /* write the last bytes to the output file */ 55 | fputws(encoded, outputFile); 56 | /*---------- STOP ENCODING ----------*/ 57 | 58 | free(encoded); 59 | free(input); 60 | } 61 | 62 | void decode(FILE* inputFile, FILE* outputFile) 63 | { 64 | /* set up a destination buffer large enough to hold the decoded data */ 65 | int size = SIZE; 66 | wchar_t* encoded = (wchar_t*)malloc(3 * size * sizeof(wchar_t)); 67 | char* decoded = (char*)malloc(size); /* ~1/2 x encoded */ 68 | /* we need an encoder and decoder state */ 69 | rcnb_decodestate ds; 70 | /* store the number of bytes encoded by a single call */ 71 | size_t cnt = 0; 72 | 73 | /*---------- START DECODING ----------*/ 74 | /* initialise the encoder state */ 75 | rcnb_init_decodestate(&ds); 76 | /* 
gather data from the input and send it to the output */ 77 | while (fgetws(encoded, size, inputFile)) 78 | { 79 | cnt = rcnb_decode_block(encoded, wcslen(encoded), decoded, &ds); 80 | /* output the encoded bytes to the output file */ 81 | fwrite(decoded, sizeof(char), cnt, outputFile); 82 | } 83 | /* since we have reached the end of the input file, we know that 84 | there is no more input data; finalise the decoding */ 85 | cnt = rcnb_decode_blockend(decoded, &ds); 86 | /* write the last bytes to the output file */ 87 | fwrite(decoded, sizeof(char), cnt, outputFile); 88 | /*---------- STOP DECODING ----------*/ 89 | 90 | free(encoded); 91 | free(decoded); 92 | } 93 | 94 | int main(int argc, char** argv) 95 | { 96 | FILE* inputFile; 97 | FILE* encodedFile; 98 | FILE* decodedFile; 99 | 100 | if (argc < 4) 101 | { 102 | printf("please supply three filenames: input, encoded & decoded\n"); 103 | exit(-1); 104 | } 105 | 106 | /* encode the input file */ 107 | 108 | setlocale(LC_ALL, ""); 109 | inputFile = fopen(argv[1], "rb, ccs=UTF-8"); 110 | encodedFile = fopen(argv[2], "wb, ccs=UTF-8"); 111 | 112 | encode(inputFile, encodedFile); 113 | 114 | fclose(inputFile); 115 | fclose(encodedFile); 116 | 117 | /* decode the encoded file */ 118 | 119 | encodedFile = fopen(argv[2], "rb, ccs=UTF-8"); 120 | decodedFile = fopen(argv[3], "wb, ccs=UTF-8"); 121 | 122 | decode(encodedFile, decodedFile); 123 | 124 | fclose(encodedFile); 125 | fclose(decodedFile); 126 | 127 | return 0; 128 | } 129 | 130 | -------------------------------------------------------------------------------- /examples/cpp-example3.cc: -------------------------------------------------------------------------------- 1 | /* 2 | cpp-example3.cc - c++ source to a rcnb reference encoder and decoder 3 | 4 | This is part of the librcnb project, and has been placed in the public domain. 
5 | For details, see https://github.com/rikakomoe/librcnb 6 | */ 7 | 8 | #include 9 | #include 10 | 11 | int main() 12 | { 13 | rcnb::encoder E; 14 | setlocale(LC_ALL, ""); 15 | E.encode(std::cin, std::wcout); 16 | return 0; 17 | } 18 | -------------------------------------------------------------------------------- /examples/cpp-rcnb-cli.cc: -------------------------------------------------------------------------------- 1 | /* 2 | rcnb.cc - c++ source to a rcnb reference encoder and decoder 3 | 4 | This is part of the librcnb project, and has been placed in the public domain. 5 | For details, see https://github.com/rikakomoe/librcnb 6 | */ 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | void usage() 18 | { 19 | std::cerr << \ 20 | "rcnb: Encodes and Decodes files using rcnb\n" \ 21 | "Usage: rcnb [-e|-d] [input] [output]\n" \ 22 | " Where [-e] will encode the input file into the output file,\n" \ 23 | " [-d] will decode the input file into the output file, and\n" \ 24 | " [input] and [output] are the input and output files, respectively.\n"; 25 | } 26 | 27 | void usage(const std::string& message) 28 | { 29 | usage(); 30 | std::cerr << "Incorrect invocation of rcnb:\n"; 31 | std::cerr << message << std::endl; 32 | } 33 | 34 | int main(int argc, char** argv) 35 | { 36 | if (argc == 1) 37 | { 38 | usage(); 39 | exit(-1); 40 | } 41 | if (argc != 4) 42 | { 43 | usage("Wrong number of arguments!"); 44 | exit(-1); 45 | } 46 | 47 | std::string input = argv[2]; 48 | std::string output = argv[3]; 49 | 50 | // determine whether we need to encode or decode: 51 | std::string choice = argv[1]; 52 | if (choice == "-d") 53 | { 54 | std::wifstream instream(input.c_str(), std::ios_base::in | std::ios_base::binary); 55 | if (!instream.is_open()) 56 | { 57 | usage("Could not open input file!"); 58 | exit(-1); 59 | } 60 | std::locale utf8_locale(std::locale(), new std::codecvt_utf8); 61 | instream.imbue(utf8_locale); 
62 | 63 | std::ofstream outstream(output.c_str(), std::ios_base::out | std::ios_base::binary); 64 | if (!outstream.is_open()) 65 | { 66 | usage("Could not open output file!"); 67 | exit(-1); 68 | } 69 | rcnb::decoder D; 70 | D.decode(instream, outstream); 71 | } 72 | else if (choice == "-e") 73 | { 74 | std::ifstream instream(input.c_str(), std::ios_base::in | std::ios_base::binary); 75 | if (!instream.is_open()) 76 | { 77 | usage("Could not open input file!"); 78 | exit(-1); 79 | } 80 | 81 | std::wofstream outstream(output.c_str(), std::ios_base::out | std::ios_base::binary); 82 | if (!outstream.is_open()) 83 | { 84 | usage("Could not open output file!"); 85 | exit(-1); 86 | } 87 | std::locale utf8_locale(std::locale(), new std::codecvt_utf8); 88 | outstream.imbue(utf8_locale); 89 | rcnb::encoder E; 90 | E.encode(instream, outstream); 91 | } 92 | else 93 | { 94 | std::cout<<"["< 12 | #include 13 | 14 | typedef struct 15 | { 16 | size_t i; 17 | wchar_t trailing_code[4]; 18 | } rcnb_decodestate; 19 | 20 | void rcnb_init_decodestate(rcnb_decodestate* state_in); 21 | ptrdiff_t rcnb_decode_block(const wchar_t* code_in, size_t length_in, char* plaintext_out, rcnb_decodestate* state_in); 22 | ptrdiff_t rcnb_decode_blockend(char* plaintext_out, rcnb_decodestate* state_in); 23 | ptrdiff_t rcnb_decode(const wchar_t* code_in, size_t length_in, char* plaintext_out); 24 | 25 | int rcnb_decode_32n_asm(const char *value_in, char *value_out, size_t n); 26 | 27 | #endif //RCNB_CDECODE_H 28 | -------------------------------------------------------------------------------- /include/rcnb/cencode.h: -------------------------------------------------------------------------------- 1 | /* 2 | cencode.h - c header for an rcnb encoding algorithm 3 | 4 | This is part of the librcnb project, and has been placed in the public domain. 
5 | For details, see https://github.com/rikakomoe/librcnb 6 | */ 7 | 8 | #ifndef RCNB_CENCODE_H 9 | #define RCNB_CENCODE_H 10 | 11 | #include 12 | #include 13 | 14 | typedef struct 15 | { 16 | bool cached; 17 | char trailing_byte; 18 | } rcnb_encodestate; 19 | 20 | void rcnb_init_encodestate(rcnb_encodestate* state_in); 21 | size_t rcnb_encode_block(const char* plaintext_in, size_t length_in, wchar_t* code_out, rcnb_encodestate* state_in); 22 | size_t rcnb_encode_blockend(wchar_t* code_out, rcnb_encodestate* state_in); 23 | size_t rcnb_encode(const char* plaintext_in, size_t length_in, wchar_t* code_out); 24 | 25 | void rcnb_encode_32n_asm(const char *value_in, char *value_out, size_t n); 26 | 27 | #endif /* RCNB_CENCODE_H */ 28 | -------------------------------------------------------------------------------- /include/rcnb/decode.h: -------------------------------------------------------------------------------- 1 | // :mode=c++: 2 | 3 | /* 4 | decode.h - c++ wrapper for an rcnb encoding algorithm 5 | 6 | This is part of the librcnb project, and has been placed in the public domain. 
7 | For details, see https://github.com/rikakomoe/librcnb 8 | */ 9 | 10 | #ifndef RCNB_DECODE_H 11 | #define RCNB_DECODE_H 12 | 13 | #define BUFFERSIZE 4096 14 | 15 | #include 16 | 17 | namespace rcnb { 18 | 19 | extern "C" { 20 | #include "cdecode.h" 21 | } 22 | 23 | struct decoder { 24 | rcnb_decodestate _state; 25 | int _buffersize; 26 | 27 | explicit decoder(int buffersize_in = BUFFERSIZE) : _buffersize(buffersize_in) 28 | { 29 | } 30 | 31 | void initialize() 32 | { 33 | rcnb_init_decodestate(&_state); 34 | } 35 | 36 | ptrdiff_t decode(const wchar_t* code_in, size_t length_in, char* plaintext_out) { 37 | return rcnb_decode_block(code_in, length_in, plaintext_out, &_state); 38 | } 39 | 40 | ptrdiff_t decode_end(char* const plaintext_out) { 41 | return rcnb_decode_blockend(plaintext_out, &_state); 42 | } 43 | 44 | void decode(std::wistream& istream_in, std::ostream& ostream_in) 45 | { 46 | initialize(); 47 | 48 | const int N = _buffersize; 49 | char* plaintext = new char[N]; 50 | auto code = new wchar_t[3 * N]; 51 | long plainlength; 52 | long codelength; 53 | 54 | do { 55 | istream_in.read(code, N); 56 | codelength = istream_in.gcount(); 57 | 58 | plainlength = decode(code, codelength, plaintext); 59 | ostream_in.write(plaintext, plainlength); 60 | } while (istream_in.good() && plainlength > 0); 61 | 62 | plainlength = decode_end(plaintext); 63 | ostream_in.write(plaintext, plainlength); 64 | 65 | delete[] code; 66 | delete[] plaintext; 67 | } 68 | }; 69 | 70 | } // namespace rcnb 71 | 72 | #endif // RCNB_DECODE_H -------------------------------------------------------------------------------- /include/rcnb/encode.h: -------------------------------------------------------------------------------- 1 | // :mode=c++: 2 | 3 | /* 4 | encode.h - c++ wrapper for an rcnb encoding algorithm 5 | 6 | This is part of the librcnb project, and has been placed in the public domain. 
7 | For details, see https://github.com/rikakomoe/librcnb 8 | */ 9 | 10 | #ifndef RCNB_ENCODE_H 11 | #define RCNB_ENCODE_H 12 | 13 | #define BUFFERSIZE 4096 14 | 15 | #include 16 | 17 | namespace rcnb { 18 | 19 | extern "C" { 20 | #include "cencode.h" 21 | } 22 | 23 | struct encoder { 24 | rcnb_encodestate _state; 25 | int _buffersize; 26 | 27 | explicit encoder(int buffersize_in = BUFFERSIZE) : _buffersize(buffersize_in) 28 | { 29 | } 30 | 31 | void initialize() 32 | { 33 | rcnb_init_encodestate(&_state); 34 | } 35 | 36 | size_t encode(const char* plaintext_in, size_t length_in, wchar_t* const code_out) { 37 | return rcnb_encode_block(plaintext_in, length_in, code_out, &_state); 38 | } 39 | 40 | size_t encode_end(wchar_t* const code_out) { 41 | return rcnb_encode_blockend(code_out, &_state); 42 | } 43 | 44 | void encode(std::istream& istream_in, std::wostream& ostream_in) 45 | { 46 | initialize(); 47 | 48 | const int N = _buffersize; 49 | char* plaintext = new char[N]; 50 | auto code = new wchar_t[3 * N]; 51 | long plainlength; 52 | long codelength; 53 | 54 | do { 55 | istream_in.read(plaintext, N); 56 | plainlength = istream_in.gcount(); 57 | 58 | codelength = encode(plaintext, plainlength, code); 59 | ostream_in.write(code, codelength); 60 | } while (istream_in.good() && plainlength > 0); 61 | 62 | codelength = encode_end(code); 63 | ostream_in.write(code, codelength); 64 | 65 | delete[] code; 66 | delete[] plaintext; 67 | } 68 | }; 69 | 70 | } // namespace rcnb 71 | 72 | #endif // RCNB_ENCODE_H -------------------------------------------------------------------------------- /include/rcnb/rcnb.h: -------------------------------------------------------------------------------- 1 | /* 2 | rcnb.h - c header to an rcnb encoding algorithm 3 | 4 | This is part of the librcnb project, and has been placed in the public domain. 
5 | For details, see https://github.com/rikakomoe/librcnb 6 | */ 7 | 8 | #ifndef RCNB_RCNB_H 9 | #define RCNB_RCNB_H 10 | 11 | #include 12 | 13 | extern const wchar_t cr[]; 14 | extern const wchar_t cc[]; 15 | extern const wchar_t cn[]; 16 | extern const wchar_t cb[]; 17 | 18 | extern const unsigned short sr; 19 | extern const unsigned short sc; 20 | extern const unsigned short sn; 21 | extern const unsigned short sb; 22 | 23 | #define src (sr * sc) 24 | #define snb (sn * sb) 25 | #define scnb (sc * snb) 26 | 27 | #endif // RCNB_RCNB_H 28 | -------------------------------------------------------------------------------- /librcnb-config-version.cmake.in: -------------------------------------------------------------------------------- 1 | set(PACKAGE_VERSION "@PROJECT_VERSION@") 2 | 3 | # Check whether the requested PACKAGE_FIND_VERSION is compatible 4 | if("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}") 5 | set(PACKAGE_VERSION_COMPATIBLE FALSE) 6 | else() 7 | set(PACKAGE_VERSION_COMPATIBLE TRUE) 8 | if ("${PACKAGE_VERSION}" VERSION_EQUAL "${PACKAGE_FIND_VERSION}") 9 | set(PACKAGE_VERSION_EXACT TRUE) 10 | endif() 11 | endif() 12 | -------------------------------------------------------------------------------- /librcnb-config.cmake.in: -------------------------------------------------------------------------------- 1 | # - Config file for the librcnb package 2 | # It defines the following variables 3 | # RCNB_INCLUDE_DIR - include directory 4 | # RCNB_LIBRARIES - libraries to link against 5 | 6 | # Compute paths 7 | get_filename_component(RCNB_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) 8 | set(RCNB_INCLUDE_DIR "@CONFIG_INCLUDE_DIRS@") 9 | 10 | # Our library dependencies (contains definitions for IMPORTED targets) 11 | include("${RCNB_CMAKE_DIR}/@PROJECT_NAME@-targets.cmake") 12 | 13 | # These are IMPORTED targets created by librcnb-targets.cmake 14 | set(RCNB_LIBRARIES "@EXPORT_TARGETS@") 15 | 
-------------------------------------------------------------------------------- /src/cdecode.c: -------------------------------------------------------------------------------- 1 | /* 2 | cdecode.c - c source to an rcnb encoding algorithm 3 | 4 | This is part of the librcnb project, and has been placed in the public domain. 5 | For details, see https://github.com/rikakomoe/librcnb 6 | */ 7 | 8 | #include 9 | #include 10 | 11 | int find(const wchar_t* const arr, const unsigned length, const wchar_t target) 12 | { 13 | for (const wchar_t* iter = arr; iter != arr + length; ++iter) { 14 | if (*iter == target) 15 | return (int)(iter - arr); 16 | } 17 | return -1; 18 | } 19 | 20 | void rcnb_init_decodestate(rcnb_decodestate* state_in) 21 | { 22 | state_in->i = 0; 23 | } 24 | 25 | bool rcnb_decode_short(const wchar_t* value_in, char** value_out) 26 | { 27 | bool reverse = find(cr, sr, *value_in) < 0; 28 | int idx[4]; 29 | if (!reverse) { 30 | idx[0] = find(cr, sr, *value_in); 31 | idx[1] = find(cc, sc, *(value_in + 1)); 32 | idx[2] = find(cn, sn, *(value_in + 2)); 33 | idx[3] = find(cb, sb, *(value_in + 3)); 34 | } else { 35 | idx[0] = find(cr, sr, *(value_in + 2)); 36 | idx[1] = find(cc, sc, *(value_in + 3)); 37 | idx[2] = find(cn, sn, *value_in); 38 | idx[3] = find(cb, sb, *(value_in + 1)); 39 | } 40 | if (idx[0] < 0 || idx[1] < 0 || idx[2] < 0 || idx[3] < 0) 41 | return false; 42 | int result = idx[0] * scnb + idx[1] * snb + idx[2] * sb + idx[3]; 43 | if (result > 0x7FFF) 44 | return false; 45 | result = reverse ? 
result | 0x8000 : result; 46 | *(*value_out)++ = (char)(result >> 8); 47 | *(*value_out)++ = (char)(result & 0xFF); 48 | return true; 49 | } 50 | 51 | bool rcnb_decode_byte(const wchar_t* value_in, char** value_out) 52 | { 53 | bool nb = false; 54 | int idx[2] = { find(cr, sr, *value_in), find(cc, sc, *(value_in + 1)) }; 55 | if (idx[0] < 0 || idx[1] < 0) { 56 | idx[0] = find(cn, sn, *value_in); 57 | idx[1] = find(cb, sb, *(value_in + 1)); 58 | nb = true; 59 | } 60 | if (idx[0] < 0 || idx[1] < 0) 61 | return false; 62 | int result = nb ? idx[0] * sb + idx[1] : idx[0] * sc + idx[1]; 63 | if (result > 0x7F) 64 | return false; 65 | *(*value_out)++ = (char)(nb ? result | 0x80 : result); 66 | return true; 67 | } 68 | 69 | ptrdiff_t rcnb_decode_block(const wchar_t* code_in, size_t length_in, 70 | char* const plaintext_out, rcnb_decodestate* state_in) 71 | { 72 | char* plaintext_char = plaintext_out; 73 | bool res; 74 | while (state_in->i < 4 && length_in > 0) { 75 | state_in->trailing_code[state_in->i++] = code_in[0]; 76 | length_in--; 77 | code_in++; 78 | } 79 | if (length_in == 0) 80 | return 0; 81 | res = rcnb_decode_short(state_in->trailing_code, &plaintext_char); 82 | if (!res) 83 | return -1; 84 | state_in->i = 0; 85 | #if defined(ENABLE_AVX2) || defined(ENABLE_SSSE3) || defined(ENABLE_NEON) 86 | size_t batch = length_in >> 6; 87 | if (batch > 0) { 88 | res = rcnb_decode_32n_asm((const char *)code_in, plaintext_char, batch); 89 | if (!res) 90 | return -1; 91 | } 92 | plaintext_char += 32 * batch; 93 | code_in += 64 * batch; 94 | length_in = length_in & 63; 95 | #endif 96 | for (int i = 0; i < (length_in >> 2); ++i) { 97 | res = rcnb_decode_short(code_in + i * 4, &plaintext_char); 98 | if (!res) 99 | return -1; 100 | } 101 | state_in->i = length_in % 4; 102 | for (size_t j = 0; j < state_in->i; ++j) { 103 | state_in->trailing_code[j] = code_in[length_in - state_in->i + j]; 104 | } 105 | *plaintext_char = 0; 106 | return plaintext_char - plaintext_out; 107 | } 108 | 
109 | ptrdiff_t rcnb_decode_blockend(char* const plaintext_out, rcnb_decodestate* state_in) 110 | { 111 | if (state_in->i != 0 && state_in->i != 2) 112 | return -1; 113 | char* plaintext_char = plaintext_out; 114 | if (state_in->i == 2) { 115 | if(!rcnb_decode_byte(state_in->trailing_code, &plaintext_char)) 116 | return -1; 117 | } 118 | *plaintext_char = 0; 119 | state_in->i = 0; 120 | return plaintext_char - plaintext_out; 121 | } 122 | 123 | ptrdiff_t rcnb_decode(const wchar_t* code_in, size_t length_in, char* plaintext_out) 124 | { 125 | if (length_in == 0) 126 | return 0; 127 | rcnb_decodestate es; 128 | rcnb_init_decodestate(&es); 129 | size_t output_size = 0; 130 | ptrdiff_t block_size = 0; 131 | block_size = rcnb_decode_block(code_in, length_in, plaintext_out, &es); 132 | if (block_size < 0) 133 | return -1; 134 | output_size += block_size; 135 | block_size = rcnb_decode_blockend(plaintext_out + output_size, &es); 136 | if (block_size < 0) 137 | return -1; 138 | return output_size + block_size; 139 | } 140 | -------------------------------------------------------------------------------- /src/cencode.c: -------------------------------------------------------------------------------- 1 | /* 2 | cencode.c - c source to an rcnb encoding algorithm 3 | 4 | This is part of the librcnb project, and has been placed in the public domain. 
5 | For details, see https://github.com/rikakomoe/librcnb 6 | */ 7 | 8 | #include 9 | #include 10 | 11 | void rcnb_init_encodestate(rcnb_encodestate* state_in) 12 | { 13 | state_in->cached = false; 14 | } 15 | 16 | void rcnb_encode_short(unsigned short value_in, wchar_t** value_out) 17 | { 18 | bool reverse = false; 19 | if (value_in > 0x7FFF) { 20 | reverse = true; 21 | value_in = (unsigned short)(value_in & 0x7FFF); 22 | } 23 | if (reverse) 24 | *value_out += 2; 25 | *(*value_out)++ = cr[value_in / scnb]; 26 | *(*value_out)++ = cc[value_in % scnb / snb]; 27 | if (reverse) 28 | *value_out -= 4; 29 | *(*value_out)++ = cn[value_in % snb / sb]; 30 | *(*value_out)++ = cb[value_in % sb]; 31 | if (reverse) 32 | *value_out += 2; 33 | } 34 | 35 | void rcnb_encode_byte(unsigned char value_in, wchar_t** value_out) 36 | { 37 | if (value_in > 0x7F) { 38 | value_in = (unsigned char)(value_in & 0x7F); 39 | *(*value_out)++ = cn[value_in / sb]; 40 | *(*value_out)++ = cb[value_in % sb]; 41 | return; 42 | } 43 | *(*value_out)++ = cr[value_in / sc]; 44 | *(*value_out)++ = cc[value_in % sc]; 45 | } 46 | 47 | size_t rcnb_encode_block(const char* plaintext_in, size_t length_in, 48 | wchar_t* const code_out, rcnb_encodestate* state_in) 49 | { 50 | if (length_in == 0) 51 | return 0; 52 | wchar_t* code_char = code_out; 53 | if (state_in->cached) { 54 | rcnb_encode_short(*(unsigned char*)(&state_in->trailing_byte) << 8 | *(unsigned char*)(&plaintext_in[0]), 55 | &code_char); 56 | plaintext_in++; 57 | length_in--; 58 | state_in->cached = false; 59 | } 60 | #if defined(ENABLE_AVX2) || defined(ENABLE_SSSE3) || defined(ENABLE_NEON) 61 | size_t batch = length_in >> 5; 62 | if (batch > 0) { 63 | rcnb_encode_32n_asm(plaintext_in, (char *) code_char, batch); 64 | } 65 | plaintext_in += 32 * batch; 66 | code_char += 64 * batch; 67 | length_in = length_in & 31; 68 | #endif 69 | for (int i = 0; i < (length_in >> 1); ++i) 70 | rcnb_encode_short(*(unsigned char*)(&plaintext_in[i * 2]) << 8 | *(unsigned 
char*)(&plaintext_in[i * 2 + 1]), 71 | &code_char); 72 | if (length_in & 1) { 73 | state_in->trailing_byte = plaintext_in[length_in - 1]; 74 | state_in->cached = true; 75 | } 76 | *code_char = 0; 77 | return code_char - code_out; 78 | } 79 | 80 | size_t rcnb_encode_blockend(wchar_t* const code_out, rcnb_encodestate* state_in) 81 | { 82 | wchar_t* code_char = code_out; 83 | if (state_in->cached) { 84 | rcnb_encode_byte(*(unsigned char*)(&state_in->trailing_byte), &code_char); 85 | } 86 | *code_char = 0; 87 | state_in->cached = false; 88 | return code_char - code_out; 89 | } 90 | 91 | size_t rcnb_encode(const char* plaintext_in, size_t length_in, wchar_t* code_out) 92 | { 93 | rcnb_encodestate es; 94 | rcnb_init_encodestate(&es); 95 | size_t output_size = 0; 96 | output_size += rcnb_encode_block(plaintext_in, length_in, code_out, &es); 97 | output_size += rcnb_encode_blockend(code_out + output_size, &es); 98 | return output_size; 99 | } 100 | -------------------------------------------------------------------------------- /src/rcnb.c: -------------------------------------------------------------------------------- 1 | /* 2 | rcnb.c - c source to an rcnb encoding algorithm 3 | 4 | This is part of the librcnb project, and has been placed in the public domain. 
5 | For details, see https://github.com/rikakomoe/librcnb 6 | */ 7 | 8 | #include 9 | 10 | const wchar_t cr[] = {'r','R',L'Ŕ',L'ŕ',L'Ŗ',L'ŗ',L'Ř',L'ř',L'Ʀ',L'Ȑ',L'ȑ',L'Ȓ',L'ȓ',L'Ɍ',L'ɍ'}; 11 | const wchar_t cc[] = {'c','C',L'Ć',L'ć',L'Ĉ',L'ĉ',L'Ċ',L'ċ',L'Č',L'č',L'Ƈ',L'ƈ',L'Ç',L'Ȼ',L'ȼ'}; 12 | const wchar_t cn[] = {'n','N',L'Ń',L'ń',L'Ņ',L'ņ',L'Ň',L'ň',L'Ɲ',L'ƞ',L'Ñ',L'Ǹ',L'ǹ',L'Ƞ',L'ȵ'}; 13 | const wchar_t cb[] = {'b','B',L'ƀ',L'Ɓ',L'ƃ',L'Ƅ',L'ƅ',L'ß',L'Þ',L'þ'}; 14 | 15 | const unsigned short sr = sizeof(cr) / sizeof(wchar_t); 16 | const unsigned short sc = sizeof(cc) / sizeof(wchar_t); 17 | const unsigned short sn = sizeof(cn) / sizeof(wchar_t); 18 | const unsigned short sb = sizeof(cb) / sizeof(wchar_t); 19 | -------------------------------------------------------------------------------- /src/rcnb_arm64.c: -------------------------------------------------------------------------------- 1 | /* 2 | cencode_arm64.c - arm64 intrinsic source to an rcnb encoding algorithm 3 | 4 | This is part of the librcnb project, and has been placed in the public domain. 
5 | For details, see https://github.com/rikakomoe/librcnb 6 | */ 7 | 8 | #if defined(ENABLE_NEON) 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | static const unsigned char r_lo[16] = {114, 82, 84, 85, 86, 87, 88, 89, 166, 16, 17, 18, 19, 76, 77}; 15 | static const unsigned char c_lo[16] = {99, 67, 6, 7, 8, 9, 10, 11, 12, 13, 135, 136, 199, 59, 60}; 16 | static const unsigned char n_lo[16] = {110, 78, 67, 68, 69, 70, 71, 72, 157, 158, 209, 248, 249, 32, 53}; 17 | static const unsigned char b_lo[16] = {98, 66, 128, 129, 131, 132, 133, 223, 222, 254}; 18 | 19 | static const unsigned char r_hi[16] = {0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2}; 20 | static const unsigned char c_hi[16] = {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2}; 21 | static const unsigned char n_hi[16] = {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 2}; 22 | static const unsigned char b_hi[16] = {0, 0, 1, 1, 1, 1, 1, 0, 0, 0}; 23 | 24 | static const unsigned char s_tbl[16] = {0, 0, 255, 255, 255, 0, 255, 0}; 25 | 26 | static const unsigned char r_tbl[16] = {14, 8, 0, 255, 2, 3, 4, 5, 6, 7, 9, 10, 11, 1, 12, 13}; 27 | static const unsigned char c_tbl[16] = {13, 3, 9, 14, 4, 0, 5, 255, 10, 6, 11, 1, 7, 12, 2, 8}; 28 | static const unsigned char n_tbl[16] = {10, 3, 255, 4, 8, 0, 5, 9, 6, 1, 7, 13, 11, 14, 2, 12}; 29 | static const unsigned char b_tbl[16] = {2, 3, 255, 255, 4, 255, 5, 6, 8, 7, 255, 1, 9, 255, 255, 0}; 30 | 31 | void rcnb_encode_32n_asm(const char *value_in, char *value_out, size_t n) { 32 | const int16x8_t mask = vdupq_n_s16(0x7fff); 33 | for (size_t i = 0; i < n; ++i) { 34 | int16x8_t sinput1 = (int16x8_t) vrev16q_s8(vld1q_s8((const signed char *) value_in)); 35 | int16x8_t sinput2 = (int16x8_t) vrev16q_s8(vld1q_s8((const signed char *) (value_in + 16))); 36 | value_in += 32; 37 | uint16x8_t sign1 = (uint16x8_t) vshrq_n_s16(sinput1, 15); 38 | uint16x8_t sign2 = (uint16x8_t) vshrq_n_s16(sinput2, 15); 39 | uint16x8_t input1 = (uint16x8_t) vandq_s16(sinput1, mask); 40 | uint16x8_t 
input2 = (uint16x8_t) vandq_s16(sinput2, mask); 41 | 42 | uint32x4_t t1, t2; 43 | uint16x8_t idx_r1, idx_c1, idx_n1, idx_b1; 44 | uint16x8_t idx_r2, idx_c2, idx_n2, idx_b2; 45 | { 46 | t1 = vmull_n_u16(vget_low_u16(input1), 59653); 47 | t2 = vmull_high_n_u16(input1, 59653); 48 | idx_r1 = vuzp2q_u16((uint16x8_t) t1, (uint16x8_t) t2); 49 | idx_r1 = vshrq_n_u16(idx_r1, 11); 50 | 51 | uint16x8_t r_mul_2250 = vmulq_n_u16(idx_r1, 2250); 52 | uint16x8_t i_mod_2250 = vsubq_u16(input1, r_mul_2250); 53 | t1 = vmull_n_u16(vget_low_u16(i_mod_2250), 55925); 54 | t2 = vmull_high_n_u16(i_mod_2250, 55925); 55 | idx_c1 = vuzp2q_u16((uint16x8_t) t1, (uint16x8_t) t2); 56 | idx_c1 = vshrq_n_u16(idx_c1, 7); 57 | 58 | uint16x8_t c_mul_150 = vmlaq_n_u16(r_mul_2250, idx_c1, 150); 59 | uint16x8_t i_mod_150 = vsubq_u16(input1, c_mul_150); 60 | t1 = vmull_n_u16(vget_low_u16(i_mod_150), 52429); 61 | t2 = vmull_high_n_u16(i_mod_150, 52429); 62 | idx_n1 = vuzp2q_u16((uint16x8_t) t1, (uint16x8_t) t2); 63 | idx_n1 = vshrq_n_u16(idx_n1, 3); 64 | 65 | idx_b1 = vsubq_u16(input1, vmlaq_n_u16(c_mul_150, idx_n1, 10)); 66 | } 67 | 68 | { 69 | t1 = vmull_n_u16(vget_low_u16(input2), 59653); 70 | t2 = vmull_high_n_u16(input2, 59653); 71 | idx_r2 = vuzp2q_u16((uint16x8_t) t1, (uint16x8_t) t2); 72 | idx_r2 = vshrq_n_u16(idx_r2, 11); 73 | 74 | uint16x8_t r_mul_2250 = vmulq_n_u16(idx_r2, 2250); 75 | uint16x8_t i_mod_2250 = vsubq_u16(input2, r_mul_2250); 76 | t1 = vmull_n_u16(vget_low_u16(i_mod_2250), 55925); 77 | t2 = vmull_high_n_u16(i_mod_2250, 55925); 78 | idx_c2 = vuzp2q_u16((uint16x8_t) t1, (uint16x8_t) t2); 79 | idx_c2 = vshrq_n_u16(idx_c2, 7); 80 | 81 | uint16x8_t c_mul_150 = vmlaq_n_u16(r_mul_2250, idx_c2, 150); 82 | uint16x8_t i_mod_150 = vsubq_u16(input2, c_mul_150); 83 | t1 = vmull_n_u16(vget_low_u16(i_mod_150), 52429); 84 | t2 = vmull_high_n_u16(i_mod_150, 52429); 85 | idx_n2 = vuzp2q_u16((uint16x8_t) t1, (uint16x8_t) t2); 86 | idx_n2 = vshrq_n_u16(idx_n2, 3); 87 | 88 | idx_b2 = vsubq_u16(input2, 
vmlaq_n_u16(c_mul_150, idx_n2, 10)); 89 | } 90 | 91 | uint8x8_t idx_rt = vmovn_u16(idx_r1); 92 | uint8x16_t idx_r = vmovn_high_u16(idx_rt, idx_r2); 93 | uint8x8_t idx_ct = vmovn_u16(idx_c1); 94 | uint8x16_t idx_c = vmovn_high_u16(idx_ct, idx_c2); 95 | uint8x8_t idx_nt = vmovn_u16(idx_n1); 96 | uint8x16_t idx_n = vmovn_high_u16(idx_nt, idx_n2); 97 | uint8x8_t idx_bt = vmovn_u16(idx_b1); 98 | uint8x16_t idx_b = vmovn_high_u16(idx_bt, idx_b2); 99 | 100 | uint8x16_t r_l = vqtbl1q_u8(vld1q_u8(r_lo), idx_r); 101 | uint8x16_t r_h = vqtbl1q_u8(vld1q_u8(r_hi), idx_r); 102 | uint8x16_t c_l = vqtbl1q_u8(vld1q_u8(c_lo), idx_c); 103 | uint8x16_t c_h = vqtbl1q_u8(vld1q_u8(c_hi), idx_c); 104 | uint8x16_t n_l = vqtbl1q_u8(vld1q_u8(n_lo), idx_n); 105 | uint8x16_t n_h = vqtbl1q_u8(vld1q_u8(n_hi), idx_n); 106 | uint8x16_t b_l = vqtbl1q_u8(vld1q_u8(b_lo), idx_b); 107 | uint8x16_t b_h = vqtbl1q_u8(vld1q_u8(b_hi), idx_b); 108 | 109 | uint16x8_t r1t = (uint16x8_t) vzip1q_u8(r_l, r_h); 110 | uint16x8_t r2t = (uint16x8_t) vzip2q_u8(r_l, r_h); 111 | uint16x8_t c1t = (uint16x8_t) vzip1q_u8(c_l, c_h); 112 | uint16x8_t c2t = (uint16x8_t) vzip2q_u8(c_l, c_h); 113 | uint16x8_t n1t = (uint16x8_t) vzip1q_u8(n_l, n_h); 114 | uint16x8_t n2t = (uint16x8_t) vzip2q_u8(n_l, n_h); 115 | uint16x8_t b1t = (uint16x8_t) vzip1q_u8(b_l, b_h); 116 | uint16x8_t b2t = (uint16x8_t) vzip2q_u8(b_l, b_h); 117 | 118 | if (sizeof(wchar_t) == 2) { 119 | uint16x8x4_t rcnb1, rcnb2; 120 | 121 | rcnb1.val[0] = vbslq_u16(sign1, n1t, r1t); 122 | rcnb1.val[1] = vbslq_u16(sign1, b1t, c1t); 123 | rcnb1.val[2] = vbslq_u16(sign1, r1t, n1t); 124 | rcnb1.val[3] = vbslq_u16(sign1, c1t, b1t); 125 | rcnb2.val[0] = vbslq_u16(sign2, n2t, r2t); 126 | rcnb2.val[1] = vbslq_u16(sign2, b2t, c2t); 127 | rcnb2.val[2] = vbslq_u16(sign2, r2t, n2t); 128 | rcnb2.val[3] = vbslq_u16(sign2, c2t, b2t); 129 | 130 | vst4q_u16((unsigned short *) value_out, rcnb1); 131 | value_out += 64; 132 | vst4q_u16((unsigned short *) value_out, rcnb2); 133 | value_out 
+= 64; 134 | } else if (sizeof(wchar_t) == 4) { 135 | uint16x8_t r1 = vbslq_u16(sign1, n1t, r1t); 136 | uint16x8_t c1 = vbslq_u16(sign1, b1t, c1t); 137 | uint16x8_t n1 = vbslq_u16(sign1, r1t, n1t); 138 | uint16x8_t b1 = vbslq_u16(sign1, c1t, b1t); 139 | uint16x8_t r2 = vbslq_u16(sign2, n2t, r2t); 140 | uint16x8_t c2 = vbslq_u16(sign2, b2t, c2t); 141 | uint16x8_t n2 = vbslq_u16(sign2, r2t, n2t); 142 | uint16x8_t b2 = vbslq_u16(sign2, c2t, b2t); 143 | 144 | uint32x4x4_t rcnb; 145 | 146 | rcnb.val[0] = vmovl_u16(vget_low_u16(r1)); 147 | rcnb.val[1] = vmovl_u16(vget_low_u16(c1)); 148 | rcnb.val[2] = vmovl_u16(vget_low_u16(n1)); 149 | rcnb.val[3] = vmovl_u16(vget_low_u16(b1)); 150 | vst4q_u32((unsigned int *) value_out, rcnb); 151 | value_out += 64; 152 | 153 | rcnb.val[0] = vmovl_u16(vget_high_u16(r1)); 154 | rcnb.val[1] = vmovl_u16(vget_high_u16(c1)); 155 | rcnb.val[2] = vmovl_u16(vget_high_u16(n1)); 156 | rcnb.val[3] = vmovl_u16(vget_high_u16(b1)); 157 | vst4q_u32((unsigned int *) value_out, rcnb); 158 | value_out += 64; 159 | 160 | rcnb.val[0] = vmovl_u16(vget_low_u16(r2)); 161 | rcnb.val[1] = vmovl_u16(vget_low_u16(c2)); 162 | rcnb.val[2] = vmovl_u16(vget_low_u16(n2)); 163 | rcnb.val[3] = vmovl_u16(vget_low_u16(b2)); 164 | vst4q_u32((unsigned int *) value_out, rcnb); 165 | value_out += 64; 166 | 167 | rcnb.val[0] = vmovl_u16(vget_high_u16(r2)); 168 | rcnb.val[1] = vmovl_u16(vget_high_u16(c2)); 169 | rcnb.val[2] = vmovl_u16(vget_high_u16(n2)); 170 | rcnb.val[3] = vmovl_u16(vget_high_u16(b2)); 171 | vst4q_u32((unsigned int *) value_out, rcnb); 172 | value_out += 64; 173 | } 174 | } 175 | } 176 | 177 | int rcnb_decode_32n_asm(const char *value_in, char *value_out, size_t n) { 178 | uint16x8x4_t rcnb1, rcnb2; 179 | for (size_t i = 0; i < n; ++i) { 180 | if (sizeof(wchar_t) == 2) { 181 | rcnb1 = vld4q_u16((const unsigned short *) value_in); 182 | rcnb2 = vld4q_u16((const unsigned short *) (value_in + 64)); 183 | value_in += 128; 184 | } else if (sizeof(wchar_t) == 4) { 
185 | uint32x4x4_t tmp1, tmp2; 186 | tmp1 = vld4q_u32((const unsigned int *) value_in); 187 | tmp2 = vld4q_u32((const unsigned int *) (value_in + 64)); 188 | rcnb1.val[0] = vcombine_u16(vmovn_u32(tmp1.val[0]), vmovn_u32(tmp2.val[0])); 189 | rcnb1.val[1] = vcombine_u16(vmovn_u32(tmp1.val[1]), vmovn_u32(tmp2.val[1])); 190 | rcnb1.val[2] = vcombine_u16(vmovn_u32(tmp1.val[2]), vmovn_u32(tmp2.val[2])); 191 | rcnb1.val[3] = vcombine_u16(vmovn_u32(tmp1.val[3]), vmovn_u32(tmp2.val[3])); 192 | value_in += 128; 193 | 194 | tmp1 = vld4q_u32((const unsigned int *) value_in); 195 | tmp2 = vld4q_u32((const unsigned int *) (value_in + 64)); 196 | rcnb2.val[0] = vcombine_u16(vmovn_u32(tmp1.val[0]), vmovn_u32(tmp2.val[0])); 197 | rcnb2.val[1] = vcombine_u16(vmovn_u32(tmp1.val[1]), vmovn_u32(tmp2.val[1])); 198 | rcnb2.val[2] = vcombine_u16(vmovn_u32(tmp1.val[2]), vmovn_u32(tmp2.val[2])); 199 | rcnb2.val[3] = vcombine_u16(vmovn_u32(tmp1.val[3]), vmovn_u32(tmp2.val[3])); 200 | value_in += 128; 201 | } 202 | 203 | uint16x8_t sign_idx1 = vshrq_n_u16(vmulq_n_u16(rcnb1.val[0], 2117), 13); 204 | uint16x8_t sign_idx2 = vshrq_n_u16(vmulq_n_u16(rcnb2.val[0], 2117), 13); 205 | uint16x8_t sign1 = (uint16x8_t)vqtbl1q_u8(vld1q_u8(s_tbl), (uint8x16_t)sign_idx1); 206 | uint16x8_t sign2 = (uint16x8_t)vqtbl1q_u8(vld1q_u8(s_tbl), (uint8x16_t)sign_idx2); 207 | sign1 = vsliq_n_u16(sign1, sign1, 8); 208 | sign2 = vsliq_n_u16(sign2, sign2, 8); 209 | 210 | uint16x8_t r_c1 = vbslq_u16(sign1, rcnb1.val[2], rcnb1.val[0]); 211 | uint16x8_t c_c1 = vbslq_u16(sign1, rcnb1.val[3], rcnb1.val[1]); 212 | uint16x8_t n_c1 = vbslq_u16(sign1, rcnb1.val[0], rcnb1.val[2]); 213 | uint16x8_t b_c1 = vbslq_u16(sign1, rcnb1.val[1], rcnb1.val[3]); 214 | uint16x8_t r_c2 = vbslq_u16(sign2, rcnb2.val[2], rcnb2.val[0]); 215 | uint16x8_t c_c2 = vbslq_u16(sign2, rcnb2.val[3], rcnb2.val[1]); 216 | uint16x8_t n_c2 = vbslq_u16(sign2, rcnb2.val[0], rcnb2.val[2]); 217 | uint16x8_t b_c2 = vbslq_u16(sign2, rcnb2.val[1], rcnb2.val[3]); 218 | 
219 | sign1 = vshlq_n_u16(sign1, 15); 220 | sign2 = vshlq_n_u16(sign2, 15); 221 | 222 | r_c1 = vshrq_n_u16(vmulq_n_u16(r_c1, 4675), 12); 223 | c_c1 = vshrq_n_u16(vmulq_n_u16(c_c1, 11482), 12); 224 | n_c1 = vshrq_n_u16(vmulq_n_u16(n_c1, 9726), 12); 225 | b_c1 = vsraq_n_u16(vsraq_n_u16(b_c1, b_c1, 1), b_c1, 3); 226 | 227 | r_c2 = vshrq_n_u16(vmulq_n_u16(r_c2, 4675), 12); 228 | c_c2 = vshrq_n_u16(vmulq_n_u16(c_c2, 11482), 12); 229 | n_c2 = vshrq_n_u16(vmulq_n_u16(n_c2, 9726), 12); 230 | b_c2 = vsraq_n_u16(vsraq_n_u16(b_c2, b_c2, 1), b_c2, 3); 231 | 232 | uint8x16_t r_v = vqtbl1q_u8(vld1q_u8(r_tbl), vcombine_u8(vmovn_u16(r_c1), vmovn_u16(r_c2))); 233 | uint8x16_t c_v = vqtbl1q_u8(vld1q_u8(c_tbl), vcombine_u8(vmovn_u16(c_c1), vmovn_u16(c_c2))); 234 | uint8x16_t n_v = vqtbl1q_u8(vld1q_u8(n_tbl), vcombine_u8(vmovn_u16(n_c1), vmovn_u16(n_c2))); 235 | uint8x16_t b_v = vqtbl1q_u8(vld1q_u8(b_tbl), vbicq_u8(vcombine_u8(vmovn_u16(b_c1), vmovn_u16(b_c2)), vdupq_n_u8(0xf0))); 236 | 237 | uint8x16_t bad_cv = vdupq_n_u8(0xff); 238 | uint8x16_t bad_v = vorrq_u8(vorrq_u8(vceqq_u8(r_v, bad_cv), vceqq_u8(c_v, bad_cv)), 239 | vorrq_u8(vceqq_u8(n_v, bad_cv), vceqq_u8(b_v, bad_cv))); 240 | 241 | uint16x8_t rn1 = vmovl_u8(vget_low_u8(n_v)); 242 | uint16x8_t rn2 = vmovl_u8(vget_high_u8(n_v)); 243 | rn1 = vmlal_u8(rn1, vget_low_u8(r_v), vdup_n_u8(225)); 244 | rn2 = vmlal_u8(rn2, vget_high_u8(r_v), vdup_n_u8(225)); 245 | 246 | uint16x8_t cb1 = vmovl_u8(vget_low_u8(b_v)); 247 | uint16x8_t cb2 = vmovl_u8(vget_high_u8(b_v)); 248 | cb1 = vmlal_u8(cb1, vget_low_u8(c_v), vdup_n_u8(150)); 249 | cb2 = vmlal_u8(cb2, vget_high_u8(c_v), vdup_n_u8(150)); 250 | 251 | if (vmaxvq_u8(bad_v)) { 252 | return 0; 253 | } 254 | 255 | uint16x8x2_t result; 256 | result.val[0] = vmlaq_n_u16(cb1, rn1, 10); 257 | result.val[1] = vmlaq_n_u16(cb2, rn2, 10); 258 | result.val[0] = vorrq_u16(result.val[0], sign1); 259 | result.val[1] = vorrq_u16(result.val[1], sign2); 260 | result.val[0] = 
(uint16x8_t)vrev16q_u8((uint8x16_t)result.val[0]); 261 | result.val[1] = (uint16x8_t)vrev16q_u8((uint8x16_t)result.val[1]); 262 | 263 | vst1q_u16_x2((unsigned short *) value_out, result); 264 | value_out += 32; 265 | } 266 | 267 | return 1; 268 | } 269 | 270 | #endif 271 | -------------------------------------------------------------------------------- /src/rcnb_x86.c: -------------------------------------------------------------------------------- 1 | /* 2 | cencode_x86.c - x86 intrinsic source to an rcnb encoding algorithm 3 | 4 | This is part of the librcnb project, and has been placed in the public domain. 5 | For details, see https://github.com/rikakomoe/librcnb 6 | */ 7 | 8 | #if defined(ENABLE_AVX2) || defined(ENABLE_SSSE3) 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | typedef struct concat_tbl { 15 | unsigned char first[16]; 16 | unsigned char second[16]; 17 | } concat_tbl; 18 | 19 | static const unsigned char swizzle[16] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; 20 | 21 | static const concat_tbl rc_lo = { 22 | {114, 82, 84, 85, 86, 87, 88, 89, 166, 16, 17, 18, 19, 76, 77}, 23 | {99, 67, 6, 7, 8, 9, 10, 11, 12, 13, 135, 136, 199, 59, 60} 24 | }; 25 | static const concat_tbl rc_hi = { 26 | {0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2}, 27 | {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2} 28 | }; 29 | static const concat_tbl nb_lo = { 30 | {110, 78, 67, 68, 69, 70, 71, 72, 157, 158, 209, 248, 249, 32, 53}, 31 | {98, 66, 128, 129, 131, 132, 133, 223, 222, 254} 32 | }; 33 | static const concat_tbl nb_hi = { 34 | {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 2}, 35 | {0, 0, 1, 1, 1, 1, 1, 0, 0, 0} 36 | }; 37 | 38 | static const concat_tbl rc_tbl = { 39 | {14, 8, 0, 255, 2, 3, 4, 5, 6, 7, 9, 10, 11, 1, 12, 13}, 40 | {13, 3, 9, 14, 4, 0, 5, 255, 10, 6, 11, 1, 7, 12, 2, 8} 41 | }; 42 | 43 | static const concat_tbl nb_tbl = { 44 | {10, 3, 255, 4, 8, 0, 5, 9, 6, 1, 7, 13, 11, 14, 2, 12}, 45 | {2, 3, 255, 255, 4, 255, 5, 6, 8, 7, 255, 1, 9, 255, 
255, 0} 46 | }; 47 | 48 | static const unsigned char s_tbl[16] = {0, 0, 255, 255, 255, 0, 255, 0}; 49 | static const concat_tbl mul_c = { 50 | {225, 1, 225, 1, 225, 1, 225, 1, 225, 1, 225, 1, 225, 1, 225, 1}, 51 | {150, 1, 150, 1, 150, 1, 150, 1, 150, 1, 150, 1, 150, 1, 150, 1} 52 | }; 53 | 54 | #ifdef __clang__ 55 | // Clang really don't like vpermd and attempts to replace it with 5+ ops. 56 | // Mark this as potentially non-const to force Clang using vpermd. 57 | static unsigned int permuted[8] = {0, 4, 1, 5, 2, 6, 3, 7}; 58 | static unsigned char shuffler[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15}; 59 | void unused_force_clang_use_vpermd() { permuted[0] = 0; } 60 | void unused_force_clang_use_vpshufb() { shuffler[0] = 0; } 61 | #else 62 | static const unsigned int permuted[8] = {0, 4, 1, 5, 2, 6, 3, 7}; 63 | static const unsigned char shuffler[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15}; 64 | #endif 65 | #endif 66 | 67 | #ifdef ENABLE_SSSE3 68 | 69 | #define mm_blendv_epi8(a, b, mask) _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)) 70 | 71 | void rcnb_encode_32n_asm(const char *value_in, char *value_out, size_t n) { 72 | for (size_t i = 0; i < n; ++i) { 73 | __m128i input1 = _mm_loadu_si128((__m128i *) value_in); 74 | input1 = _mm_shuffle_epi8(input1, *(__m128i *) &swizzle); 75 | // 0xffff for neg, 0x0000 for pos 76 | __m128i sign1 = _mm_srai_epi16(input1, 15); 77 | input1 = _mm_and_si128(input1, _mm_set1_epi16(0x7fff)); 78 | 79 | __m128i input2 = _mm_loadu_si128((__m128i *) (value_in + 16)); 80 | input2 = _mm_shuffle_epi8(input2, *(__m128i *) &swizzle); 81 | __m128i sign2 = _mm_srai_epi16(input2, 15); 82 | input2 = _mm_and_si128(input2, _mm_set1_epi16(0x7fff)); 83 | 84 | value_in += 32; 85 | 86 | __m128i idx_r1, idx_c1, idx_n1, idx_b1; 87 | __m128i idx_r2, idx_c2, idx_n2, idx_b2; 88 | { 89 | // i / 2250 = (i * 59653) >> (16 + 11) 90 | idx_r1 = _mm_srli_epi16(_mm_mulhi_epu16(input1, _mm_set1_epi16(-5883)), 11); 
91 | 92 | __m128i r_mul_2250 = _mm_mullo_epi16(idx_r1, _mm_set1_epi16(2250)); 93 | // i % 2250 94 | __m128i i_mod_2250 = _mm_sub_epi16(input1, r_mul_2250); 95 | // i / 150 = (i * 55925) >> (16 + 7) 96 | idx_c1 = _mm_srli_epi16(_mm_mulhi_epu16(i_mod_2250, _mm_set1_epi16(-9611)), 7); 97 | 98 | __m128i c_mul_150 = _mm_add_epi16(r_mul_2250, _mm_mullo_epi16(idx_c1, _mm_set1_epi16(150))); 99 | // i % 150 100 | __m128i i_mod_150 = _mm_sub_epi16(input1, c_mul_150); 101 | // i / 10 = (i * 52429) >> (16 + 3); 102 | idx_n1 = _mm_srli_epi16(_mm_mulhi_epu16(i_mod_150, _mm_set1_epi16(-13107)), 3); 103 | 104 | __m128i n_mul_10 = _mm_add_epi16(c_mul_150, _mm_mullo_epi16(idx_n1, _mm_set1_epi16(10))); 105 | // i % 10 106 | idx_b1 = _mm_sub_epi16(input1, n_mul_10); 107 | } 108 | 109 | { 110 | idx_r2 = _mm_srli_epi16(_mm_mulhi_epu16(input2, _mm_set1_epi16(-5883)), 11); 111 | __m128i r_mul_2250 = _mm_mullo_epi16(idx_r2, _mm_set1_epi16(2250)); 112 | __m128i i_mod_2250 = _mm_sub_epi16(input2, r_mul_2250); 113 | idx_c2 = _mm_srli_epi16(_mm_mulhi_epu16(i_mod_2250, _mm_set1_epi16(-9611)), 7); 114 | __m128i c_mul_150 = _mm_add_epi16(r_mul_2250, _mm_mullo_epi16(idx_c2, _mm_set1_epi16(150))); 115 | __m128i i_mod_150 = _mm_sub_epi16(input2, c_mul_150); 116 | idx_n2 = _mm_srli_epi16(_mm_mulhi_epu16(i_mod_150, _mm_set1_epi16(-13107)), 3); 117 | __m128i n_mul_10 = _mm_add_epi16(c_mul_150, _mm_mullo_epi16(idx_n2, _mm_set1_epi16(10))); 118 | idx_b2 = _mm_sub_epi16(input2, n_mul_10); 119 | } 120 | 121 | __m128i idx_r = _mm_packus_epi16(idx_r1, idx_r2); 122 | __m128i idx_c = _mm_packus_epi16(idx_c1, idx_c2); 123 | __m128i idx_n = _mm_packus_epi16(idx_n1, idx_n2); 124 | __m128i idx_b = _mm_packus_epi16(idx_b1, idx_b2); 125 | 126 | __m128i r_l = _mm_shuffle_epi8(*(__m128i *) &rc_lo.first, idx_r); 127 | __m128i c_l = _mm_shuffle_epi8(*(__m128i *) &rc_lo.second, idx_c); 128 | __m128i n_l = _mm_shuffle_epi8(*(__m128i *) &nb_lo.first, idx_n); 129 | __m128i b_l = _mm_shuffle_epi8(*(__m128i *) &nb_lo.second, 
idx_b); 130 | 131 | __m128i r_h = _mm_shuffle_epi8(*(__m128i *) &rc_hi.first, idx_r); 132 | __m128i c_h = _mm_shuffle_epi8(*(__m128i *) &rc_hi.second, idx_c); 133 | __m128i n_h = _mm_shuffle_epi8(*(__m128i *) &nb_hi.first, idx_n); 134 | __m128i b_h = _mm_shuffle_epi8(*(__m128i *) &nb_hi.second, idx_b); 135 | 136 | __m128i r1 = _mm_unpacklo_epi8(r_l, r_h); 137 | __m128i r2 = _mm_unpackhi_epi8(r_l, r_h); 138 | __m128i c1 = _mm_unpacklo_epi8(c_l, c_h); 139 | __m128i c2 = _mm_unpackhi_epi8(c_l, c_h); 140 | __m128i n1 = _mm_unpacklo_epi8(n_l, n_h); 141 | __m128i n2 = _mm_unpackhi_epi8(n_l, n_h); 142 | __m128i b1 = _mm_unpacklo_epi8(b_l, b_h); 143 | __m128i b2 = _mm_unpackhi_epi8(b_l, b_h); 144 | 145 | __m128i rc1_t = _mm_unpacklo_epi16(r1, c1); 146 | __m128i rc2_t = _mm_unpackhi_epi16(r1, c1); 147 | __m128i rc3_t = _mm_unpacklo_epi16(r2, c2); 148 | __m128i rc4_t = _mm_unpackhi_epi16(r2, c2); 149 | __m128i nb1_t = _mm_unpacklo_epi16(n1, b1); 150 | __m128i nb2_t = _mm_unpackhi_epi16(n1, b1); 151 | __m128i nb3_t = _mm_unpacklo_epi16(n2, b2); 152 | __m128i nb4_t = _mm_unpackhi_epi16(n2, b2); 153 | 154 | __m128i mask1 = _mm_unpacklo_epi16(sign1, sign1); 155 | __m128i mask2 = _mm_unpackhi_epi16(sign1, sign1); 156 | __m128i mask3 = _mm_unpacklo_epi16(sign2, sign2); 157 | __m128i mask4 = _mm_unpackhi_epi16(sign2, sign2); 158 | 159 | __m128i rc1 = mm_blendv_epi8(rc1_t, nb1_t, mask1); 160 | __m128i rc2 = mm_blendv_epi8(rc2_t, nb2_t, mask2); 161 | __m128i rc3 = mm_blendv_epi8(rc3_t, nb3_t, mask3); 162 | __m128i rc4 = mm_blendv_epi8(rc4_t, nb4_t, mask4); 163 | __m128i nb1 = mm_blendv_epi8(nb1_t, rc1_t, mask1); 164 | __m128i nb2 = mm_blendv_epi8(nb2_t, rc2_t, mask2); 165 | __m128i nb3 = mm_blendv_epi8(nb3_t, rc3_t, mask3); 166 | __m128i nb4 = mm_blendv_epi8(nb4_t, rc4_t, mask4); 167 | 168 | __m128i rcnb1 = _mm_unpacklo_epi32(rc1, nb1); 169 | __m128i rcnb2 = _mm_unpackhi_epi32(rc1, nb1); 170 | __m128i rcnb3 = _mm_unpacklo_epi32(rc2, nb2); 171 | __m128i rcnb4 = _mm_unpackhi_epi32(rc2, 
nb2); 172 | __m128i rcnb5 = _mm_unpacklo_epi32(rc3, nb3); 173 | __m128i rcnb6 = _mm_unpackhi_epi32(rc3, nb3); 174 | __m128i rcnb7 = _mm_unpacklo_epi32(rc4, nb4); 175 | __m128i rcnb8 = _mm_unpackhi_epi32(rc4, nb4); 176 | 177 | if (sizeof(wchar_t) == 2) { 178 | _mm_storeu_si128((__m128i *) (value_out), rcnb1); 179 | value_out += 16; 180 | _mm_storeu_si128((__m128i *) (value_out), rcnb2); 181 | value_out += 16; 182 | _mm_storeu_si128((__m128i *) (value_out), rcnb3); 183 | value_out += 16; 184 | _mm_storeu_si128((__m128i *) (value_out), rcnb4); 185 | value_out += 16; 186 | _mm_storeu_si128((__m128i *) (value_out), rcnb5); 187 | value_out += 16; 188 | _mm_storeu_si128((__m128i *) (value_out), rcnb6); 189 | value_out += 16; 190 | _mm_storeu_si128((__m128i *) (value_out), rcnb7); 191 | value_out += 16; 192 | _mm_storeu_si128((__m128i *) (value_out), rcnb8); 193 | value_out += 16; 194 | } else if (sizeof(wchar_t) == 4) { 195 | _mm_storeu_si128((__m128i *) (value_out), _mm_unpacklo_epi16(rcnb1, _mm_setzero_si128())); 196 | value_out += 16; 197 | _mm_storeu_si128((__m128i *) (value_out), _mm_unpackhi_epi16(rcnb1, _mm_setzero_si128())); 198 | value_out += 16; 199 | _mm_storeu_si128((__m128i *) (value_out), _mm_unpacklo_epi16(rcnb2, _mm_setzero_si128())); 200 | value_out += 16; 201 | _mm_storeu_si128((__m128i *) (value_out), _mm_unpackhi_epi16(rcnb2, _mm_setzero_si128())); 202 | value_out += 16; 203 | _mm_storeu_si128((__m128i *) (value_out), _mm_unpacklo_epi16(rcnb3, _mm_setzero_si128())); 204 | value_out += 16; 205 | _mm_storeu_si128((__m128i *) (value_out), _mm_unpackhi_epi16(rcnb3, _mm_setzero_si128())); 206 | value_out += 16; 207 | _mm_storeu_si128((__m128i *) (value_out), _mm_unpacklo_epi16(rcnb4, _mm_setzero_si128())); 208 | value_out += 16; 209 | _mm_storeu_si128((__m128i *) (value_out), _mm_unpackhi_epi16(rcnb4, _mm_setzero_si128())); 210 | value_out += 16; 211 | _mm_storeu_si128((__m128i *) (value_out), _mm_unpacklo_epi16(rcnb5, _mm_setzero_si128())); 212 | value_out 
+= 16; 213 | _mm_storeu_si128((__m128i *) (value_out), _mm_unpackhi_epi16(rcnb5, _mm_setzero_si128())); 214 | value_out += 16; 215 | _mm_storeu_si128((__m128i *) (value_out), _mm_unpacklo_epi16(rcnb6, _mm_setzero_si128())); 216 | value_out += 16; 217 | _mm_storeu_si128((__m128i *) (value_out), _mm_unpackhi_epi16(rcnb6, _mm_setzero_si128())); 218 | value_out += 16; 219 | _mm_storeu_si128((__m128i *) (value_out), _mm_unpacklo_epi16(rcnb7, _mm_setzero_si128())); 220 | value_out += 16; 221 | _mm_storeu_si128((__m128i *) (value_out), _mm_unpackhi_epi16(rcnb7, _mm_setzero_si128())); 222 | value_out += 16; 223 | _mm_storeu_si128((__m128i *) (value_out), _mm_unpacklo_epi16(rcnb8, _mm_setzero_si128())); 224 | value_out += 16; 225 | _mm_storeu_si128((__m128i *) (value_out), _mm_unpackhi_epi16(rcnb8, _mm_setzero_si128())); 226 | value_out += 16; 227 | } 228 | } 229 | } 230 | 231 | int rcnb_decode_32n_asm(const char *value_in, char *value_out, size_t n) { 232 | __m128i rcnb1, rcnb2, rcnb3, rcnb4, rcnb5, rcnb6, rcnb7, rcnb8; 233 | 234 | __m128i s_t = *(__m128i *) &s_tbl; 235 | 236 | __m128i r_t = *(__m128i *) rc_tbl.first; 237 | __m128i c_t = *(__m128i *) rc_tbl.second; 238 | __m128i n_t = *(__m128i *) nb_tbl.first; 239 | __m128i b_t = *(__m128i *) nb_tbl.second; 240 | 241 | __m128i mul_rc = *(__m128i *) mul_c.first; 242 | __m128i mul_nb = *(__m128i *) mul_c.second; 243 | 244 | __m128i r_swizzle = *(__m128i *) &swizzle; 245 | 246 | for (size_t i = 0; i < n; ++i) { 247 | if (sizeof(wchar_t) == 2) { 248 | rcnb1 = _mm_loadu_si128((__m128i *) value_in); 249 | rcnb2 = _mm_loadu_si128((__m128i *) (value_in + 16)); 250 | rcnb3 = _mm_loadu_si128((__m128i *) (value_in + 32)); 251 | rcnb4 = _mm_loadu_si128((__m128i *) (value_in + 48)); 252 | rcnb5 = _mm_loadu_si128((__m128i *) (value_in + 64)); 253 | rcnb6 = _mm_loadu_si128((__m128i *) (value_in + 80)); 254 | rcnb7 = _mm_loadu_si128((__m128i *) (value_in + 96)); 255 | rcnb8 = _mm_loadu_si128((__m128i *) (value_in + 112)); 256 | value_in 
+= 128; 257 | } else if (sizeof(wchar_t) == 4) { 258 | rcnb1 = _mm_packs_epi32(_mm_loadu_si128((__m128i *) value_in), 259 | _mm_loadu_si128((__m128i *) (value_in + 16))); 260 | value_in += 32; 261 | rcnb2 = _mm_packs_epi32(_mm_loadu_si128((__m128i *) value_in), 262 | _mm_loadu_si128((__m128i *) (value_in + 16))); 263 | value_in += 32; 264 | rcnb3 = _mm_packs_epi32(_mm_loadu_si128((__m128i *) value_in), 265 | _mm_loadu_si128((__m128i *) (value_in + 16))); 266 | value_in += 32; 267 | rcnb4 = _mm_packs_epi32(_mm_loadu_si128((__m128i *) value_in), 268 | _mm_loadu_si128((__m128i *) (value_in + 16))); 269 | value_in += 32; 270 | rcnb5 = _mm_packs_epi32(_mm_loadu_si128((__m128i *) value_in), 271 | _mm_loadu_si128((__m128i *) (value_in + 16))); 272 | value_in += 32; 273 | rcnb6 = _mm_packs_epi32(_mm_loadu_si128((__m128i *) value_in), 274 | _mm_loadu_si128((__m128i *) (value_in + 16))); 275 | value_in += 32; 276 | rcnb7 = _mm_packs_epi32(_mm_loadu_si128((__m128i *) value_in), 277 | _mm_loadu_si128((__m128i *) (value_in + 16))); 278 | value_in += 32; 279 | rcnb8 = _mm_packs_epi32(_mm_loadu_si128((__m128i *) value_in), 280 | _mm_loadu_si128((__m128i *) (value_in + 16))); 281 | value_in += 32; 282 | } 283 | 284 | __m128i r_c1t, r_c2t, c_c1t, c_c2t, n_c1t, n_c2t, b_c1t, b_c2t; 285 | 286 | { 287 | __m128i rcnb_04 = _mm_unpacklo_epi16(rcnb1, rcnb3); 288 | __m128i rcnb_15 = _mm_unpackhi_epi16(rcnb1, rcnb3); 289 | __m128i rcnb_26 = _mm_unpacklo_epi16(rcnb2, rcnb4); 290 | __m128i rcnb_37 = _mm_unpackhi_epi16(rcnb2, rcnb4); 291 | 292 | __m128i rcnb_0246_1 = _mm_unpacklo_epi16(rcnb_04, rcnb_26); 293 | __m128i rcnb_0246_2 = _mm_unpackhi_epi16(rcnb_04, rcnb_26); 294 | __m128i rcnb_1357_1 = _mm_unpacklo_epi16(rcnb_15, rcnb_37); 295 | __m128i rcnb_1357_2 = _mm_unpackhi_epi16(rcnb_15, rcnb_37); 296 | 297 | r_c1t = _mm_unpacklo_epi16(rcnb_0246_1, rcnb_1357_1); 298 | c_c1t = _mm_unpackhi_epi16(rcnb_0246_1, rcnb_1357_1); 299 | n_c1t = _mm_unpacklo_epi16(rcnb_0246_2, rcnb_1357_2); 300 | b_c1t 
= _mm_unpackhi_epi16(rcnb_0246_2, rcnb_1357_2); 301 | } 302 | 303 | { 304 | __m128i rcnb_04 = _mm_unpacklo_epi16(rcnb5, rcnb7); 305 | __m128i rcnb_15 = _mm_unpackhi_epi16(rcnb5, rcnb7); 306 | __m128i rcnb_26 = _mm_unpacklo_epi16(rcnb6, rcnb8); 307 | __m128i rcnb_37 = _mm_unpackhi_epi16(rcnb6, rcnb8); 308 | 309 | __m128i rcnb_0246_1 = _mm_unpacklo_epi16(rcnb_04, rcnb_26); 310 | __m128i rcnb_0246_2 = _mm_unpackhi_epi16(rcnb_04, rcnb_26); 311 | __m128i rcnb_1357_1 = _mm_unpacklo_epi16(rcnb_15, rcnb_37); 312 | __m128i rcnb_1357_2 = _mm_unpackhi_epi16(rcnb_15, rcnb_37); 313 | 314 | r_c2t = _mm_unpacklo_epi16(rcnb_0246_1, rcnb_1357_1); 315 | c_c2t = _mm_unpackhi_epi16(rcnb_0246_1, rcnb_1357_1); 316 | n_c2t = _mm_unpacklo_epi16(rcnb_0246_2, rcnb_1357_2); 317 | b_c2t = _mm_unpackhi_epi16(rcnb_0246_2, rcnb_1357_2); 318 | } 319 | 320 | __m128i sign_idx1 = _mm_srli_epi16(_mm_mullo_epi16(r_c1t, _mm_set1_epi16(2117)), 13); 321 | __m128i sign_idx2 = _mm_srli_epi16(_mm_mullo_epi16(r_c2t, _mm_set1_epi16(2117)), 13); 322 | __m128i sign1 = _mm_shuffle_epi8(s_t, sign_idx1); 323 | __m128i sign2 = _mm_shuffle_epi8(s_t, sign_idx2); 324 | sign1 = _mm_or_si128(sign1, _mm_slli_epi16(sign1, 8)); 325 | sign2 = _mm_or_si128(sign2, _mm_slli_epi16(sign2, 8)); 326 | 327 | __m128i r_c1 = mm_blendv_epi8(r_c1t, n_c1t, sign1); 328 | __m128i c_c1 = mm_blendv_epi8(c_c1t, b_c1t, sign1); 329 | __m128i n_c1 = mm_blendv_epi8(n_c1t, r_c1t, sign1); 330 | __m128i b_c1 = mm_blendv_epi8(b_c1t, c_c1t, sign1); 331 | __m128i r_c2 = mm_blendv_epi8(r_c2t, n_c2t, sign2); 332 | __m128i c_c2 = mm_blendv_epi8(c_c2t, b_c2t, sign2); 333 | __m128i n_c2 = mm_blendv_epi8(n_c2t, r_c2t, sign2); 334 | __m128i b_c2 = mm_blendv_epi8(b_c2t, c_c2t, sign2); 335 | 336 | sign1 = _mm_slli_epi16(sign1, 15); 337 | sign2 = _mm_slli_epi16(sign2, 15); 338 | 339 | __m128i r_i116 = _mm_srli_epi16(_mm_mullo_epi16(r_c1, _mm_set1_epi16(4675)), 12); 340 | __m128i c_i116 = _mm_srli_epi16(_mm_mullo_epi16(c_c1, _mm_set1_epi16(11482)), 12); 341 | 
__m128i n_i116 = _mm_srli_epi16(_mm_mullo_epi16(n_c1, _mm_set1_epi16(9726)), 12); 342 | __m128i b_i116 = _mm_and_si128(_mm_add_epi16(b_c1, 343 | _mm_add_epi16( 344 | _mm_srli_epi16(b_c1, 1), 345 | _mm_srli_epi16(b_c1, 3))), 346 | _mm_set1_epi16(15)); 347 | 348 | __m128i r_i216 = _mm_srli_epi16(_mm_mullo_epi16(r_c2, _mm_set1_epi16(4675)), 12); 349 | __m128i c_i216 = _mm_srli_epi16(_mm_mullo_epi16(c_c2, _mm_set1_epi16(11482)), 12); 350 | __m128i n_i216 = _mm_srli_epi16(_mm_mullo_epi16(n_c2, _mm_set1_epi16(9726)), 12); 351 | __m128i b_i216 = _mm_and_si128(_mm_add_epi16(b_c2, 352 | _mm_add_epi16( 353 | _mm_srli_epi16(b_c2, 1), 354 | _mm_srli_epi16(b_c2, 3))), 355 | _mm_set1_epi16(15)); 356 | 357 | __m128i r_i = _mm_packus_epi16(r_i116, r_i216); 358 | __m128i c_i = _mm_packus_epi16(c_i116, c_i216); 359 | __m128i n_i = _mm_packus_epi16(n_i116, n_i216); 360 | __m128i b_i = _mm_packus_epi16(b_i116, b_i216); 361 | 362 | __m128i r_v = _mm_shuffle_epi8(r_t, r_i); 363 | __m128i c_v = _mm_shuffle_epi8(c_t, c_i); 364 | __m128i n_v = _mm_shuffle_epi8(n_t, n_i); 365 | __m128i b_v = _mm_shuffle_epi8(b_t, b_i); 366 | 367 | __m128i bad_v = _mm_or_si128( 368 | _mm_or_si128( 369 | _mm_cmpeq_epi8(r_v, _mm_set1_epi8(-1)), 370 | _mm_cmpeq_epi8(c_v, _mm_set1_epi8(-1)) 371 | ), 372 | _mm_or_si128( 373 | _mm_cmpeq_epi8(n_v, _mm_set1_epi8(-1)), 374 | _mm_cmpeq_epi8(b_v, _mm_set1_epi8(-1)) 375 | ) 376 | ); 377 | 378 | __m128i rn_1 = _mm_unpacklo_epi8(r_v, n_v); 379 | __m128i rn_2 = _mm_unpackhi_epi8(r_v, n_v); 380 | __m128i cb_1 = _mm_unpacklo_epi8(c_v, b_v); 381 | __m128i cb_2 = _mm_unpackhi_epi8(c_v, b_v); 382 | rn_1 = _mm_maddubs_epi16(mul_rc, rn_1); 383 | rn_2 = _mm_maddubs_epi16(mul_rc, rn_2); 384 | cb_1 = _mm_maddubs_epi16(mul_nb, cb_1); 385 | cb_2 = _mm_maddubs_epi16(mul_nb, cb_2); 386 | 387 | if (_mm_movemask_epi8(bad_v)) { 388 | return 0; 389 | } 390 | 391 | __m128i result1 = _mm_add_epi16(_mm_add_epi16(_mm_slli_epi16(rn_1, 3), _mm_slli_epi16(rn_1, 1)), cb_1); 392 | __m128i result2 = 
_mm_add_epi16(_mm_add_epi16(_mm_slli_epi16(rn_2, 3), _mm_slli_epi16(rn_2, 1)), cb_2); 393 | 394 | result1 = _mm_or_si128(result1, sign1); 395 | result1 = _mm_shuffle_epi8(result1, r_swizzle); 396 | result2 = _mm_or_si128(result2, sign2); 397 | result2 = _mm_shuffle_epi8(result2, r_swizzle); 398 | 399 | _mm_storeu_si128((__m128i *) value_out, result1); 400 | _mm_storeu_si128((__m128i *) (value_out + 16), result2); 401 | value_out += 32; 402 | } 403 | return 1; 404 | } 405 | 406 | #endif 407 | 408 | #ifdef ENABLE_AVX2 409 | 410 | void rcnb_encode_32n_asm(const char *value_in, char *value_out, size_t n) { 411 | __m256i r_swizzle = _mm256_broadcastsi128_si256(*(__m128i *) &swizzle); 412 | __m256i r_permute = *(__m256i *) &permuted; 413 | __m256i r_shuffler = _mm256_broadcastsi128_si256(*(__m128i *) &shuffler); 414 | for (size_t i = 0; i < n; ++i) { 415 | __m256i input = _mm256_loadu_si256((__m256i *) value_in); 416 | value_in += 32; 417 | input = _mm256_shuffle_epi8(input, r_swizzle); 418 | // 0xffff for neg, 0x0000 for pos 419 | __m256i sign = _mm256_srai_epi16(input, 15); 420 | input = _mm256_and_si256(input, _mm256_set1_epi16(0x7fff)); 421 | 422 | __m256i idx_r = _mm256_srli_epi16(_mm256_mulhi_epu16(input, _mm256_set1_epi16(-5883)), 11); 423 | __m256i r_mul_2250 = _mm256_mullo_epi16(idx_r, _mm256_set1_epi16(2250)); 424 | __m256i i_mod_2250 = _mm256_sub_epi16(input, r_mul_2250); 425 | __m256i idx_c = _mm256_srli_epi16(_mm256_mulhi_epu16(i_mod_2250, _mm256_set1_epi16(-9611)), 7); 426 | __m256i c_mul_150 = _mm256_add_epi16(r_mul_2250, _mm256_mullo_epi16(idx_c, _mm256_set1_epi16(150))); 427 | __m256i i_mod_150 = _mm256_sub_epi16(input, c_mul_150); 428 | __m256i idx_n = _mm256_srli_epi16(_mm256_mulhi_epu16(i_mod_150, _mm256_set1_epi16(-13107)), 3); 429 | __m256i n_mul_10 = _mm256_add_epi16(c_mul_150, _mm256_mullo_epi16(idx_n, _mm256_set1_epi16(10))); 430 | __m256i idx_b = _mm256_sub_epi16(input, n_mul_10); 431 | 432 | __m256i idx_rc = _mm256_packus_epi16(idx_r, idx_c); 
433 | __m256i idx_nb = _mm256_packus_epi16(idx_n, idx_b); 434 | idx_rc = _mm256_permute4x64_epi64(idx_rc, 0xd8); 435 | idx_nb = _mm256_permute4x64_epi64(idx_nb, 0xd8); 436 | 437 | __m256i rc_l = _mm256_shuffle_epi8(*(__m256i *) &rc_lo, idx_rc); 438 | __m256i rc_h = _mm256_shuffle_epi8(*(__m256i *) &rc_hi, idx_rc); 439 | __m256i nb_l = _mm256_shuffle_epi8(*(__m256i *) &nb_lo, idx_nb); 440 | __m256i nb_h = _mm256_shuffle_epi8(*(__m256i *) &nb_hi, idx_nb); 441 | 442 | __m256i r1c1_t = _mm256_unpacklo_epi8(rc_l, rc_h); 443 | __m256i r2c2_t = _mm256_unpackhi_epi8(rc_l, rc_h); 444 | __m256i n1b1_t = _mm256_unpacklo_epi8(nb_l, nb_h); 445 | __m256i n2b2_t = _mm256_unpackhi_epi8(nb_l, nb_h); 446 | 447 | __m256i sign1 = _mm256_permute4x64_epi64(sign, 0b01000100); 448 | __m256i sign2 = _mm256_permute4x64_epi64(sign, 0b11101110); 449 | 450 | __m256i r1c1 = _mm256_blendv_epi8(r1c1_t, n1b1_t, sign1); 451 | __m256i r2c2 = _mm256_blendv_epi8(r2c2_t, n2b2_t, sign2); 452 | __m256i n1b1 = _mm256_blendv_epi8(n1b1_t, r1c1_t, sign1); 453 | __m256i n2b2 = _mm256_blendv_epi8(n2b2_t, r2c2_t, sign2); 454 | 455 | __m256i rn1cb1 = _mm256_unpacklo_epi16(r1c1, n1b1); 456 | __m256i rn2cb2 = _mm256_unpackhi_epi16(r1c1, n1b1); 457 | __m256i rn3cb3 = _mm256_unpacklo_epi16(r2c2, n2b2); 458 | __m256i rn4cb4 = _mm256_unpackhi_epi16(r2c2, n2b2); 459 | 460 | __m256i rncb1 = _mm256_permutevar8x32_epi32(rn1cb1, r_permute); 461 | __m256i rncb2 = _mm256_permutevar8x32_epi32(rn2cb2, r_permute); 462 | __m256i rncb3 = _mm256_permutevar8x32_epi32(rn3cb3, r_permute); 463 | __m256i rncb4 = _mm256_permutevar8x32_epi32(rn4cb4, r_permute); 464 | 465 | __m256i rcnb1 = _mm256_shuffle_epi8(rncb1, r_shuffler); 466 | __m256i rcnb2 = _mm256_shuffle_epi8(rncb2, r_shuffler); 467 | __m256i rcnb3 = _mm256_shuffle_epi8(rncb3, r_shuffler); 468 | __m256i rcnb4 = _mm256_shuffle_epi8(rncb4, r_shuffler); 469 | 470 | if (sizeof(wchar_t) == 2) { 471 | _mm256_storeu_si256((__m256i *) (value_out), rcnb1); 472 | value_out += 32; 473 | 
_mm256_storeu_si256((__m256i *) (value_out), rcnb2); 474 | value_out += 32; 475 | _mm256_storeu_si256((__m256i *) (value_out), rcnb3); 476 | value_out += 32; 477 | _mm256_storeu_si256((__m256i *) (value_out), rcnb4); 478 | value_out += 32; 479 | } else if (sizeof(wchar_t) == 4) { 480 | _mm256_storeu_si256((__m256i *) (value_out), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(rcnb1, 0))); 481 | value_out += 32; 482 | _mm256_storeu_si256((__m256i *) (value_out), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(rcnb1, 1))); 483 | value_out += 32; 484 | _mm256_storeu_si256((__m256i *) (value_out), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(rcnb2, 0))); 485 | value_out += 32; 486 | _mm256_storeu_si256((__m256i *) (value_out), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(rcnb2, 1))); 487 | value_out += 32; 488 | _mm256_storeu_si256((__m256i *) (value_out), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(rcnb3, 0))); 489 | value_out += 32; 490 | _mm256_storeu_si256((__m256i *) (value_out), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(rcnb3, 1))); 491 | value_out += 32; 492 | _mm256_storeu_si256((__m256i *) (value_out), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(rcnb4, 0))); 493 | value_out += 32; 494 | _mm256_storeu_si256((__m256i *) (value_out), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(rcnb4, 1))); 495 | value_out += 32; 496 | } 497 | } 498 | } 499 | 500 | int rcnb_decode_32n_asm(const char *value_in, char *value_out, size_t n) { 501 | __m256i rcnb1, rcnb2, rcnb3, rcnb4; 502 | 503 | __m256i rc_t = *(__m256i *) &rc_tbl; 504 | __m256i nb_t = *(__m256i *) &nb_tbl; 505 | 506 | __m256i s_t = _mm256_broadcastsi128_si256(*(__m128i *) &s_tbl); 507 | __m256i mul = *(__m256i *) &mul_c; 508 | __m256i r_swizzle = _mm256_broadcastsi128_si256(*(__m128i *) &swizzle); 509 | 510 | for (size_t i = 0; i < n; ++i) { 511 | if (sizeof(wchar_t) == 2) { 512 | rcnb1 = _mm256_loadu_si256((__m256i*) value_in); 513 | rcnb2 = _mm256_loadu_si256((__m256i*) (value_in + 32)); 514 | 
rcnb3 = _mm256_loadu_si256((__m256i*) (value_in + 64)); 515 | rcnb4 = _mm256_loadu_si256((__m256i*) (value_in + 96)); 516 | value_in += 128; 517 | } else if (sizeof(wchar_t) == 4) { 518 | __m256i tmp1, tmp2; 519 | tmp1 = _mm256_loadu_si256((__m256i*) value_in); 520 | tmp2 = _mm256_loadu_si256((__m256i*) (value_in + 32)); 521 | value_in += 64; 522 | rcnb1 = _mm256_permute4x64_epi64(_mm256_packus_epi32(tmp1, tmp2), 0xd8); 523 | 524 | tmp1 = _mm256_loadu_si256((__m256i*) value_in); 525 | tmp2 = _mm256_loadu_si256((__m256i*) (value_in + 32)); 526 | value_in += 64; 527 | rcnb2 = _mm256_permute4x64_epi64(_mm256_packus_epi32(tmp1, tmp2), 0xd8); 528 | 529 | tmp1 = _mm256_loadu_si256((__m256i*) value_in); 530 | tmp2 = _mm256_loadu_si256((__m256i*) (value_in + 32)); 531 | value_in += 64; 532 | rcnb3 = _mm256_permute4x64_epi64(_mm256_packus_epi32(tmp1, tmp2), 0xd8); 533 | 534 | tmp1 = _mm256_loadu_si256((__m256i*) value_in); 535 | tmp2 = _mm256_loadu_si256((__m256i*) (value_in + 32)); 536 | value_in += 64; 537 | rcnb4 = _mm256_permute4x64_epi64(_mm256_packus_epi32(tmp1, tmp2), 0xd8); 538 | } 539 | 540 | __m256i rcnb_0_1_8_9 = _mm256_permute2x128_si256(rcnb1, rcnb3, 0x20); 541 | __m256i rcnb_2_3_a_b = _mm256_permute2x128_si256(rcnb1, rcnb3, 0x31); 542 | __m256i rcnb_4_5_c_d = _mm256_permute2x128_si256(rcnb2, rcnb4, 0x20); 543 | __m256i rcnb_6_7_e_f = _mm256_permute2x128_si256(rcnb2, rcnb4, 0x31); 544 | 545 | __m256i rcnb_02_8a = _mm256_unpacklo_epi16(rcnb_0_1_8_9, rcnb_2_3_a_b); 546 | __m256i rcnb_13_9b = _mm256_unpackhi_epi16(rcnb_0_1_8_9, rcnb_2_3_a_b); 547 | __m256i rcnb_46_ce = _mm256_unpacklo_epi16(rcnb_4_5_c_d, rcnb_6_7_e_f); 548 | __m256i rcnb_57_df = _mm256_unpackhi_epi16(rcnb_4_5_c_d, rcnb_6_7_e_f); 549 | 550 | __m256i rcnb_0123_89ab_rc = _mm256_unpacklo_epi16(rcnb_02_8a, rcnb_13_9b); 551 | __m256i rcnb_0123_89ab_nb = _mm256_unpackhi_epi16(rcnb_02_8a, rcnb_13_9b); 552 | __m256i rcnb_4567_cdef_rc = _mm256_unpacklo_epi16(rcnb_46_ce, rcnb_57_df); 553 | __m256i 
rcnb_4567_cdef_nb = _mm256_unpackhi_epi16(rcnb_46_ce, rcnb_57_df); 554 | 555 | __m256i r_ct = _mm256_unpacklo_epi64(rcnb_0123_89ab_rc, rcnb_4567_cdef_rc); 556 | __m256i c_ct = _mm256_unpackhi_epi64(rcnb_0123_89ab_rc, rcnb_4567_cdef_rc); 557 | __m256i n_ct = _mm256_unpacklo_epi64(rcnb_0123_89ab_nb, rcnb_4567_cdef_nb); 558 | __m256i b_ct = _mm256_unpackhi_epi64(rcnb_0123_89ab_nb, rcnb_4567_cdef_nb); 559 | 560 | __m256i sign_idx = _mm256_srli_epi16(_mm256_mullo_epi16(r_ct, _mm256_set1_epi16(2117)), 13); 561 | __m256i sign = _mm256_shuffle_epi8(s_t, sign_idx); 562 | sign = _mm256_or_si256(sign, _mm256_slli_epi16(sign, 8)); 563 | 564 | __m256i r_c = _mm256_blendv_epi8(r_ct, n_ct, sign); 565 | __m256i c_c = _mm256_blendv_epi8(c_ct, b_ct, sign); 566 | __m256i n_c = _mm256_blendv_epi8(n_ct, r_ct, sign); 567 | __m256i b_c = _mm256_blendv_epi8(b_ct, c_ct, sign); 568 | 569 | sign = _mm256_slli_epi16(sign, 15); 570 | 571 | __m256i r_i16 = _mm256_srli_epi16(_mm256_mullo_epi16(r_c, _mm256_set1_epi16(4675)), 12); 572 | __m256i c_i16 = _mm256_srli_epi16(_mm256_mullo_epi16(c_c, _mm256_set1_epi16(11482)), 12); 573 | __m256i n_i16 = _mm256_srli_epi16(_mm256_mullo_epi16(n_c, _mm256_set1_epi16(9726)), 12); 574 | __m256i b_i16 = _mm256_and_si256(_mm256_add_epi16(b_c, 575 | _mm256_add_epi16( 576 | _mm256_srli_epi16(b_c, 1), 577 | _mm256_srli_epi16(b_c, 3))), 578 | _mm256_set1_epi16(15)); 579 | 580 | __m256i rc_i = _mm256_permute4x64_epi64(_mm256_packus_epi16(r_i16, c_i16), 0xd8); 581 | __m256i nb_i = _mm256_permute4x64_epi64(_mm256_packus_epi16(n_i16, b_i16), 0xd8); 582 | 583 | __m256i rc_v = _mm256_shuffle_epi8(rc_t, rc_i); 584 | __m256i nb_v = _mm256_shuffle_epi8(nb_t, nb_i); 585 | 586 | __m256i bad_v = _mm256_or_si256( 587 | _mm256_cmpeq_epi8(rc_v, _mm256_set1_epi8(-1)), 588 | _mm256_cmpeq_epi8(nb_v, _mm256_set1_epi8(-1)) 589 | ); 590 | 591 | __m256i rn_cb_1 = _mm256_unpacklo_epi8(rc_v, nb_v); 592 | __m256i rn_cb_2 = _mm256_unpackhi_epi8(rc_v, nb_v); 593 | rn_cb_1 = 
_mm256_maddubs_epi16(mul, rn_cb_1); 594 | rn_cb_2 = _mm256_maddubs_epi16(mul, rn_cb_2); 595 | 596 | if (_mm256_movemask_epi8(bad_v)) { 597 | return 0; 598 | } 599 | 600 | __m256i rn = _mm256_permute2x128_si256(rn_cb_1, rn_cb_2, 0x20); 601 | __m256i cb = _mm256_permute2x128_si256(rn_cb_1, rn_cb_2, 0x31); 602 | __m256i result = _mm256_add_epi16(_mm256_add_epi16(_mm256_slli_epi16(rn, 3), _mm256_slli_epi16(rn, 1)), cb); 603 | result = _mm256_or_si256(result, sign); 604 | result = _mm256_shuffle_epi8(result, r_swizzle); 605 | 606 | _mm256_storeu_si256((__m256i*)value_out, result); 607 | value_out += 32; 608 | } 609 | return 1; 610 | } 611 | #endif 612 | --------------------------------------------------------------------------------