├── .appveyor.yml
├── .gitignore
├── CMakeLists.txt
├── INSTALL
├── LICENSE
├── README
├── examples
    ├── c-example1.c
    ├── c-example2.c
    ├── cpp-example3.cc
    ├── cpp-rcnb-cli.cc
    └── python3-example.py
├── include
    └── rcnb
    │   ├── cdecode.h
    │   ├── cencode.h
    │   ├── decode.h
    │   ├── encode.h
    │   └── rcnb.h
├── librcnb-config-version.cmake.in
├── librcnb-config.cmake.in
└── src
    ├── cdecode.c
    ├── cencode.c
    ├── rcnb.c
    ├── rcnb_arm64.c
    └── rcnb_x86.c


/.appveyor.yml:
--------------------------------------------------------------------------------
 1 | image:  # worker images the build runs on: two Ubuntu and three Visual Studio images
 2 |   - Ubuntu1604
 3 |   - Ubuntu1804
 4 |   - Visual Studio 2015
 5 |   - Visual Studio 2017
 6 |   - Visual Studio 2019
 7 | 
 8 | configuration:  # referenced by the scripts below as $CONFIGURATION / %CONFIGURATION%
 9 |   - Release
10 | 
11 | before_build:  # create an out-of-source build directory and configure the project in it
12 |   - |-
13 |     mkdir build
14 |     cd build
15 |     cmake --version
16 |   - sh: cmake ..
17 |   - cmd: cmake .. -D CMAKE_BUILD_TYPE=%CONFIGURATION%
18 | 
19 | build_script:  # 'sh:' entries presumably run on the Ubuntu images, 'cmd:' on the Windows ones
20 |   - sh: cmake --build . --config $CONFIGURATION
21 |   - cmd: cmake --build . --config %CONFIGURATION%
22 | 
23 | test_script:  # round-trip a sample file through the rcnb CLI and compare input and output bytes
24 |   - echo "The Quick Brown RC Jumps Over the NB Dog." > in.txt
25 |   - cmd: |-
26 |       .\\%CONFIGURATION%\\rcnb -e in.txt out.rcnb
27 |       .\\%CONFIGURATION%\\rcnb -d out.rcnb out.txt
28 |       fc /b in.txt out.txt && exit /b 0 || exit /b 1
29 |   - sh: |-
30 |       ./rcnb -e in.txt out.rcnb
31 |       ./rcnb -d out.rcnb out.txt
32 |       cmp in.txt out.txt
33 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea
 2 | cmake-build-debug
 3 | .vscode
 4 | .vs
 5 | build
 6 | *.o
 7 | *.a
 8 | *.so
 9 | *.dll
10 | out
11 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | ###
  2 | ### CMake settings
  3 | ###
  4 | # 3.1 is sufficient: only target_* commands that predate it are used below.
  5 | # (add_compile_definitions(), which this file used before, needs CMake 3.12.)
  6 | cmake_minimum_required(VERSION 3.1)
  7 | 
  8 | ###
  9 | ### Project settings
 10 | ###
 11 | # project(VERSION) also defines PROJECT_VERSION and PROJECT_VERSION_MAJOR/MINOR/PATCH.
 12 | project(librcnb VERSION 1.0.0 LANGUAGES C CXX)
 13 | 
 14 | # examples/c-example1.c contains non-ASCII wide-character literals, so MSVC
 15 | # has to be told that the C sources are UTF-8.
 16 | if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
 17 |     set(CMAKE_C_FLAGS "/utf-8 ${CMAKE_C_FLAGS}")
 18 | endif()
 19 | 
 20 | option(ENABLE_AVX2 "Enable AVX2 optimized code." OFF)
 21 | option(ENABLE_SSSE3 "Enable SSSE3 optimized code." OFF)
 22 | option(ENABLE_NEON "Enable NEON optimized code." OFF)
 23 | option(NATIVE_ASM "Allow compiler use best instruction set on current environment." OFF)
 24 | 
 25 | # Provides the CMAKE_INSTALL_* directory variables used by the targets and install rules.
 26 | include(GNUInstallDirs)
 27 | 
 28 | ###
 29 | ### General compilation settings
 30 | ###
 31 | # Single-configuration generators fall back to an optimised build.
 32 | if(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE)
 33 |     set(CMAKE_BUILD_TYPE Release)
 34 | endif()
 35 | 
 36 | ###
 37 | ### Sources, headers, directories and libs
 38 | ###
 39 | set(RCNB_SOURCES
 40 |     src/cencode.c
 41 |     src/cdecode.c
 42 |     src/rcnb.c
 43 | )
 44 | set(RCNB_PUBLIC_HEADERS
 45 |     include/rcnb/cencode.h
 46 |     include/rcnb/cdecode.h
 47 |     include/rcnb/encode.h
 48 |     include/rcnb/decode.h
 49 | )
 50 | 
 51 | # Instruction-set switches are collected here and attached to the two library
 52 | # targets only, instead of leaking into every target of the directory.
 53 | set(RCNB_COMPILE_OPTIONS "")
 54 | set(RCNB_COMPILE_DEFINITIONS "")
 55 | 
 56 | if(ENABLE_AVX2)
 57 |     if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
 58 |         list(APPEND RCNB_COMPILE_OPTIONS /arch:AVX2)
 59 |     else()
 60 |         list(APPEND RCNB_COMPILE_OPTIONS -mavx2)
 61 |     endif()
 62 |     list(APPEND RCNB_COMPILE_DEFINITIONS ENABLE_AVX2)
 63 |     list(APPEND RCNB_SOURCES src/rcnb_x86.c)
 64 | elseif(ENABLE_SSSE3)
 65 |     if(NOT CMAKE_C_COMPILER_ID MATCHES "MSVC")
 66 |         list(APPEND RCNB_COMPILE_OPTIONS -mssse3)
 67 |     endif()
 68 |     list(APPEND RCNB_COMPILE_DEFINITIONS ENABLE_SSSE3)
 69 |     list(APPEND RCNB_SOURCES src/rcnb_x86.c)
 70 | endif()
 71 | 
 72 | if(ENABLE_NEON)
 73 |     list(APPEND RCNB_COMPILE_DEFINITIONS ENABLE_NEON)
 74 |     list(APPEND RCNB_SOURCES src/rcnb_arm64.c)
 75 | endif()
 76 | 
 77 | if(NATIVE_ASM)
 78 |     if(NOT CMAKE_C_COMPILER_ID MATCHES "MSVC")
 79 |         list(APPEND RCNB_COMPILE_OPTIONS -march=native)
 80 |     endif()
 81 | endif()
 82 | 
 83 | add_library(rcnb SHARED ${RCNB_SOURCES})
 84 | add_library(rcnb-static STATIC ${RCNB_SOURCES})
 85 | 
 86 | # Both flavours share version, headers, include path and instruction-set flags.
 87 | foreach(rcnb_lib rcnb rcnb-static)
 88 |     set_target_properties(${rcnb_lib} PROPERTIES
 89 |             VERSION ${PROJECT_VERSION}
 90 |             SOVERSION ${PROJECT_VERSION_MAJOR}
 91 |             PUBLIC_HEADER "${RCNB_PUBLIC_HEADERS}")
 92 |     # PUBLIC: the headers are needed by the library sources and by consumers.
 93 |     target_include_directories(${rcnb_lib}
 94 |             PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
 95 |             $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
 96 |     if(RCNB_COMPILE_OPTIONS)
 97 |         target_compile_options(${rcnb_lib} PRIVATE ${RCNB_COMPILE_OPTIONS})
 98 |     endif()
 99 |     if(RCNB_COMPILE_DEFINITIONS)
100 |         target_compile_definitions(${rcnb_lib} PRIVATE ${RCNB_COMPILE_DEFINITIONS})
101 |     endif()
102 | endforeach()
103 | 
104 | # Examples and the command-line tool link the static library and inherit its
105 | # include directory through the PUBLIC usage requirement above.
106 | add_executable(example1 examples/c-example1.c)
107 | target_link_libraries(example1 PRIVATE rcnb-static)
108 | add_executable(example2 examples/c-example2.c)
109 | target_link_libraries(example2 PRIVATE rcnb-static)
110 | add_executable(example3 examples/cpp-example3.cc)
111 | set_target_properties(example3
112 |         PROPERTIES CXX_STANDARD 11)
113 | target_link_libraries(example3 PRIVATE rcnb-static)
114 | add_executable(rcnb-cli examples/cpp-rcnb-cli.cc)
115 | set_target_properties(rcnb-cli
116 |         PROPERTIES OUTPUT_NAME rcnb
117 |         CXX_STANDARD 11)
118 | target_link_libraries(rcnb-cli PRIVATE rcnb-static)
119 | 
120 | ###
121 | ### General install settings
122 | ###
123 | # Build-tree package: exports rcnb-static and generates the config files from
124 | # the *.cmake.in templates next to this file.
125 | export(
126 |         TARGETS rcnb-static
127 |         FILE "${PROJECT_BINARY_DIR}/${PROJECT_NAME}-targets.cmake")
128 | export(PACKAGE ${PROJECT_NAME})
129 | set(EXPORT_TARGETS rcnb-static CACHE INTERNAL "export targets")
130 | 
131 | set(CONFIG_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/include")
132 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}-config.cmake.in
133 |         "${PROJECT_BINARY_DIR}/${PROJECT_NAME}-config.cmake" @ONLY)
134 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}-config-version.cmake.in
135 |         "${PROJECT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake" @ONLY)
136 | 
137 | # Installed package: the shared library, its public headers and an export set.
138 | # ARCHIVE covers the import library produced on DLL platforms.
139 | install(TARGETS rcnb EXPORT ${PROJECT_NAME}-config
140 |         RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
141 |         LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
142 |         ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
143 |         PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rcnb)
144 | install(EXPORT ${PROJECT_NAME}-config DESTINATION share/${PROJECT_NAME}/cmake)
145 | # Without the version file, find_package(librcnb <version>) rejects the package.
146 | install(FILES "${PROJECT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake"
147 |         DESTINATION share/${PROJECT_NAME}/cmake)
148 | 


--------------------------------------------------------------------------------
/INSTALL:
--------------------------------------------------------------------------------
 1 | librcnb: RCNB Encoding/Decoding Routines
 2 | ======================================
 3 | 
 4 | Compiling:
 5 | ---------
 6 | ```
 7 | cmake .
 8 | make
 9 | ```
10 | 
11 | Installing:
12 | ----------
13 | ```
14 | sudo make install
15 | ```
16 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 rikakomoe
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | librcnb: RCNB Encoding/Decoding Routines
 2 | ======================================
 3 | 
 4 | Overview:
 5 | --------
 6 | librcnb is a library of ANSI C routines for fast encoding/decoding data into
 7 | and from an rcnb-encoded format. C++ wrappers are included, as well as the
 8 | source code for standalone encoding and decoding executables.
 9 | 
10 | References:
11 | ----------
12 | * RCNB.js:
13 | 	https://github.com/rcnbapp/RCNB.js
14 | * RCNB.php, another implementation of the rcnb encoding:
15 | 	https://github.com/rcnbapp/RCNB.php
16 | 
17 | Commandline Use:
18 | ---------------
19 | There is an executable available, it is simply called rcnb.
20 | It can encode and decode files, as instructed by the user.
21 | 
22 | To encode a file:
23 | $ ./rcnb -e filea fileb
24 | fileb will now be the rcnb-encoded version of filea.
25 | 
26 | To decode a file:
27 | $ ./rcnb -d fileb filec
28 | filec will now be identical to filea.
29 | 
30 | Programming:
31 | -----------
32 | Some C++ wrappers are provided as well, so you don't have to get your hands
33 | dirty. Encoding from standard input to standard output is as simple as
34 | 
35 | 	#include <rcnb/encode.h>
36 | 	#include <iostream>
37 | 	int main()
38 | 	{
39 | 		rcnb::encoder E;
40 | 		setlocale(LC_ALL, "");
41 | 		E.encode(std::cin, std::wcout);
42 | 		return 0;
43 | 	}
44 | 
45 | Both standalone executables and a static library are provided in the package.
46 | 
47 | Example code:
48 | ------------
49 | The 'examples' directory contains some simple example code, that demonstrates
50 | how to use the interface of the library.
51 | 
52 | More information:
53 | ------------
54 | Go to https://github.com/rcnbapp/librcnb/wiki to find out more information
55 | about librcnb.
56 | 


--------------------------------------------------------------------------------
/examples/c-example1.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | c-example1.c - librcnb example code
 3 | 
 4 | This is part of the librcnb project, and has been placed in the public domain.
 5 | For details, see https://github.com/rikakomoe/librcnb
 6 | 
 7 | This is a short example of how to use librcnb's C interface to encode
 8 | and decode a string directly.
 9 | */
10 | 
11 | #include <rcnb/cdecode.h>
12 | #include <rcnb/cencode.h>
13 | 
14 | #include <assert.h>
15 | #include <locale.h>
16 | #include <stdio.h>
17 | #include <stdlib.h>
18 | #include <string.h>
19 | #include <wchar.h>
20 | 
21 | /* arbitrary buffer size */
22 | #define SIZE 256
23 | 
24 | /*
25 |  * Round-trips a fixed sentence through rcnb_encode()/rcnb_decode() and checks
26 |  * the decoded text and the encoded form against known-good values.
27 |  * Returns 0 on success and EXIT_FAILURE if allocation or decoding fails.
28 |  */
29 | int main()
30 | {
31 |     const char* input = "The Quick Brown RC Jumps Over the NB Dog.";
32 |     const wchar_t* rcnb_contrast = L"ȐčnÞȒċƝÞȐĈnƁȒȼǹþȓĆǹƃřČŇbȓƇńƄȓċȵƀȐĉņþŕƇNƅɌĉŇBȓƈȠßŕƇŃBɌċnþȓȼǸƅɌćÑbȒċƝÞƦȻƝƃŕƇNbȓƇNþŕC";
33 |     /* calloc zero-fills, so both buffers are NUL-terminated even if the
34 |        library does not append a terminator itself */
35 |     wchar_t* encoded = calloc(SIZE, sizeof(wchar_t));
36 |     char* decoded = calloc(SIZE, sizeof(char));
37 |     int status = 0;
38 | 
39 |     if (encoded == NULL || decoded == NULL)
40 |     {
41 |         fprintf(stderr, "out of memory\n");
42 |         free(encoded);
43 |         free(decoded);
44 |         return EXIT_FAILURE;
45 |     }
46 | 
47 |     setlocale(LC_ALL, "");
48 | 
49 |     /* encode the data */
50 |     rcnb_encode(input, strlen(input), encoded);
51 |     /* stdout stays byte-oriented: applying printf() to a stream that
52 |        wprintf() has made wide-oriented is undefined, so the wide string is
53 |        printed through %ls instead */
54 |     printf("encoded: %ls\n", encoded);
55 | 
56 |     /* decode the data */
57 |     ptrdiff_t res = rcnb_decode(encoded, wcslen(encoded), decoded);
58 |     if (res < 0)
59 |     {
60 |         /* nothing meaningful was decoded, so there is nothing to compare */
61 |         fprintf(stderr, "decode failed\n");
62 |         status = EXIT_FAILURE;
63 |     }
64 |     else
65 |     {
66 |         printf("decoded: %s\n", decoded);
67 | 
68 |         /* compare the original and decoded data */
69 |         assert(strcmp(input, decoded) == 0);
70 |         assert(wcscmp(encoded, rcnb_contrast) == 0);
71 |     }
72 | 
73 |     free(encoded);
74 |     free(decoded);
75 |     return status;
76 | }


--------------------------------------------------------------------------------
/examples/c-example2.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | c-example2.c - librcnb example code
  3 | 
  4 | This is part of the librcnb project, and has been placed in the public domain.
  5 | For details, see https://github.com/rikakomoe/librcnb
  6 | 
  7 | This is a short example of how to use librcnb's C interface to encode
  8 | and decode a file directly.
  9 | 
 10 | The main work is done between the START/STOP ENCODING and DECODING lines.
 11 | The main difference between this code and c-example1.c is that we do not
 12 | know the size of the input file beforehand, and so we iterate over the
 13 | data, encoding and decoding it one block at a time.
 14 | */
 15 | 
 16 | #include <rcnb/cdecode.h>
 17 | #include <rcnb/cencode.h>
 18 | 
 19 | #include <locale.h>
 20 | #include <stdio.h>
 21 | #include <stdlib.h>
 22 | #include <string.h>
 23 | #include <wchar.h>
 24 | 
 25 | /* arbitrary buffer size */
 26 | #define SIZE 100
 27 | 
 28 | void encode(FILE* inputFile, FILE* outputFile)
 29 | {
 30 |     /* set up a destination buffer large enough to hold the encoded data */
 31 |     int size = SIZE;
 32 |     char* input = (char*)malloc(size);
 33 |     wchar_t* encoded = (wchar_t*)malloc(3 * size * sizeof(wchar_t)); /* ~2 wchar_t per input byte */
 34 |     /* we need an encoder state */
 35 |     rcnb_encodestate es;
 36 |     /* number of bytes read, then number of wide characters produced */
 37 |     size_t cnt = 0;
 38 | 
 39 |     /*---------- START ENCODING ----------*/
 40 |     /* initialise the encoder state */
 41 |     rcnb_init_encodestate(&es);
 42 |     /* gather data from the input and send it to the output */
 43 |     while ((cnt = fread(input, sizeof(char), size, inputFile)) > 0)
 44 |     {
 45 |         cnt = rcnb_encode_block(input, cnt, encoded, &es);
 46 |         /* terminate at the returned length: fputws() needs a NUL, and the
 47 |            C++ wrapper in encode.h likewise treats the result as a count */
 48 |         encoded[cnt] = L'\0';
 49 |         fputws(encoded, outputFile);
 50 |     }
 51 |     /* since we have reached the end of the input file, we know that
 52 |        there is no more input data; finalise the encoding */
 53 |     cnt = rcnb_encode_blockend(encoded, &es);
 54 |     /* terminate and write the last characters to the output file */
 55 |     encoded[cnt] = L'\0';
 56 |     fputws(encoded, outputFile);
 57 |     /*---------- STOP ENCODING  ----------*/
 58 |     free(encoded);
 59 |     free(input);
 60 | }
 61 | 
 62 | void decode(FILE* inputFile, FILE* outputFile)
 63 | {
 64 |     /* set up a destination buffer large enough to hold the decoded data */
 65 |     int size = SIZE;
 66 |     wchar_t* encoded = (wchar_t*)malloc(3 * size * sizeof(wchar_t));
 67 |     char* decoded = (char*)malloc(size); /* ~1/2 x encoded */
 68 |     /* we need a decoder state */
 69 |     rcnb_decodestate ds;
 70 |     /* bytes decoded by one call; signed like the library's ptrdiff_t result */
 71 |     ptrdiff_t cnt = 0;
 72 | 
 73 |     /*---------- START DECODING ----------*/
 74 |     /* initialise the decoder state */
 75 |     rcnb_init_decodestate(&ds);
 76 |     /* gather data from the input and send it to the output */
 77 |     while (cnt >= 0 && fgetws(encoded, size, inputFile))
 78 |     {
 79 |         cnt = rcnb_decode_block(encoded, wcslen(encoded), decoded, &ds);
 80 |         /* a negative result must not reach fwrite(): as size_t it would be huge */
 81 |         if (cnt > 0) fwrite(decoded, sizeof(char), (size_t)cnt, outputFile);
 82 |     }
 83 |     /* since we have reached the end of the input file, we know that
 84 |        there is no more input data; finalise the decoding */
 85 |     if (cnt >= 0) cnt = rcnb_decode_blockend(decoded, &ds);
 86 |     /* write the last bytes to the output file, or report the failure */
 87 |     if (cnt > 0) fwrite(decoded, sizeof(char), (size_t)cnt, outputFile);
 88 |     if (cnt < 0) fprintf(stderr, "decode failed\n");
 89 |     /*---------- STOP DECODING  ----------*/
 90 |     free(encoded);
 91 |     free(decoded);
 92 | }
 93 | 
 94 | /* Opens a file, or reports the failing path and terminates the program. */
 95 | static FILE* open_or_exit(const char* path, const char* mode)
 96 | {
 97 |     FILE* file = fopen(path, mode);
 98 |     if (file == NULL)
 99 |     {
100 |         perror(path);
101 |         exit(-1);
102 |     }
103 |     return file;
104 | }
105 | 
106 | /*
107 |  * Encodes argv[1] into argv[2], then decodes argv[2] back into argv[3].
108 |  * Exits with -1 when fewer than three file names are supplied or when one
109 |  * of the files cannot be opened.
110 |  */
111 | int main(int argc, char** argv)
112 | {
113 |     FILE* inputFile;
114 |     FILE* encodedFile;
115 |     FILE* decodedFile;
116 | 
117 |     if (argc < 4)
118 |     {
119 |         printf("please supply three filenames: input, encoded & decoded\n");
120 |         exit(-1);
121 |     }
122 | 
123 |     setlocale(LC_ALL, "");
124 | 
125 |     /* encode the input file */
126 | 
127 |     /* the plain input is raw bytes read with fread(), so it is opened as an
128 |        ordinary binary stream; only the encoded text uses ccs=UTF-8 */
129 |     inputFile   = open_or_exit(argv[1], "rb");
130 |     encodedFile = open_or_exit(argv[2], "wb, ccs=UTF-8");
131 | 
132 |     encode(inputFile, encodedFile);
133 | 
134 |     fclose(inputFile);
135 |     fclose(encodedFile);
136 | 
137 |     /* decode the encoded file */
138 | 
139 |     /* the decoded output is raw bytes written with fwrite() */
140 |     encodedFile = open_or_exit(argv[2], "rb, ccs=UTF-8");
141 |     decodedFile = open_or_exit(argv[3], "wb");
142 | 
143 |     decode(encodedFile, decodedFile);
144 | 
145 |     fclose(encodedFile);
146 |     fclose(decodedFile);
147 | 
148 |     return 0;
149 | }
150 | 
151 | 


--------------------------------------------------------------------------------
/examples/cpp-example3.cc:
--------------------------------------------------------------------------------
 1 | /*
 2 | cpp-example3.cc - c++ source to a rcnb reference encoder and decoder
 3 | 
 4 | This is part of the librcnb project, and has been placed in the public domain.
 5 | For details, see https://github.com/rikakomoe/librcnb
 6 | */
 7 | 
 8 | #include <rcnb/encode.h>
 9 | #include <iostream>
10 | 
11 | int main()
12 | {
13 |     // adopt the environment's locale before any wide output is produced
14 |     setlocale(LC_ALL, "");
15 |     rcnb::encoder E;
16 |     E.encode(std::cin, std::wcout);
17 | }
18 | 


--------------------------------------------------------------------------------
/examples/cpp-rcnb-cli.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 | rcnb.cc - c++ source to a rcnb reference encoder and decoder
  3 | 
  4 | This is part of the librcnb project, and has been placed in the public domain.
  5 | For details, see https://github.com/rikakomoe/librcnb
  6 | */
  7 | 
  8 | #include <rcnb/decode.h>
  9 | #include <rcnb/encode.h>
 10 | 
 11 | #include <codecvt>
 12 | #include <cstdlib>
 13 | #include <fstream>
 14 | #include <iostream>
 15 | #include <string>
 16 | 
 17 | void usage()
 18 | {
 19 |     // adjacent string literals are concatenated into one message for stderr
 20 |     std::cerr << "rcnb: Encodes and Decodes files using rcnb\n"
 21 |                  "Usage: rcnb [-e|-d] [input] [output]\n"
 22 |                  "   Where [-e] will encode the input file into the output file,\n"
 23 |                  "         [-d] will decode the input file into the output file, and\n"
 24 |                  "         [input] and [output] are the input and output files, respectively.\n";
 25 | }
 26 | 
 27 | void usage(const std::string& message)
 28 | {
 29 |     // synopsis first, then the reason this particular invocation was rejected
 30 |     usage();
 31 |     std::cerr << "Incorrect invocation of rcnb:\n" << message << std::endl;
 32 | }
 33 | 
 34 | // Command-line entry point: "rcnb -e in out" encodes, "rcnb -d in out" decodes.
 35 | // Prints the usage text and exits with -1 when the arguments are invalid or a
 36 | // file cannot be opened; returns 0 otherwise.
 37 | int main(int argc, char** argv)
 38 | {
 39 |     if (argc == 1)
 40 |     {
 41 |         usage();
 42 |         exit(-1);
 43 |     }
 44 |     if (argc != 4)
 45 |     {
 46 |         usage("Wrong number of arguments!");
 47 |         exit(-1);
 48 |     }
 49 | 
 50 |     std::string input = argv[2];
 51 |     std::string output = argv[3];
 52 | 
 53 |     // determine whether we need to encode or decode:
 54 |     std::string choice = argv[1];
 55 |     if (choice == "-d")
 56 |     {
 57 |         // the encoded side is wide text, converted from UTF-8 by the imbued facet
 58 |         std::wifstream instream(input.c_str(), std::ios_base::in | std::ios_base::binary);
 59 |         if (!instream.is_open())
 60 |         {
 61 |             usage("Could not open input file!");
 62 |             exit(-1);
 63 |         }
 64 |         std::locale utf8_locale(std::locale(), new std::codecvt_utf8<wchar_t>);
 65 |         instream.imbue(utf8_locale);
 66 | 
 67 |         std::ofstream outstream(output.c_str(), std::ios_base::out | std::ios_base::binary);
 68 |         if (!outstream.is_open())
 69 |         {
 70 |             usage("Could not open output file!");
 71 |             exit(-1);
 72 |         }
 73 |         rcnb::decoder D;
 74 |         D.decode(instream, outstream);
 75 |     }
 76 |     else if (choice == "-e")
 77 |     {
 78 |         std::ifstream instream(input.c_str(), std::ios_base::in | std::ios_base::binary);
 79 |         if (!instream.is_open())
 80 |         {
 81 |             usage("Could not open input file!");
 82 |             exit(-1);
 83 |         }
 84 | 
 85 |         // the encoded side is wide text, converted to UTF-8 by the imbued facet
 86 |         std::wofstream outstream(output.c_str(), std::ios_base::out | std::ios_base::binary);
 87 |         if (!outstream.is_open())
 88 |         {
 89 |             usage("Could not open output file!");
 90 |             exit(-1);
 91 |         }
 92 |         std::locale utf8_locale(std::locale(), new std::codecvt_utf8<wchar_t>);
 93 |         outstream.imbue(utf8_locale);
 94 |         rcnb::encoder E;
 95 |         E.encode(instream, outstream);
 96 |     }
 97 |     else
 98 |     {
 99 |         std::cout<<"["<<choice<<"]"<<std::endl;
100 |         usage("Please specify -d or -e as first argument!");
101 |         // an unknown mode is a bad invocation like the ones above, so it
102 |         // must not report success to the calling shell
103 |         exit(-1);
104 |     }
105 | 
106 |     return 0;
107 | }
108 | 


--------------------------------------------------------------------------------
/examples/python3-example.py:
--------------------------------------------------------------------------------
 1 | """
 2 | python3-example.py - librcnb example code
 3 | 
 4 | This is part of the librcnb project, and has been placed in the public domain.
 5 | For details, see https://github.com/rikakomoe/librcnb
 6 | 
 7 | This is a short example of how to use librcnb's shared library to encode
 8 | and decode a string directly.
 9 | """
10 | 
11 | from ctypes import *
12 | 
13 | rcnb = CDLL('librcnb.so')
14 | 
15 | origin = 'The Quick Brown RC Jumps Over the NB Dog. 😄'.encode('utf-8')
16 | encoded = create_unicode_buffer(256)
17 | rcnb.rcnb_encode(c_char_p(origin), len(origin), encoded)
18 | print(encoded.value)
19 | 
20 | decoded = create_string_buffer(256)
21 | rcnb.rcnb_decode(encoded, len(encoded.value), decoded)
22 | print(decoded.value.decode('utf-8'))
23 | 


--------------------------------------------------------------------------------
/include/rcnb/cdecode.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | cdecode.h - c header for an rcnb decoding algorithm
 3 | 
 4 | This is part of the librcnb project, and has been placed in the public domain.
 5 | For details, see https://github.com/rikakomoe/librcnb
 6 | */
 7 | 
 8 | #ifndef RCNB_CDECODE_H
 9 | #define RCNB_CDECODE_H
10 | 
11 | #include <stddef.h>
12 | #include <stdbool.h>
13 | 
14 | typedef struct /* carry-over between successive rcnb_decode_block() calls */
15 | {
16 |     size_t i; /* number of code units currently held in trailing_code */
17 |     wchar_t trailing_code[4]; /* input code units buffered by rcnb_decode_block() */
18 | } rcnb_decodestate;
19 | 
20 | void rcnb_init_decodestate(rcnb_decodestate* state_in);
21 | ptrdiff_t rcnb_decode_block(const wchar_t* code_in, size_t length_in, char* plaintext_out, rcnb_decodestate* state_in);
22 | ptrdiff_t rcnb_decode_blockend(char* plaintext_out, rcnb_decodestate* state_in);
23 | ptrdiff_t rcnb_decode(const wchar_t* code_in, size_t length_in, char* plaintext_out);
24 | 
25 | int rcnb_decode_32n_asm(const char *value_in, char *value_out, size_t n);
26 | 
27 | #endif //RCNB_CDECODE_H
28 | 


--------------------------------------------------------------------------------
/include/rcnb/cencode.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | cencode.h - c header for an rcnb encoding algorithm
 3 | 
 4 | This is part of the librcnb project, and has been placed in the public domain.
 5 | For details, see https://github.com/rikakomoe/librcnb
 6 | */
 7 | 
 8 | #ifndef RCNB_CENCODE_H
 9 | #define RCNB_CENCODE_H
10 | 
11 | #include <stddef.h>
12 | #include <stdbool.h>
13 | 
14 | typedef struct /* encoder state passed to the rcnb_encode_block*() functions */
15 | {
16 |     bool cached; /* presumably set while trailing_byte holds an unpaired input byte - TODO confirm in cencode.c */
17 |     char trailing_byte; /* presumably the input byte carried over to the next block - TODO confirm */
18 | } rcnb_encodestate;
19 | 
20 | void rcnb_init_encodestate(rcnb_encodestate* state_in);
21 | size_t rcnb_encode_block(const char* plaintext_in, size_t length_in, wchar_t* code_out, rcnb_encodestate* state_in);
22 | size_t rcnb_encode_blockend(wchar_t* code_out, rcnb_encodestate* state_in);
23 | size_t rcnb_encode(const char* plaintext_in, size_t length_in, wchar_t* code_out);
24 | 
25 | void rcnb_encode_32n_asm(const char *value_in, char *value_out, size_t n);
26 | 
27 | #endif /* RCNB_CENCODE_H */
28 | 


--------------------------------------------------------------------------------
/include/rcnb/decode.h:
--------------------------------------------------------------------------------
 1 | // :mode=c++:
 2 | 
 3 | /*
 4 | decode.h - c++ wrapper for an rcnb decoding algorithm
 5 | 
 6 | This is part of the librcnb project, and has been placed in the public domain.
 7 | For details, see https://github.com/rikakomoe/librcnb
 8 | */
 9 | 
10 | #ifndef RCNB_DECODE_H
11 | #define RCNB_DECODE_H
12 | 
13 | #define BUFFERSIZE 4096
14 | 
15 | #include <iostream>
16 | 
17 | namespace rcnb {
18 | 
19 | extern "C" {
20 |     #include "cdecode.h"
21 | }
22 | 
23 | struct decoder {
24 |     rcnb_decodestate _state;
25 |     int _buffersize;
26 | 
27 |     // Initialises _state right away so the block-level decode()/decode_end()
28 |     // overloads work even when initialize() was never called explicitly.
29 |     explicit decoder(int buffersize_in = BUFFERSIZE) : _buffersize(buffersize_in)
30 |     {
31 |         initialize();
32 |     }
33 | 
34 |     // Resets the decoder state before a new stream is processed.
35 |     void initialize()
36 |     {
37 |         rcnb_init_decodestate(&_state);
38 |     }
39 | 
40 |     // Decodes one block of code units; incomplete groups are kept in _state.
41 |     ptrdiff_t decode(const wchar_t* code_in, size_t length_in, char* plaintext_out) {
42 |         return rcnb_decode_block(code_in, length_in, plaintext_out, &_state);
43 |     }
44 | 
45 |     // Finishes decoding once the last block has been passed to decode().
46 |     ptrdiff_t decode_end(char* const plaintext_out) {
47 |         return rcnb_decode_blockend(plaintext_out, &_state);
48 |     }
49 | 
50 |     // Decodes everything readable from istream_in and writes the bytes to ostream_in.
51 |     void decode(std::wistream& istream_in, std::ostream& ostream_in)
52 |     {
53 |         initialize();
54 |         const int N = _buffersize;
55 |         char* plaintext = new char[N];
56 |         auto code = new wchar_t[3 * N];
57 |         long plainlength;
58 |         do {
59 |             istream_in.read(code, N);
60 |             plainlength = decode(code, istream_in.gcount(), plaintext);
61 |             ostream_in.write(plaintext, plainlength);
62 |         } while (istream_in.good() && plainlength > 0);
63 |         plainlength = decode_end(plaintext);
64 |         ostream_in.write(plaintext, plainlength);
65 |         delete[] code;
66 |         delete[] plaintext;
67 |     }
68 | };
69 | 
70 | } // namespace rcnb
71 | 
72 | #endif // RCNB_DECODE_H


--------------------------------------------------------------------------------
/include/rcnb/encode.h:
--------------------------------------------------------------------------------
 1 | // :mode=c++:
 2 | 
 3 | /*
 4 | encode.h - c++ wrapper for an rcnb encoding algorithm
 5 | 
 6 | This is part of the librcnb project, and has been placed in the public domain.
 7 | For details, see https://github.com/rikakomoe/librcnb
 8 | */
 9 | 
10 | #ifndef RCNB_ENCODE_H
11 | #define RCNB_ENCODE_H
12 | 
13 | #define BUFFERSIZE 4096
14 | 
15 | #include <iostream>
16 | 
17 | namespace rcnb {
18 | 
19 | extern "C" {
20 |     #include "cencode.h"
21 | }
22 | 
23 | struct encoder {
24 |     rcnb_encodestate _state;
25 |     int _buffersize;
26 | 
27 |     // Initialises _state right away so the block-level encode()/encode_end()
28 |     // overloads never operate on an indeterminate state.
29 |     explicit encoder(int buffersize_in = BUFFERSIZE) : _buffersize(buffersize_in)
30 |     {
31 |         initialize();
32 |     }
33 | 
34 |     // Resets the encoder state before a new stream is processed.
35 |     void initialize()
36 |     {
37 |         rcnb_init_encodestate(&_state);
38 |     }
39 | 
40 |     // Encodes one block of bytes; the result is used as a count of code units.
41 |     size_t encode(const char* plaintext_in, size_t length_in, wchar_t* const code_out) {
42 |         return rcnb_encode_block(plaintext_in, length_in, code_out, &_state);
43 |     }
44 | 
45 |     // Finishes encoding once the last block has been passed to encode().
46 |     size_t encode_end(wchar_t* const code_out) {
47 |         return rcnb_encode_blockend(code_out, &_state);
48 |     }
49 | 
50 |     // Encodes everything readable from istream_in and writes the code to ostream_in.
51 |     void encode(std::istream& istream_in, std::wostream& ostream_in)
52 |     {
53 |         initialize();
54 | 
55 |         const int N = _buffersize;
56 |         char* plaintext = new char[N];
57 |         auto code = new wchar_t[3 * N];
58 |         long plainlength;
59 |         do {
60 |             istream_in.read(plaintext, N);
61 |             plainlength = istream_in.gcount();
62 |             ostream_in.write(code, encode(plaintext, plainlength, code));
63 |         } while (istream_in.good() && plainlength > 0);
64 |         ostream_in.write(code, encode_end(code));
65 |         delete[] code;
66 |         delete[] plaintext;
67 |     }
68 | };
69 | 
70 | } // namespace rcnb
71 | 
72 | #endif // RCNB_ENCODE_H


--------------------------------------------------------------------------------
/include/rcnb/rcnb.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | rcnb.h - c header to an rcnb encoding algorithm
 3 | 
 4 | This is part of the librcnb project, and has been placed in the public domain.
 5 | For details, see https://github.com/rikakomoe/librcnb
 6 | */
 7 | 
 8 | #ifndef RCNB_RCNB_H
 9 | #define RCNB_RCNB_H
10 | 
11 | #include <stddef.h>
12 | 
13 | extern const wchar_t cr[]; /* alphabet searched for the 'r' glyph of a group */
14 | extern const wchar_t cc[]; /* alphabet searched for the 'c' glyph */
15 | extern const wchar_t cn[]; /* alphabet searched for the 'n' glyph */
16 | extern const wchar_t cb[]; /* alphabet searched for the 'b' glyph */
17 | 
18 | extern const unsigned short sr; /* length passed to find() together with cr */
19 | extern const unsigned short sc; /* length passed to find() together with cc */
20 | extern const unsigned short sn; /* length passed to find() together with cn */
21 | extern const unsigned short sb; /* length passed to find() together with cb */
22 | 
23 | #define src (sr * sc) /* product of the r and c alphabet sizes */
24 | #define snb (sn * sb) /* weight of the c index in a four-glyph group */
25 | #define scnb (sc * snb) /* weight of the r index in a four-glyph group */
26 | 
27 | #endif // RCNB_RCNB_H
28 | 


--------------------------------------------------------------------------------
/librcnb-config-version.cmake.in:
--------------------------------------------------------------------------------
 1 | set(PACKAGE_VERSION "@PROJECT_VERSION@")
 2 | 
 3 | # A request is accepted unless it asks for something newer than this package;
 4 | # an identical version is additionally reported as an exact match.
 5 | if(NOT "${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}")
 6 |     set(PACKAGE_VERSION_COMPATIBLE TRUE)
 7 |     if("${PACKAGE_VERSION}" VERSION_EQUAL "${PACKAGE_FIND_VERSION}")
 8 |         set(PACKAGE_VERSION_EXACT TRUE)
 9 |     endif()
10 | else()
11 |     set(PACKAGE_VERSION_COMPATIBLE FALSE)
12 | endif()
13 | 


--------------------------------------------------------------------------------
/librcnb-config.cmake.in:
--------------------------------------------------------------------------------
 1 | # Package configuration template for librcnb.
 2 | # Variables provided to the consumer:
 3 | #  RCNB_INCLUDE_DIR - include directory
 4 | #  RCNB_LIBRARIES   - libraries to link against
 5 | 
 6 | # Directory that holds this file and the generated targets file
 7 | set(RCNB_CMAKE_DIR "${CMAKE_CURRENT_LIST_DIR}")
 8 | set(RCNB_INCLUDE_DIR "@CONFIG_INCLUDE_DIRS@")
 9 | 
10 | # Pull in the definitions of the IMPORTED targets
11 | include("${RCNB_CMAKE_DIR}/@PROJECT_NAME@-targets.cmake")
12 | 
13 | # Names of the IMPORTED targets created by the file included above
14 | set(RCNB_LIBRARIES "@EXPORT_TARGETS@")
15 | 


--------------------------------------------------------------------------------
/src/cdecode.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | cdecode.c - c source to an rcnb decoding algorithm
  3 | 
  4 | This is part of the librcnb project, and has been placed in the public domain.
  5 | For details, see https://github.com/rikakomoe/librcnb
  6 | */
  7 | 
  8 | #include <rcnb/cdecode.h>
  9 | #include <rcnb/rcnb.h>
 10 | 
 11 | int find(const wchar_t* const arr, const unsigned length, const wchar_t target)
 12 | {
 13 |     /* linear scan: index of the first element equal to target, or -1 */
 14 |     unsigned pos = 0;
 15 |     while (pos < length && arr[pos] != target)
 16 |         ++pos;
 17 |     return pos < length ? (int)pos : -1;
 18 | }
 19 | 
 20 | void rcnb_init_decodestate(rcnb_decodestate* state_in) /* resets the decoder carry-over */
 21 | {
 22 |     state_in->i = 0; /* no code units are buffered in trailing_code yet */
 23 | }
 24 | 
 25 | bool rcnb_decode_short(const wchar_t* value_in, char** value_out)
 26 | {
 27 |     /* Decodes four glyphs into two bytes, high byte first, advancing
 28 |        *value_out; returns false if a glyph is unknown or the value is too big.
 29 |        An 'r' glyph first means the order r c n b; otherwise the swapped order
 30 |        n b r c is assumed, which sets bit 15 of the result. */
 31 |     const int first = find(cr, sr, value_in[0]); /* looked up once, reused below */
 32 |     const bool reverse = first < 0;
 33 |     const int rc = reverse ? 2 : 0; /* offset of the (r, c) pair */
 34 |     const int nb = reverse ? 0 : 2; /* offset of the (n, b) pair */
 35 |     int idx[4];
 36 |     idx[0] = reverse ? find(cr, sr, value_in[rc]) : first;
 37 |     idx[1] = find(cc, sc, value_in[rc + 1]);
 38 |     idx[2] = find(cn, sn, value_in[nb]);
 39 |     idx[3] = find(cb, sb, value_in[nb + 1]);
 40 |     if (idx[0] < 0 || idx[1] < 0 || idx[2] < 0 || idx[3] < 0)
 41 |         return false;
 42 |     int result = idx[0] * scnb + idx[1] * snb + idx[2] * sb + idx[3];
 43 |     if (result > 0x7FFF)
 44 |         return false;
 45 |     if (reverse) result |= 0x8000;
 46 |     *(*value_out)++ = (char)(result >> 8);
 47 |     *(*value_out)++ = (char)(result & 0xFF);
 48 |     return true;
 49 | }
 50 | 
 51 | bool rcnb_decode_byte(const wchar_t* value_in, char** value_out)
 52 | {
 53 |     bool nb = false;
 54 |     int idx[2] = { find(cr, sr, *value_in), find(cc, sc, *(value_in + 1)) };
 55 |     if (idx[0] < 0 || idx[1] < 0) {
 56 |         idx[0] = find(cn, sn, *value_in);
 57 |         idx[1] = find(cb, sb, *(value_in + 1));
 58 |         nb = true;
 59 |     }
 60 |     if (idx[0] < 0 || idx[1] < 0)
 61 |         return false;
 62 |     int result = nb ? idx[0] * sb + idx[1] : idx[0] * sc + idx[1];
 63 |     if (result > 0x7F)
 64 |         return false;
 65 |     *(*value_out)++ = (char)(nb ? result | 0x80 : result);
 66 |     return true;
 67 | }
 68 | 
 69 | ptrdiff_t rcnb_decode_block(const wchar_t* code_in, size_t length_in,
 70 |         char* const plaintext_out, rcnb_decodestate* state_in)
 71 | {
 72 |     char* plaintext_char = plaintext_out;
 73 |     bool res;
 74 |     while (state_in->i < 4 && length_in > 0) {
 75 |         state_in->trailing_code[state_in->i++] = code_in[0];
 76 |         length_in--;
 77 |         code_in++;
 78 |     }
 79 |     if (length_in == 0)
 80 |         return 0;
 81 |     res = rcnb_decode_short(state_in->trailing_code, &plaintext_char);
 82 |     if (!res)
 83 |         return -1;
 84 |     state_in->i = 0;
 85 | #if defined(ENABLE_AVX2) || defined(ENABLE_SSSE3) || defined(ENABLE_NEON)
 86 |     size_t batch = length_in >> 6;
 87 |     if (batch > 0) {
 88 |         res = rcnb_decode_32n_asm((const char *)code_in, plaintext_char, batch);
 89 |         if (!res)
 90 |             return -1;
 91 |     }
 92 |     plaintext_char += 32 * batch;
 93 |     code_in += 64 * batch;
 94 |     length_in = length_in & 63;
 95 | #endif
 96 |     for (int i = 0; i < (length_in >> 2); ++i) {
 97 |         res = rcnb_decode_short(code_in + i * 4, &plaintext_char);
 98 |         if (!res)
 99 |             return -1;
100 |     }
101 |     state_in->i = length_in % 4;
102 |     for (size_t j = 0; j < state_in->i; ++j) {
103 |         state_in->trailing_code[j] = code_in[length_in - state_in->i + j];
104 |     }
105 |     *plaintext_char = 0;
106 |     return plaintext_char - plaintext_out;
107 | }
108 | 
109 | ptrdiff_t rcnb_decode_blockend(char* const plaintext_out, rcnb_decodestate* state_in)
110 | {
111 |     if (state_in->i != 0 && state_in->i != 2)
112 |         return -1;
113 |     char* plaintext_char = plaintext_out;
114 |     if (state_in->i == 2) {
115 |         if(!rcnb_decode_byte(state_in->trailing_code, &plaintext_char))
116 |             return -1;
117 |     }
118 |     *plaintext_char = 0;
119 |     state_in->i = 0;
120 |     return plaintext_char - plaintext_out;
121 | }
122 | 
123 | ptrdiff_t rcnb_decode(const wchar_t* code_in, size_t length_in, char* plaintext_out)
124 | {
125 |     if (length_in == 0)
126 |         return 0;
127 |     rcnb_decodestate es;
128 |     rcnb_init_decodestate(&es);
129 |     size_t output_size = 0;
130 |     ptrdiff_t block_size = 0;
131 |     block_size = rcnb_decode_block(code_in, length_in, plaintext_out, &es);
132 |     if (block_size < 0)
133 |         return -1;
134 |     output_size += block_size;
135 |     block_size = rcnb_decode_blockend(plaintext_out + output_size, &es);
136 |     if (block_size < 0)
137 |         return -1;
138 |     return output_size + block_size;
139 | }
140 | 


--------------------------------------------------------------------------------
/src/cencode.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | cencode.c - c source to an rcnb encoding algorithm
  3 | 
  4 | This is part of the librcnb project, and has been placed in the public domain.
  5 | For details, see https://github.com/rikakomoe/librcnb
  6 | */
  7 | 
  8 | #include <rcnb/cencode.h>
  9 | #include <rcnb/rcnb.h>
 10 | 
/*
 * Resets an encoder state: marks that no trailing byte is currently cached.
 */
void rcnb_init_encodestate(rcnb_encodestate* state_in)
{
    state_in->cached = false;
}
 15 | 
 16 | void rcnb_encode_short(unsigned short value_in, wchar_t** value_out)
 17 | {
 18 |     bool reverse = false;
 19 |     if (value_in > 0x7FFF) {
 20 |         reverse = true;
 21 |         value_in = (unsigned short)(value_in & 0x7FFF);
 22 |     }
 23 |     if (reverse)
 24 |         *value_out += 2;
 25 |     *(*value_out)++ = cr[value_in / scnb];
 26 |     *(*value_out)++ = cc[value_in % scnb / snb];
 27 |     if (reverse)
 28 |         *value_out -= 4;
 29 |     *(*value_out)++ = cn[value_in % snb / sb];
 30 |     *(*value_out)++ = cb[value_in % sb];
 31 |     if (reverse)
 32 |         *value_out += 2;
 33 | }
 34 | 
 35 | void rcnb_encode_byte(unsigned char value_in, wchar_t** value_out)
 36 | {
 37 |     if (value_in > 0x7F) {
 38 |         value_in = (unsigned char)(value_in & 0x7F);
 39 |         *(*value_out)++ = cn[value_in / sb];
 40 |         *(*value_out)++ = cb[value_in % sb];
 41 |         return;
 42 |     }
 43 |     *(*value_out)++ = cr[value_in / sc];
 44 |     *(*value_out)++ = cc[value_in % sc];
 45 | }
 46 | 
 47 | size_t rcnb_encode_block(const char* plaintext_in, size_t length_in,
 48 |         wchar_t* const code_out, rcnb_encodestate* state_in)
 49 | {
 50 |     if (length_in == 0)
 51 |         return 0;
 52 |     wchar_t* code_char = code_out;
 53 |     if (state_in->cached) {
 54 |         rcnb_encode_short(*(unsigned char*)(&state_in->trailing_byte) << 8 | *(unsigned char*)(&plaintext_in[0]),
 55 |                 &code_char);
 56 |         plaintext_in++;
 57 |         length_in--;
 58 |         state_in->cached = false;
 59 |     }
 60 | #if defined(ENABLE_AVX2) || defined(ENABLE_SSSE3) || defined(ENABLE_NEON)
 61 |     size_t batch = length_in >> 5;
 62 |     if (batch > 0) {
 63 |         rcnb_encode_32n_asm(plaintext_in, (char *) code_char, batch);
 64 |     }
 65 |     plaintext_in += 32 * batch;
 66 |     code_char += 64 * batch;
 67 |     length_in = length_in & 31;
 68 | #endif
 69 |     for (int i = 0; i < (length_in >> 1); ++i)
 70 |         rcnb_encode_short(*(unsigned char*)(&plaintext_in[i * 2]) << 8 | *(unsigned char*)(&plaintext_in[i * 2 + 1]),
 71 |                 &code_char);
 72 |     if (length_in & 1) {
 73 |         state_in->trailing_byte = plaintext_in[length_in - 1];
 74 |         state_in->cached = true;
 75 |     }
 76 |     *code_char = 0;
 77 |     return code_char - code_out;
 78 | }
 79 | 
 80 | size_t rcnb_encode_blockend(wchar_t* const code_out, rcnb_encodestate* state_in)
 81 | {
 82 |     wchar_t* code_char = code_out;
 83 |     if (state_in->cached) {
 84 |         rcnb_encode_byte(*(unsigned char*)(&state_in->trailing_byte), &code_char);
 85 |     }
 86 |     *code_char = 0;
 87 |     state_in->cached = false;
 88 |     return code_char - code_out;
 89 | }
 90 | 
 91 | size_t rcnb_encode(const char* plaintext_in, size_t length_in, wchar_t* code_out)
 92 | {
 93 |     rcnb_encodestate es;
 94 |     rcnb_init_encodestate(&es);
 95 |     size_t output_size = 0;
 96 |     output_size += rcnb_encode_block(plaintext_in, length_in, code_out, &es);
 97 |     output_size += rcnb_encode_blockend(code_out + output_size, &es);
 98 |     return output_size;
 99 | }
100 | 


--------------------------------------------------------------------------------
/src/rcnb.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | rcnb.c - c source to an rcnb encoding algorithm
 3 | 
 4 | This is part of the librcnb project, and has been placed in the public domain.
 5 | For details, see https://github.com/rikakomoe/librcnb
 6 | */
 7 | 
 8 | #include <rcnb/rcnb.h>
 9 | 
/*
 * The four RCNB alphabets.
 *
 * Non-ASCII members are spelled as universal character names so that the
 * values do not depend on how the compiler decodes this source file (a raw
 * UTF-8 literal is misread when the compiler assumes a different source
 * character set). Each entry is the Unicode code point of the letter.
 */

/* R alphabet: r R, U+0154..U+0159, U+01A6, U+0210..U+0213, U+024C, U+024D */
const wchar_t cr[] = {
    L'r',      L'R',      L'\u0154', L'\u0155', L'\u0156',
    L'\u0157', L'\u0158', L'\u0159', L'\u01A6', L'\u0210',
    L'\u0211', L'\u0212', L'\u0213', L'\u024C', L'\u024D'
};

/* C alphabet: c C, U+0106..U+010D, U+0187, U+0188, U+00C7, U+023B, U+023C */
const wchar_t cc[] = {
    L'c',      L'C',      L'\u0106', L'\u0107', L'\u0108',
    L'\u0109', L'\u010A', L'\u010B', L'\u010C', L'\u010D',
    L'\u0187', L'\u0188', L'\u00C7', L'\u023B', L'\u023C'
};

/* N alphabet: n N, U+0143..U+0148, U+019D, U+019E, U+00D1, U+01F8, U+01F9,
   U+0220, U+0235 */
const wchar_t cn[] = {
    L'n',      L'N',      L'\u0143', L'\u0144', L'\u0145',
    L'\u0146', L'\u0147', L'\u0148', L'\u019D', L'\u019E',
    L'\u00D1', L'\u01F8', L'\u01F9', L'\u0220', L'\u0235'
};

/* B alphabet: b B, U+0180, U+0181, U+0183..U+0185, U+00DF, U+00DE, U+00FE */
const wchar_t cb[] = {
    L'b',      L'B',      L'\u0180', L'\u0181', L'\u0183',
    L'\u0184', L'\u0185', L'\u00DF', L'\u00DE', L'\u00FE'
};

/* Number of characters in each alphabet. */
const unsigned short sr = sizeof(cr) / sizeof(cr[0]);
const unsigned short sc = sizeof(cc) / sizeof(cc[0]);
const unsigned short sn = sizeof(cn) / sizeof(cn[0]);
const unsigned short sb = sizeof(cb) / sizeof(cb[0]);
19 | 


--------------------------------------------------------------------------------
/src/rcnb_arm64.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | rcnb_arm64.c - arm64 intrinsic source to an rcnb encoding and decoding algorithm
  3 | 
  4 | This is part of the librcnb project, and has been placed in the public domain.
  5 | For details, see https://github.com/rikakomoe/librcnb
  6 | */
  7 | 
  8 | #if defined(ENABLE_NEON)
  9 | 
 10 | #include <arm_neon.h>
 11 | #include <rcnb/cencode.h>
 12 | #include <rcnb/cdecode.h>
 13 | 
/*
 * Encoder tables: low byte of each alphabet character's code point, indexed
 * by the character's position in its alphabet. Every table is 16 bytes long
 * (unused tail entries are zero) so it can be loaded as one 128-bit vector
 * and used with a byte table lookup.
 */
static const unsigned char r_lo[16] = {114, 82, 84, 85, 86, 87, 88, 89, 166, 16, 17, 18, 19, 76, 77};
static const unsigned char c_lo[16] = {99, 67, 6, 7, 8, 9, 10, 11, 12, 13, 135, 136, 199, 59, 60};
static const unsigned char n_lo[16] = {110, 78, 67, 68, 69, 70, 71, 72, 157, 158, 209, 248, 249, 32, 53};
static const unsigned char b_lo[16] = {98, 66, 128, 129, 131, 132, 133, 223, 222, 254};

/* Encoder tables: matching high byte of each alphabet character's code point. */
static const unsigned char r_hi[16] = {0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2};
static const unsigned char c_hi[16] = {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2};
static const unsigned char n_hi[16] = {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 2};
static const unsigned char b_hi[16] = {0, 0, 1, 1, 1, 1, 1, 0, 0, 0};

/*
 * Decoder table: indexed by a 3-bit hash of a group's first character
 * ((ch * 2117) >> 13 in 16-bit arithmetic). 255 selects the swapped n-b-r-c
 * order, 0 the natural r-c-n-b order.
 */
static const unsigned char s_tbl[16] = {0, 0, 255, 255, 255, 0, 255, 0};

/*
 * Decoder tables: indexed by a 4-bit hash of the character, giving the
 * character's position in its alphabet. 255 marks a hash slot that no
 * alphabet character maps to; the decoder treats it as invalid input.
 */
static const unsigned char r_tbl[16] = {14, 8, 0, 255, 2, 3, 4, 5, 6, 7, 9, 10, 11, 1, 12, 13};
static const unsigned char c_tbl[16] = {13, 3, 9, 14, 4, 0, 5, 255, 10, 6, 11, 1, 7, 12, 2, 8};
static const unsigned char n_tbl[16] = {10, 3, 255, 4, 8, 0, 5, 9, 6, 1, 7, 13, 11, 14, 2, 12};
static const unsigned char b_tbl[16] = {2, 3, 255, 255, 4, 255, 5, 6, 8, 7, 255, 1, 9, 255, 255, 0};
 30 | 
 31 | void rcnb_encode_32n_asm(const char *value_in, char *value_out, size_t n) {
 32 |     const int16x8_t mask = vdupq_n_s16(0x7fff);
 33 |     for (size_t i = 0; i < n; ++i) {
 34 |         int16x8_t sinput1 = (int16x8_t) vrev16q_s8(vld1q_s8((const signed char *) value_in));
 35 |         int16x8_t sinput2 = (int16x8_t) vrev16q_s8(vld1q_s8((const signed char *) (value_in + 16)));
 36 |         value_in += 32;
 37 |         uint16x8_t sign1 = (uint16x8_t) vshrq_n_s16(sinput1, 15);
 38 |         uint16x8_t sign2 = (uint16x8_t) vshrq_n_s16(sinput2, 15);
 39 |         uint16x8_t input1 = (uint16x8_t) vandq_s16(sinput1, mask);
 40 |         uint16x8_t input2 = (uint16x8_t) vandq_s16(sinput2, mask);
 41 | 
 42 |         uint32x4_t t1, t2;
 43 |         uint16x8_t idx_r1, idx_c1, idx_n1, idx_b1;
 44 |         uint16x8_t idx_r2, idx_c2, idx_n2, idx_b2;
 45 |         {
 46 |             t1 = vmull_n_u16(vget_low_u16(input1), 59653);
 47 |             t2 = vmull_high_n_u16(input1, 59653);
 48 |             idx_r1 = vuzp2q_u16((uint16x8_t) t1, (uint16x8_t) t2);
 49 |             idx_r1 = vshrq_n_u16(idx_r1, 11);
 50 | 
 51 |             uint16x8_t r_mul_2250 = vmulq_n_u16(idx_r1, 2250);
 52 |             uint16x8_t i_mod_2250 = vsubq_u16(input1, r_mul_2250);
 53 |             t1 = vmull_n_u16(vget_low_u16(i_mod_2250), 55925);
 54 |             t2 = vmull_high_n_u16(i_mod_2250, 55925);
 55 |             idx_c1 = vuzp2q_u16((uint16x8_t) t1, (uint16x8_t) t2);
 56 |             idx_c1 = vshrq_n_u16(idx_c1, 7);
 57 | 
 58 |             uint16x8_t c_mul_150 = vmlaq_n_u16(r_mul_2250, idx_c1, 150);
 59 |             uint16x8_t i_mod_150 = vsubq_u16(input1, c_mul_150);
 60 |             t1 = vmull_n_u16(vget_low_u16(i_mod_150), 52429);
 61 |             t2 = vmull_high_n_u16(i_mod_150, 52429);
 62 |             idx_n1 = vuzp2q_u16((uint16x8_t) t1, (uint16x8_t) t2);
 63 |             idx_n1 = vshrq_n_u16(idx_n1, 3);
 64 | 
 65 |             idx_b1 = vsubq_u16(input1, vmlaq_n_u16(c_mul_150, idx_n1, 10));
 66 |         }
 67 | 
 68 |         {
 69 |             t1 = vmull_n_u16(vget_low_u16(input2), 59653);
 70 |             t2 = vmull_high_n_u16(input2, 59653);
 71 |             idx_r2 = vuzp2q_u16((uint16x8_t) t1, (uint16x8_t) t2);
 72 |             idx_r2 = vshrq_n_u16(idx_r2, 11);
 73 | 
 74 |             uint16x8_t r_mul_2250 = vmulq_n_u16(idx_r2, 2250);
 75 |             uint16x8_t i_mod_2250 = vsubq_u16(input2, r_mul_2250);
 76 |             t1 = vmull_n_u16(vget_low_u16(i_mod_2250), 55925);
 77 |             t2 = vmull_high_n_u16(i_mod_2250, 55925);
 78 |             idx_c2 = vuzp2q_u16((uint16x8_t) t1, (uint16x8_t) t2);
 79 |             idx_c2 = vshrq_n_u16(idx_c2, 7);
 80 | 
 81 |             uint16x8_t c_mul_150 = vmlaq_n_u16(r_mul_2250, idx_c2, 150);
 82 |             uint16x8_t i_mod_150 = vsubq_u16(input2, c_mul_150);
 83 |             t1 = vmull_n_u16(vget_low_u16(i_mod_150), 52429);
 84 |             t2 = vmull_high_n_u16(i_mod_150, 52429);
 85 |             idx_n2 = vuzp2q_u16((uint16x8_t) t1, (uint16x8_t) t2);
 86 |             idx_n2 = vshrq_n_u16(idx_n2, 3);
 87 | 
 88 |             idx_b2 = vsubq_u16(input2, vmlaq_n_u16(c_mul_150, idx_n2, 10));
 89 |         }
 90 | 
 91 |         uint8x8_t idx_rt = vmovn_u16(idx_r1);
 92 |         uint8x16_t idx_r = vmovn_high_u16(idx_rt, idx_r2);
 93 |         uint8x8_t idx_ct = vmovn_u16(idx_c1);
 94 |         uint8x16_t idx_c = vmovn_high_u16(idx_ct, idx_c2);
 95 |         uint8x8_t idx_nt = vmovn_u16(idx_n1);
 96 |         uint8x16_t idx_n = vmovn_high_u16(idx_nt, idx_n2);
 97 |         uint8x8_t idx_bt = vmovn_u16(idx_b1);
 98 |         uint8x16_t idx_b = vmovn_high_u16(idx_bt, idx_b2);
 99 | 
100 |         uint8x16_t r_l = vqtbl1q_u8(vld1q_u8(r_lo), idx_r);
101 |         uint8x16_t r_h = vqtbl1q_u8(vld1q_u8(r_hi), idx_r);
102 |         uint8x16_t c_l = vqtbl1q_u8(vld1q_u8(c_lo), idx_c);
103 |         uint8x16_t c_h = vqtbl1q_u8(vld1q_u8(c_hi), idx_c);
104 |         uint8x16_t n_l = vqtbl1q_u8(vld1q_u8(n_lo), idx_n);
105 |         uint8x16_t n_h = vqtbl1q_u8(vld1q_u8(n_hi), idx_n);
106 |         uint8x16_t b_l = vqtbl1q_u8(vld1q_u8(b_lo), idx_b);
107 |         uint8x16_t b_h = vqtbl1q_u8(vld1q_u8(b_hi), idx_b);
108 | 
109 |         uint16x8_t r1t = (uint16x8_t) vzip1q_u8(r_l, r_h);
110 |         uint16x8_t r2t = (uint16x8_t) vzip2q_u8(r_l, r_h);
111 |         uint16x8_t c1t = (uint16x8_t) vzip1q_u8(c_l, c_h);
112 |         uint16x8_t c2t = (uint16x8_t) vzip2q_u8(c_l, c_h);
113 |         uint16x8_t n1t = (uint16x8_t) vzip1q_u8(n_l, n_h);
114 |         uint16x8_t n2t = (uint16x8_t) vzip2q_u8(n_l, n_h);
115 |         uint16x8_t b1t = (uint16x8_t) vzip1q_u8(b_l, b_h);
116 |         uint16x8_t b2t = (uint16x8_t) vzip2q_u8(b_l, b_h);
117 | 
118 |         if (sizeof(wchar_t) == 2) {
119 |             uint16x8x4_t rcnb1, rcnb2;
120 | 
121 |             rcnb1.val[0] = vbslq_u16(sign1, n1t, r1t);
122 |             rcnb1.val[1] = vbslq_u16(sign1, b1t, c1t);
123 |             rcnb1.val[2] = vbslq_u16(sign1, r1t, n1t);
124 |             rcnb1.val[3] = vbslq_u16(sign1, c1t, b1t);
125 |             rcnb2.val[0] = vbslq_u16(sign2, n2t, r2t);
126 |             rcnb2.val[1] = vbslq_u16(sign2, b2t, c2t);
127 |             rcnb2.val[2] = vbslq_u16(sign2, r2t, n2t);
128 |             rcnb2.val[3] = vbslq_u16(sign2, c2t, b2t);
129 | 
130 |             vst4q_u16((unsigned short *) value_out, rcnb1);
131 |             value_out += 64;
132 |             vst4q_u16((unsigned short *) value_out, rcnb2);
133 |             value_out += 64;
134 |         } else if (sizeof(wchar_t) == 4) {
135 |             uint16x8_t r1 = vbslq_u16(sign1, n1t, r1t);
136 |             uint16x8_t c1 = vbslq_u16(sign1, b1t, c1t);
137 |             uint16x8_t n1 = vbslq_u16(sign1, r1t, n1t);
138 |             uint16x8_t b1 = vbslq_u16(sign1, c1t, b1t);
139 |             uint16x8_t r2 = vbslq_u16(sign2, n2t, r2t);
140 |             uint16x8_t c2 = vbslq_u16(sign2, b2t, c2t);
141 |             uint16x8_t n2 = vbslq_u16(sign2, r2t, n2t);
142 |             uint16x8_t b2 = vbslq_u16(sign2, c2t, b2t);
143 | 
144 |             uint32x4x4_t rcnb;
145 | 
146 |             rcnb.val[0] = vmovl_u16(vget_low_u16(r1));
147 |             rcnb.val[1] = vmovl_u16(vget_low_u16(c1));
148 |             rcnb.val[2] = vmovl_u16(vget_low_u16(n1));
149 |             rcnb.val[3] = vmovl_u16(vget_low_u16(b1));
150 |             vst4q_u32((unsigned int *) value_out, rcnb);
151 |             value_out += 64;
152 | 
153 |             rcnb.val[0] = vmovl_u16(vget_high_u16(r1));
154 |             rcnb.val[1] = vmovl_u16(vget_high_u16(c1));
155 |             rcnb.val[2] = vmovl_u16(vget_high_u16(n1));
156 |             rcnb.val[3] = vmovl_u16(vget_high_u16(b1));
157 |             vst4q_u32((unsigned int *) value_out, rcnb);
158 |             value_out += 64;
159 | 
160 |             rcnb.val[0] = vmovl_u16(vget_low_u16(r2));
161 |             rcnb.val[1] = vmovl_u16(vget_low_u16(c2));
162 |             rcnb.val[2] = vmovl_u16(vget_low_u16(n2));
163 |             rcnb.val[3] = vmovl_u16(vget_low_u16(b2));
164 |             vst4q_u32((unsigned int *) value_out, rcnb);
165 |             value_out += 64;
166 | 
167 |             rcnb.val[0] = vmovl_u16(vget_high_u16(r2));
168 |             rcnb.val[1] = vmovl_u16(vget_high_u16(c2));
169 |             rcnb.val[2] = vmovl_u16(vget_high_u16(n2));
170 |             rcnb.val[3] = vmovl_u16(vget_high_u16(b2));
171 |             vst4q_u32((unsigned int *) value_out, rcnb);
172 |             value_out += 64;
173 |         }
174 |     }
175 | }
176 | 
/*
 * Loads eight consecutive four-character groups starting at `src` and
 * de-interleaves them: val[k] receives the k-th character of every group as
 * a 16-bit lane. With 4-byte wchar_t only the low 16 bits of each character
 * are kept.
 */
static inline uint16x8x4_t rcnb_neon_load_groups(const char *src) {
    uint16x8x4_t groups;
    if (sizeof(wchar_t) == 2) {
        groups = vld4q_u16((const unsigned short *) src);
    } else if (sizeof(wchar_t) == 4) {
        uint32x4x4_t front = vld4q_u32((const unsigned int *) src);
        uint32x4x4_t back = vld4q_u32((const unsigned int *) (src + 64));
        groups.val[0] = vcombine_u16(vmovn_u32(front.val[0]), vmovn_u32(back.val[0]));
        groups.val[1] = vcombine_u16(vmovn_u32(front.val[1]), vmovn_u32(back.val[1]));
        groups.val[2] = vcombine_u16(vmovn_u32(front.val[2]), vmovn_u32(back.val[2]));
        groups.val[3] = vcombine_u16(vmovn_u32(front.val[3]), vmovn_u32(back.val[3]));
    }
    return groups;
}

/*
 * NEON decoder: converts `n` blocks of 64 characters at `value_in` (each
 * character sizeof(wchar_t) bytes, 2 or 4) into `n` blocks of 32 bytes at
 * `value_out`.
 *
 * Returns 1 on success. Returns 0 as soon as a block contains a character
 * whose hash lands on an unused table slot; nothing is stored for that block.
 */
int rcnb_decode_32n_asm(const char *value_in, char *value_out, size_t n) {
    /* Bytes occupied by eight groups of four characters. */
    const size_t stride = 8 * 4 * sizeof(wchar_t);

    const uint8x16_t sign_lut = vld1q_u8(s_tbl);
    const uint8x16_t r_lut = vld1q_u8(r_tbl);
    const uint8x16_t c_lut = vld1q_u8(c_tbl);
    const uint8x16_t n_lut = vld1q_u8(n_tbl);
    const uint8x16_t b_lut = vld1q_u8(b_tbl);
    const uint8x16_t invalid = vdupq_n_u8(0xff);

    for (size_t blk = 0; blk < n; ++blk) {
        const uint16x8x4_t g1 = rcnb_neon_load_groups(value_in);
        const uint16x8x4_t g2 = rcnb_neon_load_groups(value_in + stride);
        value_in += 2 * stride;

        /* Hash each group's first character to 3 bits and look up whether
           the group is in swapped order (0xff) or natural order (0x00). */
        uint16x8_t swap1 = (uint16x8_t) vqtbl1q_u8(
                sign_lut, (uint8x16_t) vshrq_n_u16(vmulq_n_u16(g1.val[0], 2117), 13));
        uint16x8_t swap2 = (uint16x8_t) vqtbl1q_u8(
                sign_lut, (uint8x16_t) vshrq_n_u16(vmulq_n_u16(g2.val[0], 2117), 13));
        /* Copy the low byte into the high byte: full-lane 0xffff / 0x0000 mask. */
        swap1 = vsliq_n_u16(swap1, swap1, 8);
        swap2 = vsliq_n_u16(swap2, swap2, 8);

        /* Undo the swap so each vector holds characters of one alphabet. */
        uint16x8_t r1 = vbslq_u16(swap1, g1.val[2], g1.val[0]);
        uint16x8_t c1 = vbslq_u16(swap1, g1.val[3], g1.val[1]);
        uint16x8_t d1 = vbslq_u16(swap1, g1.val[0], g1.val[2]);
        uint16x8_t b1 = vbslq_u16(swap1, g1.val[1], g1.val[3]);
        uint16x8_t r2 = vbslq_u16(swap2, g2.val[2], g2.val[0]);
        uint16x8_t c2 = vbslq_u16(swap2, g2.val[3], g2.val[1]);
        uint16x8_t d2 = vbslq_u16(swap2, g2.val[0], g2.val[2]);
        uint16x8_t b2 = vbslq_u16(swap2, g2.val[1], g2.val[3]);

        /* Swapped groups contribute bit 15 of the decoded value. */
        const uint16x8_t top1 = vshlq_n_u16(swap1, 15);
        const uint16x8_t top2 = vshlq_n_u16(swap2, 15);

        /* Hash every character down to a 4-bit table index. */
        r1 = vshrq_n_u16(vmulq_n_u16(r1, 4675), 12);
        c1 = vshrq_n_u16(vmulq_n_u16(c1, 11482), 12);
        d1 = vshrq_n_u16(vmulq_n_u16(d1, 9726), 12);
        b1 = vsraq_n_u16(vsraq_n_u16(b1, b1, 1), b1, 3);

        r2 = vshrq_n_u16(vmulq_n_u16(r2, 4675), 12);
        c2 = vshrq_n_u16(vmulq_n_u16(c2, 11482), 12);
        d2 = vshrq_n_u16(vmulq_n_u16(d2, 9726), 12);
        b2 = vsraq_n_u16(vsraq_n_u16(b2, b2, 1), b2, 3);

        /* Map hashes to alphabet positions; the B hash keeps only its low
           four bits before the lookup. */
        const uint8x16_t r_v = vqtbl1q_u8(r_lut, vcombine_u8(vmovn_u16(r1), vmovn_u16(r2)));
        const uint8x16_t c_v = vqtbl1q_u8(c_lut, vcombine_u8(vmovn_u16(c1), vmovn_u16(c2)));
        const uint8x16_t n_v = vqtbl1q_u8(n_lut, vcombine_u8(vmovn_u16(d1), vmovn_u16(d2)));
        const uint8x16_t b_v = vqtbl1q_u8(
                b_lut, vbicq_u8(vcombine_u8(vmovn_u16(b1), vmovn_u16(b2)), vdupq_n_u8(0xf0)));

        /* Any 0xff lookup result marks an unused slot: reject the block. */
        const uint8x16_t bad_v = vorrq_u8(
                vorrq_u8(vceqq_u8(r_v, invalid), vceqq_u8(c_v, invalid)),
                vorrq_u8(vceqq_u8(n_v, invalid), vceqq_u8(b_v, invalid)));
        if (vmaxvq_u8(bad_v)) {
            return 0;
        }

        /* rn = n + r * 225 and cb = b + c * 150, so that
           cb + rn * 10 = r * 2250 + c * 150 + n * 10 + b. */
        uint16x8_t rn1 = vmovl_u8(vget_low_u8(n_v));
        uint16x8_t rn2 = vmovl_u8(vget_high_u8(n_v));
        rn1 = vmlal_u8(rn1, vget_low_u8(r_v), vdup_n_u8(225));
        rn2 = vmlal_u8(rn2, vget_high_u8(r_v), vdup_n_u8(225));

        uint16x8_t cb1 = vmovl_u8(vget_low_u8(b_v));
        uint16x8_t cb2 = vmovl_u8(vget_high_u8(b_v));
        cb1 = vmlal_u8(cb1, vget_low_u8(c_v), vdup_n_u8(150));
        cb2 = vmlal_u8(cb2, vget_high_u8(c_v), vdup_n_u8(150));

        uint16x8x2_t result;
        result.val[0] = vorrq_u16(vmlaq_n_u16(cb1, rn1, 10), top1);
        result.val[1] = vorrq_u16(vmlaq_n_u16(cb2, rn2, 10), top2);
        /* Swap the bytes of each 16-bit lane so the high byte is stored first. */
        result.val[0] = (uint16x8_t) vrev16q_u8((uint8x16_t) result.val[0]);
        result.val[1] = (uint16x8_t) vrev16q_u8((uint8x16_t) result.val[1]);

        vst1q_u16_x2((unsigned short *) value_out, result);
        value_out += 32;
    }

    return 1;
}
269 | 
270 | #endif
271 | 


--------------------------------------------------------------------------------
/src/rcnb_x86.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | rcnb_x86.c - x86 intrinsic source to an rcnb encoding and decoding algorithm
  3 | 
  4 | This is part of the librcnb project, and has been placed in the public domain.
  5 | For details, see https://github.com/rikakomoe/librcnb
  6 | */
  7 | 
  8 | #if defined(ENABLE_AVX2) || defined(ENABLE_SSSE3)
  9 | 
 10 | #include <immintrin.h>
 11 | #include <rcnb/cencode.h>
 12 | #include <rcnb/cdecode.h>
 13 | 
/* Two 16-byte lookup tables stored back to back. */
typedef struct concat_tbl {
    unsigned char first[16];
    unsigned char second[16];
} concat_tbl;

/* Byte-shuffle pattern that swaps the two bytes of every 16-bit lane. */
static const unsigned char swizzle[16] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};

/*
 * Encoder tables, indexed by a character's position in its alphabet.
 * rc_* hold the R alphabet in `first` and the C alphabet in `second`;
 * nb_* hold the N alphabet in `first` and the B alphabet in `second`.
 * *_lo is the low byte and *_hi the high byte of the character's code point.
 * Unused tail entries are zero.
 */
static const concat_tbl rc_lo = {
        {114, 82, 84, 85, 86, 87, 88, 89, 166, 16, 17, 18, 19, 76, 77},
        {99, 67, 6, 7, 8, 9, 10, 11, 12, 13, 135, 136, 199, 59, 60}
};
static const concat_tbl rc_hi = {
        {0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2},
        {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2}
};
static const concat_tbl nb_lo = {
        {110, 78, 67, 68, 69, 70, 71, 72, 157, 158, 209, 248, 249, 32, 53},
        {98, 66, 128, 129, 131, 132, 133, 223, 222, 254}
};
static const concat_tbl nb_hi = {
        {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 2},
        {0, 0, 1, 1, 1, 1, 1, 0, 0, 0}
};

/* NOTE(review): presumably the decoder's hash -> alphabet position tables
   for R (`first`) and C (`second`), with 255 marking an unused slot; the
   code that reads them is not in this part of the file - confirm. */
static const concat_tbl rc_tbl = {
        {14, 8, 0, 255, 2, 3, 4, 5, 6, 7, 9, 10, 11, 1, 12, 13},
        {13, 3, 9, 14, 4, 0, 5, 255, 10, 6, 11, 1, 7, 12, 2, 8}
};

/* NOTE(review): presumably the same kind of table for N (`first`) and
   B (`second`) - confirm against the decoder. */
static const concat_tbl nb_tbl = {
        {10, 3, 255, 4, 8, 0, 5, 9, 6, 1, 7, 13, 11, 14, 2, 12},
        {2, 3, 255, 255, 4, 255, 5, 6, 8, 7, 255, 1, 9, 255, 255, 0}
};

/* NOTE(review): s_tbl looks like the decoder's group-order lookup and mul_c
   like its per-byte multipliers (225 / 150 paired with 1); their users are
   not visible in this part of the file - confirm. */
static const unsigned char s_tbl[16] = {0, 0, 255, 255, 255, 0, 255, 0};
static const concat_tbl mul_c = {
        {225, 1, 225, 1, 225, 1, 225, 1, 225, 1, 225, 1, 225, 1, 225, 1},
        {150, 1, 150, 1, 150, 1, 150, 1, 150, 1, 150, 1, 150, 1, 150, 1}
};

/*
 * Shuffle patterns `permuted` and `shuffler`. Under Clang they are left
 * non-const and written by two otherwise unused, externally visible
 * functions, so the compiler cannot treat them as compile-time constants.
 * Other compilers get plain const tables.
 */
#ifdef __clang__
// Clang really don't like vpermd and attempts to replace it with 5+ ops.
// Mark this as potentially non-const to force Clang using vpermd.
static unsigned int permuted[8] = {0, 4, 1, 5, 2, 6, 3, 7};
static unsigned char shuffler[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
void unused_force_clang_use_vpermd() { permuted[0] = 0; }
void unused_force_clang_use_vpshufb() { shuffler[0] = 0; }
#else
static const unsigned int permuted[8] = {0, 4, 1, 5, 2, 6, 3, 7};
static const unsigned char shuffler[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
#endif
 65 | #endif
 66 | 
 67 | #ifdef ENABLE_SSSE3
 68 | 
 69 | #define mm_blendv_epi8(a, b, mask) _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a))
 70 | 
 71 | void rcnb_encode_32n_asm(const char *value_in, char *value_out, size_t n) {
 72 |     for (size_t i = 0; i < n; ++i) {
 73 |         __m128i input1 = _mm_loadu_si128((__m128i *) value_in);
 74 |         input1 = _mm_shuffle_epi8(input1, *(__m128i *) &swizzle);
 75 |         // 0xffff for neg, 0x0000 for pos
 76 |         __m128i sign1 = _mm_srai_epi16(input1, 15);
 77 |         input1 = _mm_and_si128(input1, _mm_set1_epi16(0x7fff));
 78 | 
 79 |         __m128i input2 = _mm_loadu_si128((__m128i *) (value_in + 16));
 80 |         input2 = _mm_shuffle_epi8(input2, *(__m128i *) &swizzle);
 81 |         __m128i sign2 = _mm_srai_epi16(input2, 15);
 82 |         input2 = _mm_and_si128(input2, _mm_set1_epi16(0x7fff));
 83 | 
 84 |         value_in += 32;
 85 | 
 86 |         __m128i idx_r1, idx_c1, idx_n1, idx_b1;
 87 |         __m128i idx_r2, idx_c2, idx_n2, idx_b2;
 88 |         {
 89 |             // i / 2250 = (i * 59653) >> (16 + 11)
 90 |             idx_r1 = _mm_srli_epi16(_mm_mulhi_epu16(input1, _mm_set1_epi16(-5883)), 11);
 91 | 
 92 |             __m128i r_mul_2250 = _mm_mullo_epi16(idx_r1, _mm_set1_epi16(2250));
 93 |             // i % 2250
 94 |             __m128i i_mod_2250 = _mm_sub_epi16(input1, r_mul_2250);
 95 |             // i / 150 = (i * 55925) >> (16 + 7)
 96 |             idx_c1 = _mm_srli_epi16(_mm_mulhi_epu16(i_mod_2250, _mm_set1_epi16(-9611)), 7);
 97 | 
 98 |             __m128i c_mul_150 = _mm_add_epi16(r_mul_2250, _mm_mullo_epi16(idx_c1, _mm_set1_epi16(150)));
 99 |             // i % 150
100 |             __m128i i_mod_150 = _mm_sub_epi16(input1, c_mul_150);
101 |             // i / 10 = (i * 52429) >> (16 + 3);
102 |             idx_n1 = _mm_srli_epi16(_mm_mulhi_epu16(i_mod_150, _mm_set1_epi16(-13107)), 3);
103 | 
104 |             __m128i n_mul_10 = _mm_add_epi16(c_mul_150, _mm_mullo_epi16(idx_n1, _mm_set1_epi16(10)));
105 |             // i % 10
106 |             idx_b1 = _mm_sub_epi16(input1, n_mul_10);
107 |         }
108 | 
109 |         {
110 |             idx_r2 = _mm_srli_epi16(_mm_mulhi_epu16(input2, _mm_set1_epi16(-5883)), 11);
111 |             __m128i r_mul_2250 = _mm_mullo_epi16(idx_r2, _mm_set1_epi16(2250));
112 |             __m128i i_mod_2250 = _mm_sub_epi16(input2, r_mul_2250);
113 |             idx_c2 = _mm_srli_epi16(_mm_mulhi_epu16(i_mod_2250, _mm_set1_epi16(-9611)), 7);
114 |             __m128i c_mul_150 = _mm_add_epi16(r_mul_2250, _mm_mullo_epi16(idx_c2, _mm_set1_epi16(150)));
115 |             __m128i i_mod_150 = _mm_sub_epi16(input2, c_mul_150);
116 |             idx_n2 = _mm_srli_epi16(_mm_mulhi_epu16(i_mod_150, _mm_set1_epi16(-13107)), 3);
117 |             __m128i n_mul_10 = _mm_add_epi16(c_mul_150, _mm_mullo_epi16(idx_n2, _mm_set1_epi16(10)));
118 |             idx_b2 = _mm_sub_epi16(input2, n_mul_10);
119 |         }
120 | 
121 |         __m128i idx_r = _mm_packus_epi16(idx_r1, idx_r2);
122 |         __m128i idx_c = _mm_packus_epi16(idx_c1, idx_c2);
123 |         __m128i idx_n = _mm_packus_epi16(idx_n1, idx_n2);
124 |         __m128i idx_b = _mm_packus_epi16(idx_b1, idx_b2);
125 | 
126 |         __m128i r_l = _mm_shuffle_epi8(*(__m128i *) &rc_lo.first, idx_r);
127 |         __m128i c_l = _mm_shuffle_epi8(*(__m128i *) &rc_lo.second, idx_c);
128 |         __m128i n_l = _mm_shuffle_epi8(*(__m128i *) &nb_lo.first, idx_n);
129 |         __m128i b_l = _mm_shuffle_epi8(*(__m128i *) &nb_lo.second, idx_b);
130 | 
131 |         __m128i r_h = _mm_shuffle_epi8(*(__m128i *) &rc_hi.first, idx_r);
132 |         __m128i c_h = _mm_shuffle_epi8(*(__m128i *) &rc_hi.second, idx_c);
133 |         __m128i n_h = _mm_shuffle_epi8(*(__m128i *) &nb_hi.first, idx_n);
134 |         __m128i b_h = _mm_shuffle_epi8(*(__m128i *) &nb_hi.second, idx_b);
135 | 
136 |         __m128i r1 = _mm_unpacklo_epi8(r_l, r_h);
137 |         __m128i r2 = _mm_unpackhi_epi8(r_l, r_h);
138 |         __m128i c1 = _mm_unpacklo_epi8(c_l, c_h);
139 |         __m128i c2 = _mm_unpackhi_epi8(c_l, c_h);
140 |         __m128i n1 = _mm_unpacklo_epi8(n_l, n_h);
141 |         __m128i n2 = _mm_unpackhi_epi8(n_l, n_h);
142 |         __m128i b1 = _mm_unpacklo_epi8(b_l, b_h);
143 |         __m128i b2 = _mm_unpackhi_epi8(b_l, b_h);
144 | 
145 |         __m128i rc1_t = _mm_unpacklo_epi16(r1, c1);
146 |         __m128i rc2_t = _mm_unpackhi_epi16(r1, c1);
147 |         __m128i rc3_t = _mm_unpacklo_epi16(r2, c2);
148 |         __m128i rc4_t = _mm_unpackhi_epi16(r2, c2);
149 |         __m128i nb1_t = _mm_unpacklo_epi16(n1, b1);
150 |         __m128i nb2_t = _mm_unpackhi_epi16(n1, b1);
151 |         __m128i nb3_t = _mm_unpacklo_epi16(n2, b2);
152 |         __m128i nb4_t = _mm_unpackhi_epi16(n2, b2);
153 | 
154 |         __m128i mask1 = _mm_unpacklo_epi16(sign1, sign1);
155 |         __m128i mask2 = _mm_unpackhi_epi16(sign1, sign1);
156 |         __m128i mask3 = _mm_unpacklo_epi16(sign2, sign2);
157 |         __m128i mask4 = _mm_unpackhi_epi16(sign2, sign2);
158 | 
159 |         __m128i rc1 = mm_blendv_epi8(rc1_t, nb1_t, mask1);
160 |         __m128i rc2 = mm_blendv_epi8(rc2_t, nb2_t, mask2);
161 |         __m128i rc3 = mm_blendv_epi8(rc3_t, nb3_t, mask3);
162 |         __m128i rc4 = mm_blendv_epi8(rc4_t, nb4_t, mask4);
163 |         __m128i nb1 = mm_blendv_epi8(nb1_t, rc1_t, mask1);
164 |         __m128i nb2 = mm_blendv_epi8(nb2_t, rc2_t, mask2);
165 |         __m128i nb3 = mm_blendv_epi8(nb3_t, rc3_t, mask3);
166 |         __m128i nb4 = mm_blendv_epi8(nb4_t, rc4_t, mask4);
167 | 
168 |         __m128i rcnb1 = _mm_unpacklo_epi32(rc1, nb1);
169 |         __m128i rcnb2 = _mm_unpackhi_epi32(rc1, nb1);
170 |         __m128i rcnb3 = _mm_unpacklo_epi32(rc2, nb2);
171 |         __m128i rcnb4 = _mm_unpackhi_epi32(rc2, nb2);
172 |         __m128i rcnb5 = _mm_unpacklo_epi32(rc3, nb3);
173 |         __m128i rcnb6 = _mm_unpackhi_epi32(rc3, nb3);
174 |         __m128i rcnb7 = _mm_unpacklo_epi32(rc4, nb4);
175 |         __m128i rcnb8 = _mm_unpackhi_epi32(rc4, nb4);
176 | 
177 |         if (sizeof(wchar_t) == 2) {
178 |             _mm_storeu_si128((__m128i *) (value_out), rcnb1);
179 |             value_out += 16;
180 |             _mm_storeu_si128((__m128i *) (value_out), rcnb2);
181 |             value_out += 16;
182 |             _mm_storeu_si128((__m128i *) (value_out), rcnb3);
183 |             value_out += 16;
184 |             _mm_storeu_si128((__m128i *) (value_out), rcnb4);
185 |             value_out += 16;
186 |             _mm_storeu_si128((__m128i *) (value_out), rcnb5);
187 |             value_out += 16;
188 |             _mm_storeu_si128((__m128i *) (value_out), rcnb6);
189 |             value_out += 16;
190 |             _mm_storeu_si128((__m128i *) (value_out), rcnb7);
191 |             value_out += 16;
192 |             _mm_storeu_si128((__m128i *) (value_out), rcnb8);
193 |             value_out += 16;
194 |         } else if (sizeof(wchar_t) == 4) {
195 |             _mm_storeu_si128((__m128i *) (value_out), _mm_unpacklo_epi16(rcnb1, _mm_setzero_si128()));
196 |             value_out += 16;
197 |             _mm_storeu_si128((__m128i *) (value_out), _mm_unpackhi_epi16(rcnb1, _mm_setzero_si128()));
198 |             value_out += 16;
199 |             _mm_storeu_si128((__m128i *) (value_out), _mm_unpacklo_epi16(rcnb2, _mm_setzero_si128()));
200 |             value_out += 16;
201 |             _mm_storeu_si128((__m128i *) (value_out), _mm_unpackhi_epi16(rcnb2, _mm_setzero_si128()));
202 |             value_out += 16;
203 |             _mm_storeu_si128((__m128i *) (value_out), _mm_unpacklo_epi16(rcnb3, _mm_setzero_si128()));
204 |             value_out += 16;
205 |             _mm_storeu_si128((__m128i *) (value_out), _mm_unpackhi_epi16(rcnb3, _mm_setzero_si128()));
206 |             value_out += 16;
207 |             _mm_storeu_si128((__m128i *) (value_out), _mm_unpacklo_epi16(rcnb4, _mm_setzero_si128()));
208 |             value_out += 16;
209 |             _mm_storeu_si128((__m128i *) (value_out), _mm_unpackhi_epi16(rcnb4, _mm_setzero_si128()));
210 |             value_out += 16;
211 |             _mm_storeu_si128((__m128i *) (value_out), _mm_unpacklo_epi16(rcnb5, _mm_setzero_si128()));
212 |             value_out += 16;
213 |             _mm_storeu_si128((__m128i *) (value_out), _mm_unpackhi_epi16(rcnb5, _mm_setzero_si128()));
214 |             value_out += 16;
215 |             _mm_storeu_si128((__m128i *) (value_out), _mm_unpacklo_epi16(rcnb6, _mm_setzero_si128()));
216 |             value_out += 16;
217 |             _mm_storeu_si128((__m128i *) (value_out), _mm_unpackhi_epi16(rcnb6, _mm_setzero_si128()));
218 |             value_out += 16;
219 |             _mm_storeu_si128((__m128i *) (value_out), _mm_unpacklo_epi16(rcnb7, _mm_setzero_si128()));
220 |             value_out += 16;
221 |             _mm_storeu_si128((__m128i *) (value_out), _mm_unpackhi_epi16(rcnb7, _mm_setzero_si128()));
222 |             value_out += 16;
223 |             _mm_storeu_si128((__m128i *) (value_out), _mm_unpacklo_epi16(rcnb8, _mm_setzero_si128()));
224 |             value_out += 16;
225 |             _mm_storeu_si128((__m128i *) (value_out), _mm_unpackhi_epi16(rcnb8, _mm_setzero_si128()));
226 |             value_out += 16;
227 |         }
228 |     }
229 | }
230 | 
231 | int rcnb_decode_32n_asm(const char *value_in, char *value_out, size_t n) {
232 |     __m128i rcnb1, rcnb2, rcnb3, rcnb4, rcnb5, rcnb6, rcnb7, rcnb8;
233 | 
234 |     __m128i s_t = *(__m128i *) &s_tbl;
235 | 
236 |     __m128i r_t = *(__m128i *) rc_tbl.first;
237 |     __m128i c_t = *(__m128i *) rc_tbl.second;
238 |     __m128i n_t = *(__m128i *) nb_tbl.first;
239 |     __m128i b_t = *(__m128i *) nb_tbl.second;
240 | 
241 |     __m128i mul_rc = *(__m128i *) mul_c.first;
242 |     __m128i mul_nb = *(__m128i *) mul_c.second;
243 | 
244 |     __m128i r_swizzle = *(__m128i *) &swizzle;
245 | 
246 |     for (size_t i = 0; i < n; ++i) {
247 |         if (sizeof(wchar_t) == 2) {
248 |             rcnb1 = _mm_loadu_si128((__m128i *) value_in);
249 |             rcnb2 = _mm_loadu_si128((__m128i *) (value_in + 16));
250 |             rcnb3 = _mm_loadu_si128((__m128i *) (value_in + 32));
251 |             rcnb4 = _mm_loadu_si128((__m128i *) (value_in + 48));
252 |             rcnb5 = _mm_loadu_si128((__m128i *) (value_in + 64));
253 |             rcnb6 = _mm_loadu_si128((__m128i *) (value_in + 80));
254 |             rcnb7 = _mm_loadu_si128((__m128i *) (value_in + 96));
255 |             rcnb8 = _mm_loadu_si128((__m128i *) (value_in + 112));
256 |             value_in += 128;
257 |         } else if (sizeof(wchar_t) == 4) {
258 |             rcnb1 = _mm_packs_epi32(_mm_loadu_si128((__m128i *) value_in),
259 |                                     _mm_loadu_si128((__m128i *) (value_in + 16)));
260 |             value_in += 32;
261 |             rcnb2 = _mm_packs_epi32(_mm_loadu_si128((__m128i *) value_in),
262 |                                     _mm_loadu_si128((__m128i *) (value_in + 16)));
263 |             value_in += 32;
264 |             rcnb3 = _mm_packs_epi32(_mm_loadu_si128((__m128i *) value_in),
265 |                                     _mm_loadu_si128((__m128i *) (value_in + 16)));
266 |             value_in += 32;
267 |             rcnb4 = _mm_packs_epi32(_mm_loadu_si128((__m128i *) value_in),
268 |                                     _mm_loadu_si128((__m128i *) (value_in + 16)));
269 |             value_in += 32;
270 |             rcnb5 = _mm_packs_epi32(_mm_loadu_si128((__m128i *) value_in),
271 |                                     _mm_loadu_si128((__m128i *) (value_in + 16)));
272 |             value_in += 32;
273 |             rcnb6 = _mm_packs_epi32(_mm_loadu_si128((__m128i *) value_in),
274 |                                     _mm_loadu_si128((__m128i *) (value_in + 16)));
275 |             value_in += 32;
276 |             rcnb7 = _mm_packs_epi32(_mm_loadu_si128((__m128i *) value_in),
277 |                                     _mm_loadu_si128((__m128i *) (value_in + 16)));
278 |             value_in += 32;
279 |             rcnb8 = _mm_packs_epi32(_mm_loadu_si128((__m128i *) value_in),
280 |                                     _mm_loadu_si128((__m128i *) (value_in + 16)));
281 |             value_in += 32;
282 |         }
283 | 
284 |         __m128i r_c1t, r_c2t, c_c1t, c_c2t, n_c1t, n_c2t, b_c1t, b_c2t;
285 | 
286 |         {
287 |             __m128i rcnb_04 = _mm_unpacklo_epi16(rcnb1, rcnb3);
288 |             __m128i rcnb_15 = _mm_unpackhi_epi16(rcnb1, rcnb3);
289 |             __m128i rcnb_26 = _mm_unpacklo_epi16(rcnb2, rcnb4);
290 |             __m128i rcnb_37 = _mm_unpackhi_epi16(rcnb2, rcnb4);
291 | 
292 |             __m128i rcnb_0246_1 = _mm_unpacklo_epi16(rcnb_04, rcnb_26);
293 |             __m128i rcnb_0246_2 = _mm_unpackhi_epi16(rcnb_04, rcnb_26);
294 |             __m128i rcnb_1357_1 = _mm_unpacklo_epi16(rcnb_15, rcnb_37);
295 |             __m128i rcnb_1357_2 = _mm_unpackhi_epi16(rcnb_15, rcnb_37);
296 | 
297 |             r_c1t = _mm_unpacklo_epi16(rcnb_0246_1, rcnb_1357_1);
298 |             c_c1t = _mm_unpackhi_epi16(rcnb_0246_1, rcnb_1357_1);
299 |             n_c1t = _mm_unpacklo_epi16(rcnb_0246_2, rcnb_1357_2);
300 |             b_c1t = _mm_unpackhi_epi16(rcnb_0246_2, rcnb_1357_2);
301 |         }
302 | 
303 |         {
304 |             __m128i rcnb_04 = _mm_unpacklo_epi16(rcnb5, rcnb7);
305 |             __m128i rcnb_15 = _mm_unpackhi_epi16(rcnb5, rcnb7);
306 |             __m128i rcnb_26 = _mm_unpacklo_epi16(rcnb6, rcnb8);
307 |             __m128i rcnb_37 = _mm_unpackhi_epi16(rcnb6, rcnb8);
308 | 
309 |             __m128i rcnb_0246_1 = _mm_unpacklo_epi16(rcnb_04, rcnb_26);
310 |             __m128i rcnb_0246_2 = _mm_unpackhi_epi16(rcnb_04, rcnb_26);
311 |             __m128i rcnb_1357_1 = _mm_unpacklo_epi16(rcnb_15, rcnb_37);
312 |             __m128i rcnb_1357_2 = _mm_unpackhi_epi16(rcnb_15, rcnb_37);
313 | 
314 |             r_c2t = _mm_unpacklo_epi16(rcnb_0246_1, rcnb_1357_1);
315 |             c_c2t = _mm_unpackhi_epi16(rcnb_0246_1, rcnb_1357_1);
316 |             n_c2t = _mm_unpacklo_epi16(rcnb_0246_2, rcnb_1357_2);
317 |             b_c2t = _mm_unpackhi_epi16(rcnb_0246_2, rcnb_1357_2);
318 |         }
319 | 
320 |         __m128i sign_idx1 = _mm_srli_epi16(_mm_mullo_epi16(r_c1t, _mm_set1_epi16(2117)), 13);
321 |         __m128i sign_idx2 = _mm_srli_epi16(_mm_mullo_epi16(r_c2t, _mm_set1_epi16(2117)), 13);
322 |         __m128i sign1 = _mm_shuffle_epi8(s_t, sign_idx1);
323 |         __m128i sign2 = _mm_shuffle_epi8(s_t, sign_idx2);
324 |         sign1 = _mm_or_si128(sign1, _mm_slli_epi16(sign1, 8));
325 |         sign2 = _mm_or_si128(sign2, _mm_slli_epi16(sign2, 8));
326 | 
327 |         __m128i r_c1 = mm_blendv_epi8(r_c1t, n_c1t, sign1);
328 |         __m128i c_c1 = mm_blendv_epi8(c_c1t, b_c1t, sign1);
329 |         __m128i n_c1 = mm_blendv_epi8(n_c1t, r_c1t, sign1);
330 |         __m128i b_c1 = mm_blendv_epi8(b_c1t, c_c1t, sign1);
331 |         __m128i r_c2 = mm_blendv_epi8(r_c2t, n_c2t, sign2);
332 |         __m128i c_c2 = mm_blendv_epi8(c_c2t, b_c2t, sign2);
333 |         __m128i n_c2 = mm_blendv_epi8(n_c2t, r_c2t, sign2);
334 |         __m128i b_c2 = mm_blendv_epi8(b_c2t, c_c2t, sign2);
335 | 
336 |         sign1 = _mm_slli_epi16(sign1, 15);
337 |         sign2 = _mm_slli_epi16(sign2, 15);
338 | 
339 |         __m128i r_i116 = _mm_srli_epi16(_mm_mullo_epi16(r_c1, _mm_set1_epi16(4675)), 12);
340 |         __m128i c_i116 = _mm_srli_epi16(_mm_mullo_epi16(c_c1, _mm_set1_epi16(11482)), 12);
341 |         __m128i n_i116 = _mm_srli_epi16(_mm_mullo_epi16(n_c1, _mm_set1_epi16(9726)), 12);
342 |         __m128i b_i116 = _mm_and_si128(_mm_add_epi16(b_c1,
343 |                                                      _mm_add_epi16(
344 |                                                              _mm_srli_epi16(b_c1, 1),
345 |                                                              _mm_srli_epi16(b_c1, 3))),
346 |                                        _mm_set1_epi16(15));
347 | 
348 |         __m128i r_i216 = _mm_srli_epi16(_mm_mullo_epi16(r_c2, _mm_set1_epi16(4675)), 12);
349 |         __m128i c_i216 = _mm_srli_epi16(_mm_mullo_epi16(c_c2, _mm_set1_epi16(11482)), 12);
350 |         __m128i n_i216 = _mm_srli_epi16(_mm_mullo_epi16(n_c2, _mm_set1_epi16(9726)), 12);
351 |         __m128i b_i216 = _mm_and_si128(_mm_add_epi16(b_c2,
352 |                                                      _mm_add_epi16(
353 |                                                              _mm_srli_epi16(b_c2, 1),
354 |                                                              _mm_srli_epi16(b_c2, 3))),
355 |                                        _mm_set1_epi16(15));
356 | 
357 |         __m128i r_i = _mm_packus_epi16(r_i116, r_i216);
358 |         __m128i c_i = _mm_packus_epi16(c_i116, c_i216);
359 |         __m128i n_i = _mm_packus_epi16(n_i116, n_i216);
360 |         __m128i b_i = _mm_packus_epi16(b_i116, b_i216);
361 | 
362 |         __m128i r_v = _mm_shuffle_epi8(r_t, r_i);
363 |         __m128i c_v = _mm_shuffle_epi8(c_t, c_i);
364 |         __m128i n_v = _mm_shuffle_epi8(n_t, n_i);
365 |         __m128i b_v = _mm_shuffle_epi8(b_t, b_i);
366 | 
367 |         __m128i bad_v = _mm_or_si128(
368 |                 _mm_or_si128(
369 |                         _mm_cmpeq_epi8(r_v, _mm_set1_epi8(-1)),
370 |                         _mm_cmpeq_epi8(c_v, _mm_set1_epi8(-1))
371 |                 ),
372 |                 _mm_or_si128(
373 |                         _mm_cmpeq_epi8(n_v, _mm_set1_epi8(-1)),
374 |                         _mm_cmpeq_epi8(b_v, _mm_set1_epi8(-1))
375 |                 )
376 |         );
377 | 
378 |         __m128i rn_1 = _mm_unpacklo_epi8(r_v, n_v);
379 |         __m128i rn_2 = _mm_unpackhi_epi8(r_v, n_v);
380 |         __m128i cb_1 = _mm_unpacklo_epi8(c_v, b_v);
381 |         __m128i cb_2 = _mm_unpackhi_epi8(c_v, b_v);
382 |         rn_1 = _mm_maddubs_epi16(mul_rc, rn_1);
383 |         rn_2 = _mm_maddubs_epi16(mul_rc, rn_2);
384 |         cb_1 = _mm_maddubs_epi16(mul_nb, cb_1);
385 |         cb_2 = _mm_maddubs_epi16(mul_nb, cb_2);
386 | 
387 |         if (_mm_movemask_epi8(bad_v)) {
388 |             return 0;
389 |         }
390 | 
391 |         __m128i result1 = _mm_add_epi16(_mm_add_epi16(_mm_slli_epi16(rn_1, 3), _mm_slli_epi16(rn_1, 1)), cb_1);
392 |         __m128i result2 = _mm_add_epi16(_mm_add_epi16(_mm_slli_epi16(rn_2, 3), _mm_slli_epi16(rn_2, 1)), cb_2);
393 | 
394 |         result1 = _mm_or_si128(result1, sign1);
395 |         result1 = _mm_shuffle_epi8(result1, r_swizzle);
396 |         result2 = _mm_or_si128(result2, sign2);
397 |         result2 = _mm_shuffle_epi8(result2, r_swizzle);
398 | 
399 |         _mm_storeu_si128((__m128i *) value_out, result1);
400 |         _mm_storeu_si128((__m128i *) (value_out + 16), result2);
401 |         value_out += 32;
402 |     }
403 |     return 1;
404 | }
405 | 
406 | #endif
407 | 
408 | #ifdef ENABLE_AVX2
409 | 
410 | void rcnb_encode_32n_asm(const char *value_in, char *value_out, size_t n) {
411 |     __m256i r_swizzle = _mm256_broadcastsi128_si256(*(__m128i *) &swizzle);
412 |     __m256i r_permute = *(__m256i *) &permuted;
413 |     __m256i r_shuffler = _mm256_broadcastsi128_si256(*(__m128i *) &shuffler);
414 |     for (size_t i = 0; i < n; ++i) {
415 |         __m256i input = _mm256_loadu_si256((__m256i *) value_in);
416 |         value_in += 32;
417 |         input = _mm256_shuffle_epi8(input, r_swizzle);
418 |         // 0xffff for neg, 0x0000 for pos
419 |         __m256i sign = _mm256_srai_epi16(input, 15);
420 |         input = _mm256_and_si256(input, _mm256_set1_epi16(0x7fff));
421 | 
422 |         __m256i idx_r = _mm256_srli_epi16(_mm256_mulhi_epu16(input, _mm256_set1_epi16(-5883)), 11);
423 |         __m256i r_mul_2250 = _mm256_mullo_epi16(idx_r, _mm256_set1_epi16(2250));
424 |         __m256i i_mod_2250 = _mm256_sub_epi16(input, r_mul_2250);
425 |         __m256i idx_c = _mm256_srli_epi16(_mm256_mulhi_epu16(i_mod_2250, _mm256_set1_epi16(-9611)), 7);
426 |         __m256i c_mul_150 = _mm256_add_epi16(r_mul_2250, _mm256_mullo_epi16(idx_c, _mm256_set1_epi16(150)));
427 |         __m256i i_mod_150 = _mm256_sub_epi16(input, c_mul_150);
428 |         __m256i idx_n = _mm256_srli_epi16(_mm256_mulhi_epu16(i_mod_150, _mm256_set1_epi16(-13107)), 3);
429 |         __m256i n_mul_10 = _mm256_add_epi16(c_mul_150, _mm256_mullo_epi16(idx_n, _mm256_set1_epi16(10)));
430 |         __m256i idx_b = _mm256_sub_epi16(input, n_mul_10);
431 | 
432 |         __m256i idx_rc = _mm256_packus_epi16(idx_r, idx_c);
433 |         __m256i idx_nb = _mm256_packus_epi16(idx_n, idx_b);
434 |         idx_rc = _mm256_permute4x64_epi64(idx_rc, 0xd8);
435 |         idx_nb = _mm256_permute4x64_epi64(idx_nb, 0xd8);
436 | 
437 |         __m256i rc_l = _mm256_shuffle_epi8(*(__m256i *) &rc_lo, idx_rc);
438 |         __m256i rc_h = _mm256_shuffle_epi8(*(__m256i *) &rc_hi, idx_rc);
439 |         __m256i nb_l = _mm256_shuffle_epi8(*(__m256i *) &nb_lo, idx_nb);
440 |         __m256i nb_h = _mm256_shuffle_epi8(*(__m256i *) &nb_hi, idx_nb);
441 | 
442 |         __m256i r1c1_t = _mm256_unpacklo_epi8(rc_l, rc_h);
443 |         __m256i r2c2_t = _mm256_unpackhi_epi8(rc_l, rc_h);
444 |         __m256i n1b1_t = _mm256_unpacklo_epi8(nb_l, nb_h);
445 |         __m256i n2b2_t = _mm256_unpackhi_epi8(nb_l, nb_h);
446 | 
447 |         __m256i sign1 = _mm256_permute4x64_epi64(sign, 0b01000100);
448 |         __m256i sign2 = _mm256_permute4x64_epi64(sign, 0b11101110);
449 | 
450 |         __m256i r1c1 = _mm256_blendv_epi8(r1c1_t, n1b1_t, sign1);
451 |         __m256i r2c2 = _mm256_blendv_epi8(r2c2_t, n2b2_t, sign2);
452 |         __m256i n1b1 = _mm256_blendv_epi8(n1b1_t, r1c1_t, sign1);
453 |         __m256i n2b2 = _mm256_blendv_epi8(n2b2_t, r2c2_t, sign2);
454 | 
455 |         __m256i rn1cb1 = _mm256_unpacklo_epi16(r1c1, n1b1);
456 |         __m256i rn2cb2 = _mm256_unpackhi_epi16(r1c1, n1b1);
457 |         __m256i rn3cb3 = _mm256_unpacklo_epi16(r2c2, n2b2);
458 |         __m256i rn4cb4 = _mm256_unpackhi_epi16(r2c2, n2b2);
459 | 
460 |         __m256i rncb1 = _mm256_permutevar8x32_epi32(rn1cb1, r_permute);
461 |         __m256i rncb2 = _mm256_permutevar8x32_epi32(rn2cb2, r_permute);
462 |         __m256i rncb3 = _mm256_permutevar8x32_epi32(rn3cb3, r_permute);
463 |         __m256i rncb4 = _mm256_permutevar8x32_epi32(rn4cb4, r_permute);
464 | 
465 |         __m256i rcnb1 = _mm256_shuffle_epi8(rncb1, r_shuffler);
466 |         __m256i rcnb2 = _mm256_shuffle_epi8(rncb2, r_shuffler);
467 |         __m256i rcnb3 = _mm256_shuffle_epi8(rncb3, r_shuffler);
468 |         __m256i rcnb4 = _mm256_shuffle_epi8(rncb4, r_shuffler);
469 | 
470 |         if (sizeof(wchar_t) == 2) {
471 |             _mm256_storeu_si256((__m256i *) (value_out), rcnb1);
472 |             value_out += 32;
473 |             _mm256_storeu_si256((__m256i *) (value_out), rcnb2);
474 |             value_out += 32;
475 |             _mm256_storeu_si256((__m256i *) (value_out), rcnb3);
476 |             value_out += 32;
477 |             _mm256_storeu_si256((__m256i *) (value_out), rcnb4);
478 |             value_out += 32;
479 |         } else if (sizeof(wchar_t) == 4) {
480 |             _mm256_storeu_si256((__m256i *) (value_out), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(rcnb1, 0)));
481 |             value_out += 32;
482 |             _mm256_storeu_si256((__m256i *) (value_out), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(rcnb1, 1)));
483 |             value_out += 32;
484 |             _mm256_storeu_si256((__m256i *) (value_out), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(rcnb2, 0)));
485 |             value_out += 32;
486 |             _mm256_storeu_si256((__m256i *) (value_out), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(rcnb2, 1)));
487 |             value_out += 32;
488 |             _mm256_storeu_si256((__m256i *) (value_out), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(rcnb3, 0)));
489 |             value_out += 32;
490 |             _mm256_storeu_si256((__m256i *) (value_out), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(rcnb3, 1)));
491 |             value_out += 32;
492 |             _mm256_storeu_si256((__m256i *) (value_out), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(rcnb4, 0)));
493 |             value_out += 32;
494 |             _mm256_storeu_si256((__m256i *) (value_out), _mm256_cvtepi16_epi32(_mm256_extracti128_si256(rcnb4, 1)));
495 |             value_out += 32;
496 |         }
497 |     }
498 | }
499 | 
500 | int rcnb_decode_32n_asm(const char *value_in, char *value_out, size_t n) {
501 |     __m256i rcnb1, rcnb2, rcnb3, rcnb4;
502 | 
503 |     __m256i rc_t = *(__m256i *) &rc_tbl;
504 |     __m256i nb_t = *(__m256i *) &nb_tbl;
505 | 
506 |     __m256i s_t = _mm256_broadcastsi128_si256(*(__m128i *) &s_tbl);
507 |     __m256i mul = *(__m256i *) &mul_c;
508 |     __m256i r_swizzle = _mm256_broadcastsi128_si256(*(__m128i *) &swizzle);
509 | 
510 |     for (size_t i = 0; i < n; ++i) {
511 |         if (sizeof(wchar_t) == 2) {
512 |             rcnb1 = _mm256_loadu_si256((__m256i*) value_in);
513 |             rcnb2 = _mm256_loadu_si256((__m256i*) (value_in + 32));
514 |             rcnb3 = _mm256_loadu_si256((__m256i*) (value_in + 64));
515 |             rcnb4 = _mm256_loadu_si256((__m256i*) (value_in + 96));
516 |             value_in += 128;
517 |         } else if (sizeof(wchar_t) == 4) {
518 |             __m256i tmp1, tmp2;
519 |             tmp1 = _mm256_loadu_si256((__m256i*) value_in);
520 |             tmp2 = _mm256_loadu_si256((__m256i*) (value_in + 32));
521 |             value_in += 64;
522 |             rcnb1 = _mm256_permute4x64_epi64(_mm256_packus_epi32(tmp1, tmp2), 0xd8);
523 | 
524 |             tmp1 = _mm256_loadu_si256((__m256i*) value_in);
525 |             tmp2 = _mm256_loadu_si256((__m256i*) (value_in + 32));
526 |             value_in += 64;
527 |             rcnb2 = _mm256_permute4x64_epi64(_mm256_packus_epi32(tmp1, tmp2), 0xd8);
528 | 
529 |             tmp1 = _mm256_loadu_si256((__m256i*) value_in);
530 |             tmp2 = _mm256_loadu_si256((__m256i*) (value_in + 32));
531 |             value_in += 64;
532 |             rcnb3 = _mm256_permute4x64_epi64(_mm256_packus_epi32(tmp1, tmp2), 0xd8);
533 | 
534 |             tmp1 = _mm256_loadu_si256((__m256i*) value_in);
535 |             tmp2 = _mm256_loadu_si256((__m256i*) (value_in + 32));
536 |             value_in += 64;
537 |             rcnb4 = _mm256_permute4x64_epi64(_mm256_packus_epi32(tmp1, tmp2), 0xd8);
538 |         }
539 | 
540 |         __m256i rcnb_0_1_8_9 = _mm256_permute2x128_si256(rcnb1, rcnb3, 0x20);
541 |         __m256i rcnb_2_3_a_b = _mm256_permute2x128_si256(rcnb1, rcnb3, 0x31);
542 |         __m256i rcnb_4_5_c_d = _mm256_permute2x128_si256(rcnb2, rcnb4, 0x20);
543 |         __m256i rcnb_6_7_e_f = _mm256_permute2x128_si256(rcnb2, rcnb4, 0x31);
544 | 
545 |         __m256i rcnb_02_8a = _mm256_unpacklo_epi16(rcnb_0_1_8_9, rcnb_2_3_a_b);
546 |         __m256i rcnb_13_9b = _mm256_unpackhi_epi16(rcnb_0_1_8_9, rcnb_2_3_a_b);
547 |         __m256i rcnb_46_ce = _mm256_unpacklo_epi16(rcnb_4_5_c_d, rcnb_6_7_e_f);
548 |         __m256i rcnb_57_df = _mm256_unpackhi_epi16(rcnb_4_5_c_d, rcnb_6_7_e_f);
549 | 
550 |         __m256i rcnb_0123_89ab_rc = _mm256_unpacklo_epi16(rcnb_02_8a, rcnb_13_9b);
551 |         __m256i rcnb_0123_89ab_nb = _mm256_unpackhi_epi16(rcnb_02_8a, rcnb_13_9b);
552 |         __m256i rcnb_4567_cdef_rc = _mm256_unpacklo_epi16(rcnb_46_ce, rcnb_57_df);
553 |         __m256i rcnb_4567_cdef_nb = _mm256_unpackhi_epi16(rcnb_46_ce, rcnb_57_df);
554 | 
555 |         __m256i r_ct = _mm256_unpacklo_epi64(rcnb_0123_89ab_rc, rcnb_4567_cdef_rc);
556 |         __m256i c_ct = _mm256_unpackhi_epi64(rcnb_0123_89ab_rc, rcnb_4567_cdef_rc);
557 |         __m256i n_ct = _mm256_unpacklo_epi64(rcnb_0123_89ab_nb, rcnb_4567_cdef_nb);
558 |         __m256i b_ct = _mm256_unpackhi_epi64(rcnb_0123_89ab_nb, rcnb_4567_cdef_nb);
559 | 
560 |         __m256i sign_idx = _mm256_srli_epi16(_mm256_mullo_epi16(r_ct, _mm256_set1_epi16(2117)), 13);
561 |         __m256i sign = _mm256_shuffle_epi8(s_t, sign_idx);
562 |         sign = _mm256_or_si256(sign, _mm256_slli_epi16(sign, 8));
563 | 
564 |         __m256i r_c = _mm256_blendv_epi8(r_ct, n_ct, sign);
565 |         __m256i c_c = _mm256_blendv_epi8(c_ct, b_ct, sign);
566 |         __m256i n_c = _mm256_blendv_epi8(n_ct, r_ct, sign);
567 |         __m256i b_c = _mm256_blendv_epi8(b_ct, c_ct, sign);
568 | 
569 |         sign = _mm256_slli_epi16(sign, 15);
570 | 
571 |         __m256i r_i16 = _mm256_srli_epi16(_mm256_mullo_epi16(r_c, _mm256_set1_epi16(4675)), 12);
572 |         __m256i c_i16 = _mm256_srli_epi16(_mm256_mullo_epi16(c_c, _mm256_set1_epi16(11482)), 12);
573 |         __m256i n_i16 = _mm256_srli_epi16(_mm256_mullo_epi16(n_c, _mm256_set1_epi16(9726)), 12);
574 |         __m256i b_i16 = _mm256_and_si256(_mm256_add_epi16(b_c,
575 |                                                           _mm256_add_epi16(
576 |                                                      _mm256_srli_epi16(b_c, 1),
577 |                                                      _mm256_srli_epi16(b_c, 3))),
578 |                                          _mm256_set1_epi16(15));
579 | 
580 |         __m256i rc_i = _mm256_permute4x64_epi64(_mm256_packus_epi16(r_i16, c_i16), 0xd8);
581 |         __m256i nb_i = _mm256_permute4x64_epi64(_mm256_packus_epi16(n_i16, b_i16), 0xd8);
582 | 
583 |         __m256i rc_v = _mm256_shuffle_epi8(rc_t, rc_i);
584 |         __m256i nb_v = _mm256_shuffle_epi8(nb_t, nb_i);
585 | 
586 |         __m256i bad_v = _mm256_or_si256(
587 |                 _mm256_cmpeq_epi8(rc_v, _mm256_set1_epi8(-1)),
588 |                 _mm256_cmpeq_epi8(nb_v, _mm256_set1_epi8(-1))
589 |                 );
590 | 
591 |         __m256i rn_cb_1 = _mm256_unpacklo_epi8(rc_v, nb_v);
592 |         __m256i rn_cb_2 = _mm256_unpackhi_epi8(rc_v, nb_v);
593 |         rn_cb_1 = _mm256_maddubs_epi16(mul, rn_cb_1);
594 |         rn_cb_2 = _mm256_maddubs_epi16(mul, rn_cb_2);
595 | 
596 |         if (_mm256_movemask_epi8(bad_v)) {
597 |             return 0;
598 |         }
599 | 
600 |         __m256i rn = _mm256_permute2x128_si256(rn_cb_1, rn_cb_2, 0x20);
601 |         __m256i cb = _mm256_permute2x128_si256(rn_cb_1, rn_cb_2, 0x31);
602 |         __m256i result = _mm256_add_epi16(_mm256_add_epi16(_mm256_slli_epi16(rn, 3), _mm256_slli_epi16(rn, 1)), cb);
603 |         result = _mm256_or_si256(result, sign);
604 |         result = _mm256_shuffle_epi8(result, r_swizzle);
605 | 
606 |         _mm256_storeu_si256((__m256i*)value_out, result);
607 |         value_out += 32;
608 |     }
609 |     return 1;
610 | }
611 | #endif
612 | 


--------------------------------------------------------------------------------