├── .gitattributes
├── .github
    └── workflows
    │   └── push.yml
├── .gitignore
├── .vscode
    ├── launch.json
    └── tasks.json
├── CMakeLists.txt
├── README.md
├── dev
    ├── .clang-format
    ├── CMakeLists.txt
    ├── credits.md
    ├── jebpfuzz.c
    ├── jebptest.c
    ├── lossless
    │   ├── compass.webp
    │   ├── dice.webp
    │   ├── rose.webp
    │   └── tux.webp
    └── toolchain
    │   ├── aarch64-linux-gnu.cmake
    │   ├── arm-linux-gnueabi.cmake
    │   ├── arm-linux-gnueabihf.cmake
    │   ├── i686-linux-gnu.cmake
    │   └── x86_64-w64-mingw32.cmake
├── jebp.c
├── jebp.h
└── jebpview.c


/.gitattributes:
--------------------------------------------------------------------------------
1 | *.webp filter=lfs diff=lfs merge=lfs -text
2 | 


--------------------------------------------------------------------------------
/.github/workflows/push.yml:
--------------------------------------------------------------------------------
 1 | name: On Push
 2 | on: push
 3 | 
 4 | jobs:
 5 |   test:  # builds the project and runs ctest once per matrix entry below
 6 |     strategy:
 7 |       fail-fast: false  # let the other platforms finish when one fails
 8 |       matrix:
 9 |         include:
10 |           - name: Ubuntu GCC x86-64
11 |             runs-on: ubuntu-latest
12 |             cmake-flags: -DJEBP_SANITIZE=ON
13 |           - name: Ubuntu Clang x86-64
14 |             runs-on: ubuntu-latest
15 |             cc: clang
16 |             cmake-flags: -DJEBP_SANITIZE=ON
17 |           - name: Ubuntu MinGW x86-64
18 |             runs-on: ubuntu-latest
19 |             packages: gcc-mingw-w64-x86-64 wine-stable
20 |             cmake-flags: --toolchain dev/toolchain/x86_64-w64-mingw32.cmake
21 |           - name: Ubuntu GCC x86-32
22 |             runs-on: ubuntu-latest
23 |             packages: gcc-i686-linux-gnu libc-dev-i386-cross qemu-user
24 |             cmake-flags: --toolchain dev/toolchain/i686-linux-gnu.cmake
25 |           - name: Ubuntu GCC AArch64
26 |             runs-on: ubuntu-latest
27 |             packages: gcc-aarch64-linux-gnu libc-dev-arm64-cross qemu-user
28 |             cmake-flags: --toolchain dev/toolchain/aarch64-linux-gnu.cmake
29 |           - name: Ubuntu GCC AArch32
30 |             runs-on: ubuntu-latest
31 |             packages: gcc-arm-linux-gnueabi libc-dev-armel-cross qemu-user
32 |             cmake-flags: --toolchain dev/toolchain/arm-linux-gnueabi.cmake
33 |           - name: Ubuntu GCC AArch32 hard-float
34 |             runs-on: ubuntu-latest
35 |             packages: gcc-arm-linux-gnueabihf libc-dev-armhf-cross qemu-user
36 |             cmake-flags: --toolchain dev/toolchain/arm-linux-gnueabihf.cmake
37 |           - name: Windows MSVC x86-64
38 |             runs-on: windows-latest
39 |             cmake-flags: -DJEBP_SANITIZE=ON
40 |           - name: MacOS Clang x86-64
41 |             runs-on: macos-latest
42 |             cmake-flags: -DJEBP_SANITIZE=ON
43 |     name: Tests (${{ matrix.name }})
44 |     runs-on: ${{ matrix.runs-on }}
45 |     steps:
46 |       - name: Checkout repository
47 |         uses: actions/checkout@v4
48 |         with:
49 |           lfs: true  # the *.webp test images are tracked with Git LFS
50 |       - if: matrix.packages != ''  # only cross-compile entries set packages
51 |         name: Install packages
52 |         run: |
53 |           sudo apt-get update
54 |           sudo apt-get install -y ${{ matrix.packages }}
55 |       - name: Configure project
56 |         run: cmake -B bin -DCMAKE_BUILD_TYPE=Release ${{ matrix.cmake-flags }}
57 |         env:
58 |           CC: ${{ matrix.cc }}  # empty unless the matrix entry sets cc
59 |           CFLAGS: ${{ matrix.cflags }}  # no matrix entry sets cflags at present
60 |       - name: Compile project
61 |         run: cmake --build bin --config Release
62 |       - name: Run tests
63 |         run: ctest --test-dir bin --build-config Release --verbose
64 | 
65 |   format:  # fails when the format target would change any tracked file
66 |     name: Format
67 |     runs-on: ubuntu-latest
68 |     steps:
69 |       - name: Checkout repository
70 |         uses: actions/checkout@v4
71 |       - name: Configure project
72 |         run: cmake -B bin
73 |       - name: Format code
74 |         run: cmake --build bin --target format
75 |       - name: Check for differences
76 |         run: git diff --exit-code --color  # non-zero exit if formatting changed files
77 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Build files
 2 | /bin/
 3 | /build/
 4 | 
 5 | # OS files
 6 | .DS_Store
 7 | 
 8 | # Fuzzing
 9 | *.log
10 | crash-*
11 | leak-*
12 | timeout-*
13 | 


--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "version": "0.2.0",
 3 |     "inputs": [
 4 |         {
 5 |             "description": "WebP input image",
 6 |             "id": "image",
 7 |             "type": "pickString",
 8 |             "options": [
 9 |                 "lossless/rose.webp",
10 |                 "lossless/tux.webp",
11 |                 "lossless/dice.webp",
12 |                 "lossless/compass.webp"
13 |             ]
14 |         }
15 |     ],
16 |     "configurations": [
17 |         {
18 |             "name": "Debug jebpview",
19 |             "type": "cppdbg",
20 |             "request": "launch",
21 |             "MIMode": "gdb",
22 |             "cwd": "${workspaceFolder}",
23 |             "program": "${workspaceFolder}/bin/jebpview",
24 |             "args": ["dev/${input:image}"],
25 |             "preLaunchTask": "Build",
26 |             "osx": {
27 |                 "MIMode": "lldb"
28 |             }
29 |         },
30 |         {
31 |             "name": "Debug jebptest (lossless rose)",
32 |             "type": "cppdbg",
33 |             "request": "launch",
34 |             "MIMode": "gdb",
35 |             "cwd": "${workspaceFolder}",
36 |             "program": "${workspaceFolder}/bin/dev/jebptest",
37 |             "args": ["dev/lossless/rose.webp", "512x384", "943882e6"],
38 |             "preLaunchTask": "Build",
39 |             "osx": {
40 |                 "MIMode": "lldb"
41 |             }
42 |         }
43 |     ]
44 | }
45 | 


--------------------------------------------------------------------------------
/.vscode/tasks.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "version": "2.0.0",
 3 |     "tasks": [
 4 |         {
 5 |             "label": "Build",
 6 |             "type": "shell",
 7 |             "group": "build",
 8 |             "command": "make -C bin"
 9 |         }
10 |     ]
11 | }
12 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.10)
 2 | project(jebp C)
 3 | include(CTest)
 4 | include(CheckCCompilerFlag)
 5 | 
 6 | # Options
 7 | if(CMAKE_BUILD_TYPE STREQUAL Debug)
 8 |     set(jebp_debug ON)
 9 | endif()
10 | if(CMAKE_BUILD_TYPE MATCHES Rel)
11 |     set(jebp_release ON)
12 | endif()
13 | option(JEBP_WERROR "Treat warnings as errors" ${jebp_release})
14 | option(JEBP_SANITIZE "Enable sanitizers" ${jebp_debug})
15 | option(JEBP_STDIO "Enable I/O support" ON)
16 | option(JEBP_SIMD "Enable SIMD optimizations" ON)
17 | option(JEBP_VP8L "Enable VP8L support" ON)
18 | 
19 | # Utility functions
20 | function(add_c_compiler_flag flag var) # add `flag` to the compile options if the C compiler accepts it
21 |     check_c_compiler_flag(${flag} ${var}) # the check result is stored in the variable named by `var`
22 |     if(${var})
23 |         add_compile_options(${flag})
24 |     endif()
25 | endfunction()
26 | 
27 | function(add_c_flag flag var) # like add_c_compiler_flag, but the flag is also recorded for linking
28 |     set(CMAKE_REQUIRED_LIBRARIES ${flag}) # hand the flag to the link step of the check as well
29 |     check_c_compiler_flag(${flag} ${var})
30 |     unset(CMAKE_REQUIRED_LIBRARIES)
31 |     if(${var})
32 |         add_compile_options(${flag})
33 |         set(link_options ${link_options} ${flag} PARENT_SCOPE) # appended in the caller's scope; targets pass ${link_options} to target_link_libraries
34 |     endif()
35 | endfunction()
36 | 
37 | # Compiler flags
38 | set(CMAKE_C_STANDARD_REQUIRED ON)
39 | if(MSVC)
40 |     set(CMAKE_C_STANDARD 11)
41 |     add_c_compiler_flag(-W3 HAVE_W3)
42 |     add_c_compiler_flag(-external:W0 HAVE_EXTERNAL)
43 |     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
44 |     if(JEBP_WERROR)
45 |         add_c_compiler_flag(-WX HAVE_WX)
46 |     endif()
47 | else()
48 |     set(CMAKE_C_STANDARD 99)
49 |     add_c_compiler_flag(-Wall HAVE_WALL)
50 |     add_c_compiler_flag(-Wextra HAVE_WEXTRA)
51 |     add_c_compiler_flag(-Wpedantic HAVE_WPEDANTIC)
52 |     if(JEBP_WERROR)
53 |         add_c_compiler_flag(-Werror HAVE_WERROR)
54 |     endif()
55 | endif()
56 | if(JEBP_SANITIZE)
57 |     add_c_flag(-fsanitize=address HAVE_FSANITIZE_ADDRESS)
58 |     if(HAVE_FSANITIZE_ADDRESS)
59 |         add_c_flag(-fsanitize-address-use-after-return=always HAVE_FSANITIZE_USE_AFTER_RETURN)
60 |         add_c_flag(-fsanitize-address-use-after-scope HAVE_FSANITIZE_USE_AFTER_SCOPE)
61 |     endif()
62 |     add_c_flag(-fsanitize=leak HAVE_FSANITIZE_LEAK)
63 |     add_c_flag(-fsanitize=undefined HAVE_FSANITIZE_UNDEFINED)
64 | endif()
65 | if(NOT JEBP_STDIO)
66 |     add_compile_definitions(JEBP_NO_STDIO)
67 | endif()
68 | if(NOT JEBP_SIMD)
69 |     add_compile_definitions(JEBP_NO_SIMD)
70 | endif()
71 | if(NOT JEBP_VP8L)
72 |     add_compile_definitions(JEBP_NO_VP8L)
73 | endif()
74 | 
75 | # Demo program
76 | find_package(SDL2)
77 | if(SDL2_FOUND)
78 |     add_executable(jebpview WIN32 jebpview.c)
79 |     target_link_libraries(jebpview PRIVATE ${link_options} SDL2::SDL2 SDL2::SDL2main)
80 | endif()
81 | 
82 | add_subdirectory(dev)
83 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # JebP - Single header WebP decoder
 2 | Don't let all the [files](#what-are-all-these-files) scare you away, in fact only
 3 | [one header file](/jebp.h) is required to use this project.
 4 | 
 5 | Currently this project only supports lossless (VP8L) images. This project:
 6 | - Does not support decoding lossy files with VP8
 7 |   ([Draft PR](https://github.com/matanui159/jebp/pull/2)).
 8 | - Does not support extended file-formats with VP8X.
 9 | - Does not support VP8L lossless images with the color-indexing transform
10 |   (paletted images).
11 | - Does not support VP8L images with more than 256 huffman groups. This is an
12 |   arbitrary limit to prevent bad images from using too much memory. In theory,
13 |   images requiring more groups should be very rare. This limit may be increased
14 |   in the future.
15 | 
16 | This project is still very new so there are very likely bugs in it. Be kind and
17 | report [any issues](https://github.com/matanui159/jebp/issues) you find.
18 | 
19 | For usage and documentation, see the [header file](/jebp.h#L132).
20 | 
21 | ## Why?
22 | I genuinely think WebP is a really cool format. Both PNG and JPEG (that are
23 | still in wide use) still use ancient technologies. Such technologies have
24 | improved a lot since they were released, and while there have been many
25 | attempts to invent the "new PNG" or the "new JPEG" most of them fall flat. WebP
26 | seems to be the only one that has made a name for itself and has gotten a
27 | following, especially in the web. But, despite the name, I do believe WebP could
28 | be used in so many more scenarios, not just the web.
29 | 
30 | However, not everyone might want to import the entire `libwebp` library,
31 | especially if they're used to something like
32 | [`stb_image.h`](https://github.com/nothings/stb/blob/master/stb_image.h) (which
33 | doesn't support WebP). And no one else seemed to be doing it, which is
34 | understandable being how large the WebP specification is using two different
35 | codecs for lossless and lossy. So I decided to try and put one together myself.
36 | It may never get used but it was fun making anyway, and I learnt a lot along the
37 | way :)
38 | 
39 | ## What are all these files?
40 | Do not worry, these are all just files to help with development, none (except
41 | `jebp.h`) are needed to use the library.
42 | - `jebp.h` is the single-header library you're probably looking for.
43 | - `jebp.c` is a "source file" you can use. All it does is define
44 |   `JEBP_IMPLEMENTATION` and then include `jebp.h`.
45 | - `jebpview.c` is a demo program using SDL2 to show a WebP image. Use it with
46 |   `jebpview <WebP image>`.
47 | - `CMakeLists.txt` and `dev` are build and development files to help with
48 |   development and CI. They build `jebpview`, E2E tests and the fuzz test, along
49 |   with making it easier to run tests and format code. The cross files are
50 |   designed to run in the CI.
51 | - `.gitignore`, `.gitattributes` and `.github` are your standard git and GitHub
52 |   information files. Notably, included inside of `.github` is a CI to test JebP
53 |   on various supported platforms (see below).
54 | 
55 | ## What platforms are supported?
56 | Any compiler that is configured to support at least C99 (VLA support is not
57 | required) is supported on any architecture, operating-system or endianness
58 | (including PDP-endian technically, see below). If you find a compiler issue or
59 | bug with any compiler that 100% supports C99 (without VLA) including support for
60 | the required system headers documented in the source, then I will be very happy
61 | to fix it, no matter how obscure the compiler is.
62 | 
63 | However, certain operating-systems, architectures and compilers are explicitly
64 | detected by the code to enable better support, extended features or
65 | optimizations. Some of these supported platforms also are tested by the CI for
66 | every change. The full list of macros used to detect the platform is documented
67 | in the source file. The detected platforms are:
68 | - The compilers GCC, Clang and MSVC. Only GCC currently has version detection,
69 |   the other two assume everything is supported.
70 | - The architectures x86-32 (x86, IA-32, i386), x86-64 (AMD64), AArch32 (arm32,
71 |   armhf) and AArch64 (arm64).
72 | - The SIMD variants SSE2 (x86) and Neon (AArch). Currently unused.
73 | - Any little-endian architecture. Any other form of endianness, including
74 |   big-endian, are not detected and do not have optimized functions. Instead,
75 |   they use naive approaches.
76 | 
77 | As a subset of these, the following platforms are tested by the CI:
78 | - GCC on Ubuntu.
79 | - Clang on Ubuntu.
80 | - MSVC on Windows.
81 | - Clang on macOS.
82 | - MinGW on Ubuntu.
83 | - Cross-compile GCC x86-32, AArch64, AArch32 (soft and hard float) on Ubuntu.
84 | 
85 | Finally, the following is also supported:
86 | - Transgender guys, gals, and non-binary pals :transgender_flag: because
87 |   everyone needs a bit of support in their life :heart:
88 | 


--------------------------------------------------------------------------------
/dev/.clang-format:
--------------------------------------------------------------------------------
1 | BasedOnStyle: LLVM
2 | IndentWidth: 4
3 | AllowShortFunctionsOnASingleLine: Empty 
4 | 


--------------------------------------------------------------------------------
/dev/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Formatting
 2 | set(format_files
 3 |     ../jebp.h
 4 |     ../jebp.c
 5 |     ../jebpview.c
 6 |     jebptest.c
 7 |     jebpfuzz.c
 8 | )
 9 | find_program(CLANG_FORMAT clang-format)
10 | if(CLANG_FORMAT)
11 |     add_custom_target(
12 |         format
13 |         COMMAND ${CLANG_FORMAT} -style=file -i ${format_files}
14 |         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
15 |     )
16 | endif()
17 | 
18 | # Tests
19 | function(add_test_image name image size hash) # register one CTest case that runs jebptest on `image`
20 |     add_test(
21 |         NAME ${name}
22 |         COMMAND jebptest ${image} ${size} ${hash}
23 |         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} # image paths are given relative to this directory
24 |     )
25 |     if(HAVE_FSANITIZE_LEAK)
26 |         set_tests_properties(${name} PROPERTIES ENVIRONMENT ASAN_OPTIONS=detect_leaks=1) # only when -fsanitize=leak was accepted
27 |     endif()
28 | endfunction()
29 | if(BUILD_TESTING)
30 |     add_executable(jebptest jebptest.c)
31 |     target_link_libraries(jebptest PRIVATE ${link_options})
32 |     add_test_image(lossless_rose lossless/rose.webp 512x384 943882e6)
33 |     add_test_image(lossless_tux lossless/tux.webp 386x395 dd77ded2)
34 |     add_test_image(lossless_dice lossless/dice.webp 800x600 f3285d5a)
35 |     add_test_image(lossless_compass lossless/compass.webp 593x593 72c1b232)
36 | endif()
37 | 
38 | # Fuzz test
39 | if(JEBP_SANITIZE)
40 |     add_c_flag(-fsanitize=fuzzer HAVE_FSANITIZE_FUZZER)
41 |     if(HAVE_FSANITIZE_FUZZER)
42 |         set(fuzz_dir ${CMAKE_BINARY_DIR}/fuzz)
43 |         file(MAKE_DIRECTORY ${fuzz_dir})
44 |         add_executable(jebpfuzz jebpfuzz.c)
45 |         target_link_libraries(jebpfuzz PRIVATE ${link_options})
46 |         add_custom_target(
47 |             fuzz
48 |             COMMAND jebpfuzz ${fuzz_dir} lossless
49 |             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
50 |         )
51 |     endif()
52 | endif()
53 | 


--------------------------------------------------------------------------------
/dev/credits.md:
--------------------------------------------------------------------------------
 1 | ### [`lossless/rose.webp`](lossless/rose.webp)
 2 | "Free Stock Photo in High Resolution - Yellow Rose 3 - Flowers" \
 3 | Image Author: Jon Sullivan \
 4 | This file is in the public domain. \
 5 | [JPEG source](https://web.archive.org/web/20190909072451/http://www.public-domain-photos.com/free-stock-photos-4/flowers/yellow-rose-3.jpg)
 6 | 
 7 | ### [`lossless/tux.webp`](lossless/tux.webp)
 8 | "baby tux for my user page" \
 9 | Image Author: Fizyplankton \
10 | This file is in the public domain. \
11 | [PNG source](https://web.archive.org/web/20111213095701/http://www.minecraftwiki.net/images/8/85/Fizyplankton.png)
12 | 
13 | ### [`lossless/dice.webp`](lossless/dice.webp)
14 | "PNG transparency demonstration" \
15 | Image Author: POV-Ray source code \
16 | Photo licensed under the [Creative Commons][cc] [Attribution-Share Alike 3.0 Unported][by-sa] license. \
17 | [PNG source](https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png)
18 | 
19 | ### [`lossless/compass.webp`](lossless/compass.webp)
20 | "Transparent compass card for overlays" \
21 | Image Author: Denelson83 \
22 | This file is licensed under the [Creative Commons][cc] [Attribution-Share Alike 3.0 Unported][by-sa] license. \
23 | [PNG source](https://upload.wikimedia.org/wikipedia/commons/d/d3/Compass_Card_transparent.png)
24 | 
25 | [cc]: https://en.wikipedia.org/wiki/en:Creative_Commons
26 | [by-sa]: https://creativecommons.org/licenses/by-sa/3.0/deed.en
27 | 


--------------------------------------------------------------------------------
/dev/jebpfuzz.c:
--------------------------------------------------------------------------------
 1 | #define JEBP_IMPLEMENTATION
 2 | #include "../jebp.h"
 3 | 
 4 | int LLVMFuzzerTestOneInput(const jebp_ubyte *data, size_t size) {
 5 |     jebp_image_t image = {0, 0, NULL}; // safe to free if decode bails early
 6 |     jebp_decode(&image, size, data); // result ignored: only crashes matter
 7 |     jebp_free_image(&image);
 8 |     return 0; // fuzz target: every input is reported as processed
 9 | }
10 | 


--------------------------------------------------------------------------------
/dev/jebptest.c:
--------------------------------------------------------------------------------
  1 | #ifdef NDEBUG
  2 | #undef NDEBUG
  3 | #endif // NDEBUG
  4 | #include <assert.h>
  5 | #include <stdio.h>
  6 | #include <stdlib.h>
  7 | #define JEBP_IMPLEMENTATION
  8 | #include "../jebp.h"
  9 | 
 10 | static jebp_uint test_image_hash(jebp_image_t *image) {
 11 |     // FNV-1a style fold over each pixel's r, g, b and a bytes, in that order
 12 |     const jebp_uint prime = 16777619;
 13 |     jebp_uint digest = 2166136261;
 14 |     jebp_int count = image->width * image->height;
 15 |     for (int idx = 0; idx < count; idx += 1) {
 16 |         const jebp_color_t *px = &image->pixels[idx];
 17 |         digest = (digest ^ px->r) * prime;
 18 |         digest = (digest ^ px->g) * prime;
 19 |         digest = (digest ^ px->b) * prime;
 20 |         digest = (digest ^ px->a) * prime;
 21 |     }
 22 |     return digest;
 23 | }
 24 | 
 25 | int main(int argc, char **argv) { // args: <image path> <WxH> <hex hash>
 26 |     assert(argc == 4);
 27 |     const char *path = argv[1];
 28 |     jebp_int width, height;
 29 |     assert(sscanf(argv[2], "%ix%i", &width, &height) == 2);
 30 |     jebp_uint hash;
 31 |     assert(sscanf(argv[3], "%08x", &hash) == 1);
 32 |     jebp_image_t image = {0, 0, NULL};
 33 |     jebp_uint image_hash;
 34 | 
 35 |     // jebp_error_string: codes outside [OK, UNKNOWN) map to "unknown"
 36 |     const char *unknown = jebp_error_string(JEBP_ERROR_UNKNOWN);
 37 |     assert(unknown != NULL);
 38 |     for (int i = -100; i < 100; i += 1) {
 39 |         const char *error = jebp_error_string(i);
 40 |         assert(error != NULL);
 41 |         if (i >= JEBP_OK && i < JEBP_ERROR_UNKNOWN) {
 42 |             assert(error != unknown);
 43 |         } else {
 44 |             assert(error == unknown);
 45 |         }
 46 |     }
 47 | 
 48 | #ifndef JEBP_NO_STDIO
 49 |     // jebp_read_size
 50 |     assert(jebp_read_size(NULL, path) == JEBP_ERROR_INVAL);
 51 |     assert(jebp_read_size(&image, NULL) == JEBP_ERROR_INVAL);
 52 |     assert(jebp_read_size(&image, ".") == JEBP_ERROR_IO);
 53 |     assert(jebp_read_size(&image, path) == JEBP_OK);
 54 |     printf("Image size: %ix%i\n", image.width, image.height);
 55 |     assert(image.width == width && image.height == height);
 56 |     assert(image.pixels == NULL);
 57 |     image.width = 0;
 58 |     image.height = 0;
 59 | 
 60 |     // jebp_read
 61 |     assert(jebp_read(NULL, path) == JEBP_ERROR_INVAL);
 62 |     assert(jebp_read(&image, NULL) == JEBP_ERROR_INVAL);
 63 |     assert(jebp_read(&image, ".") == JEBP_ERROR_IO);
 64 |     assert(jebp_read(&image, path) == JEBP_OK);
 65 |     printf("Image size: %ix%i\n", image.width, image.height);
 66 |     assert(image.width == width && image.height == height);
 67 |     assert(image.pixels != NULL);
 68 |     image_hash = test_image_hash(&image);
 69 |     printf("Image hash: %08x\n", image_hash);
 70 |     assert(image_hash == hash);
 71 | 
 72 |     // jebp_free_image
 73 |     jebp_free_image(NULL);
 74 |     jebp_free_image(&image);
 75 |     assert(image.width == 0 && image.height == 0);
 76 |     assert(image.pixels == NULL);
 77 | #endif // JEBP_NO_STDIO
 78 | 
 79 |     FILE *file = fopen(path, "rb"); // slurp the file for the in-memory tests
 80 |     assert(file != NULL);
 81 |     assert(fseek(file, 0, SEEK_END) == 0);
 82 |     size_t size = (size_t)ftell(file);
 83 |     assert(size > 32 && size != (size_t)-1); // (size_t)-1 is a failed ftell
 84 |     void *data = malloc(size);
 85 |     assert(data != NULL);
 86 |     assert(fseek(file, 0, SEEK_SET) == 0);
 87 |     assert(fread(data, 1, size, file) == size);
 88 |     assert(fclose(file) == 0);
 89 | 
 90 |     // jebp_decode_size
 91 |     assert(jebp_decode_size(NULL, size, data) == JEBP_ERROR_INVAL);
 92 |     assert(jebp_decode_size(&image, 0, data) == JEBP_ERROR_EOF);
 93 |     assert(jebp_decode_size(&image, size, NULL) == JEBP_ERROR_INVAL);
 94 |     assert(jebp_decode_size(&image, 32, data) == JEBP_OK); // 32-byte prefix
 95 |     printf("Image size: %ix%i\n", image.width, image.height);
 96 |     assert(image.width == width && image.height == height);
 97 |     assert(image.pixels == NULL);
 98 |     image.width = 0;
 99 |     image.height = 0;
100 | 
101 |     // jebp_decode
102 |     assert(jebp_decode(NULL, size, data) == JEBP_ERROR_INVAL);
103 |     assert(jebp_decode(&image, 0, data) == JEBP_ERROR_EOF);
104 |     assert(jebp_decode(&image, size, NULL) == JEBP_ERROR_INVAL);
105 |     assert(jebp_decode(&image, size, data) == JEBP_OK);
106 |     printf("Image size: %ix%i\n", image.width, image.height);
107 |     assert(image.width == width && image.height == height);
108 |     assert(image.pixels != NULL);
109 |     image_hash = test_image_hash(&image);
110 |     printf("Image hash: %08x\n", image_hash);
111 |     assert(image_hash == hash);
112 | 
113 |     // jebp_free_image
114 |     jebp_free_image(NULL);
115 |     jebp_free_image(&image);
116 |     assert(image.width == 0 && image.height == 0);
117 |     assert(image.pixels == NULL);
118 | 
119 |     free(data);
120 |     return EXIT_SUCCESS;
121 | }
122 | 


--------------------------------------------------------------------------------
/dev/lossless/compass.webp:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ba2fa4ca9fce2275438cf397bc600be8b1c2bb1042cc6cd0c1be2de7d6454849
3 | size 212298
4 | 


--------------------------------------------------------------------------------
/dev/lossless/dice.webp:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b03cc8e96d391ec222a48023f9f49b16f64f6cc9fc7366c8d02a85c6e8a49644
3 | size 155574
4 | 


--------------------------------------------------------------------------------
/dev/lossless/rose.webp:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c5c8fdad726feba1c7f4a98aef98db477b8cb9e5554b8dc16bad1cf48b6d09c3
3 | size 212520
4 | 


--------------------------------------------------------------------------------
/dev/lossless/tux.webp:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:67d332b637d19997ca95359099067205ce28226d15320ae8f17da51e4f814c95
3 | size 27650
4 | 


--------------------------------------------------------------------------------
/dev/toolchain/aarch64-linux-gnu.cmake:
--------------------------------------------------------------------------------
1 | set(CMAKE_SYSTEM_NAME Linux)
2 | set(CMAKE_SYSTEM_PROCESSOR arm64)
3 | set(prefix aarch64-linux-gnu)
4 | set(CMAKE_C_COMPILER ${prefix}-gcc)
5 | set(CMAKE_CROSSCOMPILING_EMULATOR qemu-aarch64 -L /usr/${prefix})
6 | 


--------------------------------------------------------------------------------
/dev/toolchain/arm-linux-gnueabi.cmake:
--------------------------------------------------------------------------------
1 | set(CMAKE_SYSTEM_NAME Linux)
2 | set(CMAKE_SYSTEM_PROCESSOR arm)
3 | set(prefix arm-linux-gnueabi)
4 | set(CMAKE_C_COMPILER ${prefix}-gcc)
5 | set(CMAKE_CROSSCOMPILING_EMULATOR qemu-arm -L /usr/${prefix})
6 | 


--------------------------------------------------------------------------------
/dev/toolchain/arm-linux-gnueabihf.cmake:
--------------------------------------------------------------------------------
1 | set(CMAKE_SYSTEM_NAME Linux)
2 | set(CMAKE_SYSTEM_PROCESSOR armhf)
3 | set(prefix arm-linux-gnueabihf)
4 | set(CMAKE_C_COMPILER ${prefix}-gcc)
5 | set(CMAKE_CROSSCOMPILING_EMULATOR qemu-arm -L /usr/${prefix})
6 | 


--------------------------------------------------------------------------------
/dev/toolchain/i686-linux-gnu.cmake:
--------------------------------------------------------------------------------
1 | set(CMAKE_SYSTEM_NAME Linux)
2 | set(CMAKE_SYSTEM_PROCESSOR i686)
3 | set(prefix i686-linux-gnu)
4 | set(CMAKE_C_COMPILER ${prefix}-gcc)
5 | set(CMAKE_CROSSCOMPILING_EMULATOR qemu-i386 -L /usr/${prefix})
6 | 


--------------------------------------------------------------------------------
/dev/toolchain/x86_64-w64-mingw32.cmake:
--------------------------------------------------------------------------------
1 | set(CMAKE_SYSTEM_NAME Windows)
2 | set(CMAKE_SYSTEM_PROCESSOR x86_64)
3 | set(prefix x86_64-w64-mingw32)
4 | set(CMAKE_C_COMPILER ${prefix}-gcc)
5 | set(CMAKE_CROSSCOMPILING_EMULATOR wine)
6 | 


--------------------------------------------------------------------------------
/jebp.c:
--------------------------------------------------------------------------------
1 | #define JEBP_IMPLEMENTATION
2 | #include "jebp.h"
3 | 


--------------------------------------------------------------------------------
/jebp.h:
--------------------------------------------------------------------------------
   1 | /**
   2 |  * JebP - Single header WebP decoder
   3 |  */
   4 | 
   5 | /**
   6 |  * LICENSE
   7 |  **
   8 |  * MIT No Attribution
   9 |  *
  10 |  * Copyright 2022 Jasmine Minter
  11 |  *
  12 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  13 |  * of this software and associated documentation files (the "Software"), to deal
  14 |  * in the Software without restriction, including without limitation the rights
  15 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  16 |  * copies of the Software, and to permit persons to whom the Software is
  17 |  * furnished to do so.
  18 |  *
  19 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  20 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  21 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  22 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  23 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  24 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  25 |  * SOFTWARE.
  26 |  */
  27 | 
  28 | // Attribution is not required, but would be appreciated :)
  29 | 
  30 | /**
  31 |  * DOCUMENTATION
  32 |  **
  33 |  * First and foremost, this project uses some custom types:
  34 |  *   `jebp_byte`/`jebp_ubyte` is a singular byte.
  35 |  *   `jebp_short`/`jebp_ushort` is an integer of at least 16 bits.
  36 |  *   `jebp_int`/`jebp_uint` is an integer of at least 32 bits.
  37 |  *
  38 |  * This is a header only file. This means that it operates as a standard header
  39 |  * and to generate the source file you define `JEBP_IMPLEMENTATION` in ONE file
  40 |  * only. For example:
  41 |  * ```c
  42 |  * #define JEBP_IMPLEMENTATION
  43 |  * #include "jebp.h"
  44 |  * ```
  45 |  *
  46 |  * The most basic API call in this library is:
  47 |  * ```c
  48 |  * err = jebp_decode(&image, size, data);
  49 |  * ```
  50 |  * where:
  51 |  *   `jebp_image_t *image` is a pointer to an image structure to receive the
  52 |  *                         decoded data.
  53 |  *   `size_t size` is the size of the WebP-encoded data buffer.
  54 |  *   `const void *data` is a pointer to the WebP encoded data buffer, `size`
  55 |  *                      bytes large.
  56 |  *   `jebp_error_t err` is the result of the operation (OK or an error code).
  57 |  *
  58 |  * For reading from a provided file path, this API call can be used instead:
  59 |  * ```c
  60 |  * err = jebp_read(&image, path);
  61 |  * ```
  62 |  * where:
  63 |  *   `const char *path` is the path of the file to be read.
  64 |  *
  65 |  * It is currently not possible to read from a `FILE *` object.
  66 |  * If you only want to get the size of the image without a full read, these
  67 |  * functions can be used instead:
  68 |  * ```c
  69 |  * err = jebp_decode_size(&image, size, data);
  70 |  * err = jebp_read_size(&image, path);
  71 |  * ```
  72 |  *
  73 |  * The `jebp_image_t` structure has the following properties:
  74 |  *   `jebp_int width` is the width of the image.
  75 |  *   `jebp_int height` is the height of the image.
  76 |  *   `jebp_color_t *pixels` points to an array of pixels. Each `jebp_color_t`
  77 |  *                          structure contains four `jebp_ubyte` values for `r`,
  78 |  *                          `g`, `b` and `a`. This allows the `pixels` pointer
  79 |  *                          to be cast to `jebp_ubyte *` to get an RGBA pixel
  80 |  *                          buffer.
  81 |  * The allocated data in the image can be freed with:
  82 |  * ```c
  83 |  * jebp_free_image(&image);
  84 |  * ```
  85 |  * This function will also clear the structure, notably width and height will be
  86 |  * set to 0.
  87 |  *
  88 |  * The `jebp_error_t` enumeration has the following values:
  89 |  *   `JEBP_OK` means the operation completed successfully.
  90 |  *   `JEBP_ERROR_INVAL` means one of the arguments provided is invalid, usually
  91 |  *                      this refers to a NULL pointer.
  92 |  *   `JEBP_ERROR_INVDATA` means the WebP-encoded data is invalid or corrupted.
  93 |  *   `JEBP_ERROR_INVDATA_HEADER` is a suberror of `INVDATA` that indicates that
  94 |  *                      the header bytes are invalid. This file is likely not a
  95 |  *                      WebP file.
  96 |  *   `JEBP_ERROR_EOF` means the end of the file (or data buffer) was reached
  97 |  *                    before the operation could successfully complete.
  98 |  *   `JEBP_ERROR_NOSUP` means there is a feature in the WebP stream that is not
  99 |  *                      currently supported (see below). This can also represent
 100 |  *                      new features, versions or RIFF-chunks that were not in
 101 |  *                      the specification when writing.
 102 |  *   `JEBP_ERROR_NOSUP_CODEC` is a suberror of `NOSUP` that indicates that the
 103 |  *                      RIFF chunk that is most likely for the codec is not
 104 |  *                      recognized. Currently lossy images are not supported
 105 |  *                      (see below) and lossless image support can be disabled
 106 |  *                      (see `JEBP_NO_VP8L`).
 107 |  *   `JEBP_ERROR_NOSUP_PALETTE` is a suberror of `NOSUP` that indicates that the
 108 |  *                      image has a color-index transform (in WebP terminology,
 109 |  *                      this would be a paletted image). Color-indexing
 110 |  *                      transforms are not currently supported (see below). Note
 111 |  *                      that this error code might be removed after
 112 |  *                      color-indexing transform support is added, this is only
 113 |  *                      here for now to help detecting common issues.
 114 |  *   `JEBP_ERROR_NOMEM` means that a memory allocation failed, indicating that
 115 |  *                      there is no more memory available.
 116 |  *   `JEBP_ERROR_IO` represents any generic I/O error, usually from
 117 |  *                   file-reading.
 118 |  *   `JEBP_ERROR_UNKNOWN` means any unknown error. Currently this is only used
 119 |  *                        when an unknown value is passed into
 120 |  *                        `jebp_error_string`.
 121 |  * To get a human-readable string of the error, the following function can be
 122 |  * used:
 123 |  * ```c
 124 |  * const char *error = jebp_error_string(err);
 125 |  * ```
 126 |  *
 127 |  * This is not a feature-complete WebP decoder and has the following
 128 |  * limitations:
 129 |  *   - Does not support decoding lossy files with VP8.
 130 |  *   - Does not support extended file-formats with VP8X.
 131 |  *   - Does not support VP8L lossless images with the color-indexing transform
 132 |  *     (paletted images).
 133 |  *   - Does not support VP8L images with more than 256 huffman groups. This is
 134 |  *     an arbitrary limit to prevent bad images from using too much memory. In
 135 |  *     theory, images requiring more groups should be very rare. This limit may
 136 |  *     be increased in the future.
 137 |  *
 138 |  * Features that will probably never be supported due to complexity or API
 139 |  * constraints:
 140 |  *   - Decoding color profiles.
 141 |  *   - Decoding metadata.
 142 |  *   - Full color-indexing/palette support will be a bit of a mess, so don't
 143 |  *     expect full support of that coming anytime soon. Simple color-indexing
 144 |  *     support (more than 16 colors, skipping the need for bit-packing) is
 145 |  *     definitely a lot more doable.
 146 |  *
 147 |  * Along with `JEBP_IMPLEMENTATION` defined above, there are a few other macros
 148 |  * that can be defined to change how JebP operates:
 149 |  *   `JEBP_NO_STDIO` will disable the file-reading API.
 150 |  *   `JEBP_NO_SIMD` will disable SIMD optimizations. These are currently
 151 |  *                  not-used but the detection is there ready for further work.
 152 |  *   `JEBP_NO_VP8L` will disable VP8L (lossless) decoding support. Note that
 153 |  *                  currently this will make all images fail since VP8L is the
 154 |  *                  only supported codec right now.
 155 |  *   `JEBP_ALLOC` and `JEBP_FREE` can be defined to functions for a custom
 156 |  *                allocator. They either both have to be defined or neither
 157 |  *                defined.
 158 |  *
 159 |  * This single-header library requires C99 to be supported. Along with this it
 160 |  * requires the following headers from the system to successfully compile. Some
 161 |  * of these can be disabled with the above macros:
 162 |  *   `stddef.h` is used for the definition of the `size_t` type.
 163 |  *   `limits.h` is used for the `UINT_MAX` macro to check the size of `int`. If
 164 |  *              `int` is not 32-bits, `long` will be used for `jebp_int`
 165 |  *              instead.
 166 |  *   `string.h` is used for `memset` to clear out memory.
 167 |  *   `stdio.h` is used for I/O support and logging errors. If `JEBP_NO_STDIO` is
 168 |  *             defined and `JEBP_LOG_ERRORS` is not defined, this will not be
 169 |  *             included.
 170 |  *   `stdlib.h` is used for the default implementations of `JEBP_ALLOC`
 171 |  *              and `JEBP_FREE`, using `malloc` and `free` respectively. If
 172 |  *              those macros are already defined to something else, this will
 173 |  *              not be included.
 174 |  *   `emmintrin.h` and `arm_neon.h` are used for SIMD intrinsics. If
 175 |  *                 `JEBP_NO_SIMD` is defined these will not be included.
 176 |  *
 177 |  * The following predefined macros are also used for compiler-feature, SIMD and
 178 |  * endianness detection. These can be changed or modified before import to
 179 |  * change JebP's detection logic:
 180 |  *   `__STDC_VERSION__` is used to detect if the compiler supports C99 and also
 181 |  *                      checks for C11 support to use `_Noreturn`.
 182 |  *   `__has_attribute` and `__has_builtin` are used to detect the `noreturn` and
 183 |  *                     `always_inline` attributes, along with the
 184 |  *                     `__builtin_bswap32` builtin. Note that `__has_attribute`
 185 |  *                     does not fallback to compiler-version checks since most
 186 |  *                     compilers already support `__has_attribute`.
 187 |  *   `__GNUC__` and `__GNUC_MINOR__` are used to detect if the compiler is GCC
 188 |  *              (or GCC compatible) and what version of GCC it is. This, in
 189 |  *              turn, is used to polyfill `__has_builtin` on older compilers
 190 |  *              that may not support it.
 191 |  *   `__clang__` is used to detect the Clang compiler. This is only used to set
 192 |  *               the detected GCC version higher since Clang still marks itself
 193 |  *               as GCC 4.2 by default. No Clang version detection is done.
 194 |  *   `_MSC_VER` is used to detect the MSVC compiler. This is used to check
 195 |  *              support for `__declspec(noreturn)`, `__forceinline` and
 196 |  *              `_byteswap_ulong`. No MSVC version detection is done.
 197 |  *   `__LITTLE_ENDIAN__` is used to check if the architecture is little-endian.
 198 |  *                       Note that this is only checked either if the
 199 |  *                       architecture cannot be detected or, in special cases,
 200 |  *                       where there is not enough information from the
 201 |  *                       architecture or compiler to detect endianness. Also
 202 |  *                       note that big-endian and other more-obscure endian
 203 |  *                       types are not detected. Little-endian is the only
 204 |  *                       endianness detected and is used for optimization in a
 205 |  *                       few areas. If the architecture is not little-endian or
 206 |  *                       cannot be detected as such, a naive solution is used
 207 |  *                       instead.
 208 |  *   `__i386`, `__i386__` and `_M_IX86` are used to detect if this is being
 209 |  *           compiled for x86-32 (also known as x86, IA-32, or i386). If one of
 210 |  *           these are defined, it is also assumed that the architecture is
 211 |  *           little-endian. `_M_IX86` is usually present on MSVC, while
 212 |  *           the other two are usually present on most other compilers.
 213 |  *   `__SSE2__` and `_M_IX86_FP` are used to detect SSE2 support on x86-32.
 214 |  *              `_M_IX86_FP`, which is usually present on MSVC, must equal 2 to
 215 |  *              indicate that the code is being compiled for a SSE2-compatible
 216 |  *              floating-point unit. `__SSE2__` is usually present on most other
 217 |  *              compilers.
 218 |  *   `__x86_64`, `__x86_64__` and `_M_X64` are used to detect if this is being
 219 |  *            compiled for x86-64 (also known as AMD64). If one of these are
 220 |  *            defined, it is also assumed that the architecture is little-endian
 221 |  *            and that SSE2 is supported (which is required for x86-64 support).
 222 |  *            `_M_X64` is usually present on MSVC, while the other two are
 223 |  *            usually present on most other compilers.
 224 |  *   `__arm`, `__arm__` and `_M_ARM` are used to detect if this is being
 225 |  *            compiled for AArch32 (also known as arm32 or armhf). If one of
 226 |  *            these are defined on Windows, it is also assumed that Neon is
 227 |  *            supported (which is required for Windows). `_M_ARM` is usually
 228 |  *            present on MSVC while the other two are usually present on most
 229 |  *            other compilers.
 230 |  *   `__ARM_NEON` is used to detect Neon support on AArch32. MSVC doesn't seem
 231 |  *                to support this and I can't find any info on detecting Neon
 232 |  *                support for MSVC. I have found mentions of Windows requiring
 233 |  *                Neon support but cannot find any concrete proof anywhere.
 234 |  *   `__aarch64`, `__aarch64__` and `_M_ARM64` are used to detect if this is
 235 |  *                being compiled for AArch64 (also known as arm64). If one of
 236 |  *                these are defined, it is also assumed that Neon is supported
 237 |  *                (which is required for AArch64 support). `_M_ARM64` is usually
 238 |  *                present on MSVC, while the other two are usually present on
 239 |  *                most other compilers.
 240 |  *   `__ARM_BIG_ENDIAN` is used to detect, on AArch/ARM architectures, if it is
 241 |  *                      in big-endian mode. However, as mentioned above, there
 242 |  *                      is no special code for big-endian and it's worth noting
 243 |  *                      that this is just used to force-disable little-endian.
 244 |  *                      If this is not present, little-endian is assumed when
 245 |  *                      `__ARM_ACLE` is defined or the compiler is MSVC (which
 246 |  *                      only supports little-endian); otherwise it falls back
 247 |  *                      to `__LITTLE_ENDIAN__` and the GCC-style
 248 |  *                      `__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__` check. MSVC
 249 |  *                      does not seem to provide a macro to detect endianness.
 250 |  */
 251 | 
 252 | /**
 253 |  * HEADER
 254 |  */
 255 | #ifndef JEBP__HEADER
 256 | #define JEBP__HEADER
 257 | #ifdef __cplusplus
 258 | extern "C" {
 259 | #endif // __cplusplus
 260 | #include <limits.h>
 261 | #include <stddef.h>
 262 | 
 263 | #if UINT_MAX >= 0xffffffff // `int` is at least 32 bits wide
 264 | #define JEBP__INT int
 265 | #else // narrower `int`: use `long` instead
 266 | #define JEBP__INT long
 267 | #endif
 268 | typedef signed char jebp_byte;
 269 | typedef unsigned char jebp_ubyte;
 270 | typedef short jebp_short;
 271 | typedef unsigned short jebp_ushort;
 272 | typedef JEBP__INT jebp_int; // `int` or `long`, as selected above
 273 | typedef unsigned JEBP__INT jebp_uint;
 274 | 
 275 | typedef enum jebp_error_t {
 276 |     JEBP_OK,                   // operation completed successfully
 277 |     JEBP_ERROR_INVAL,          // an argument is invalid (usually NULL)
 278 |     JEBP_ERROR_INVDATA,        // WebP data is invalid or corrupted
 279 |     JEBP_ERROR_INVDATA_HEADER, // suberror of INVDATA: invalid header bytes
 280 |     JEBP_ERROR_EOF,            // data ended before the operation completed
 281 |     JEBP_ERROR_NOSUP,          // stream uses an unsupported feature
 282 |     JEBP_ERROR_NOSUP_CODEC,    // suberror of NOSUP: unrecognized codec chunk
 283 |     JEBP_ERROR_NOSUP_PALETTE,  // suberror of NOSUP: color-index transform
 284 |     JEBP_ERROR_NOMEM,          // a memory allocation failed
 285 |     JEBP_ERROR_IO,             // generic I/O error, usually file-reading
 286 |     JEBP_ERROR_UNKNOWN,        // unknown error value
 287 |     JEBP_NB_ERRORS             // count of the values above, not an error
 288 | } jebp_error_t;
 289 | 
 290 | typedef struct jebp_color_t { // one pixel, laid out as four RGBA bytes
 291 |     jebp_ubyte r; // red
 292 |     jebp_ubyte g; // green
 293 |     jebp_ubyte b; // blue
 294 |     jebp_ubyte a; // alpha
 295 | } jebp_color_t;
 296 | 
 297 | typedef struct jebp_image_t {
 298 |     jebp_int width;       // width of the image
 299 |     jebp_int height;      // height of the image
 300 |     jebp_color_t *pixels; // pixel array, released with jebp_free_image
 301 | } jebp_image_t;
 302 | 
 303 | const char *jebp_error_string(jebp_error_t err); // human-readable message
 304 | void jebp_free_image(jebp_image_t *image); // frees pixels, clears the struct
 305 | jebp_error_t jebp_decode_size(jebp_image_t *image, size_t size,
 306 |                               const void *data); // size only, no full decode
 307 | jebp_error_t jebp_decode(jebp_image_t *image, size_t size, const void *data);
 308 | 
 309 | // I/O API: the same two operations, reading the WebP data from file `path`
 310 | #ifndef JEBP_NO_STDIO
 311 | jebp_error_t jebp_read_size(jebp_image_t *image, const char *path);
 312 | jebp_error_t jebp_read(jebp_image_t *image, const char *path);
 313 | #endif // JEBP_NO_STDIO
 314 | 
 315 | #ifdef __cplusplus
 316 | }
 317 | #endif // __cplusplus
 318 | #endif // JEBP__HEADER
 319 | 
 320 | /**
 321 |  * IMPLEMENTATION
 322 |  */
 323 | #ifdef JEBP_IMPLEMENTATION
 324 | #include <string.h>
 325 | #if !defined(JEBP_NO_STDIO) || defined(JEBP_LOG_ERRORS)
 326 | #include <stdio.h>
 327 | #endif
 328 | #if !defined(JEBP_ALLOC) && !defined(JEBP_FREE) // no custom allocator given
 329 | #include <stdlib.h>
 330 | #define JEBP_ALLOC malloc
 331 | #define JEBP_FREE free
 332 | #elif !defined(JEBP_ALLOC) || !defined(JEBP_FREE) // only one of the pair
 333 | #error "Both JEBP_ALLOC and JEBP_FREE have to be defined"
 334 | #endif
 335 | 
 336 | /**
 337 |  * Predefined macro detection
 338 |  */
 339 | #ifdef __STDC_VERSION__
 340 | #if __STDC_VERSION__ < 199901
 341 | #error "Standard C99 support is required"
 342 | #endif
 343 | #else // __STDC_VERSION__
 344 | #if defined(__GNUC__)
 345 | #warning "C version cannot be checked, compilation may fail"
 346 | #elif defined(_MSC_VER)
 347 | #pragma message(                                                               \
 348 |     "MSVC by default is C89 'with extensions', use /std:c11 to ensure there are no errors")
 349 | #endif
 350 | #endif // __STDC_VERSION__
 351 | #if defined(__clang__)
 352 | // Clang reports itself as GCC 4.2 by default, just short of what we need
 353 | #define JEBP__GNU_VERSION 403
 354 | #elif defined(__GNUC__) // encoded as major * 100 + minor
 355 | #define JEBP__GNU_VERSION ((__GNUC__ * 100) + __GNUC_MINOR__)
 356 | #else // not a GCC-compatible compiler
 357 | #define JEBP__GNU_VERSION 0
 358 | #endif // __clang__ / __GNUC__
 359 | 
 360 | #ifdef __has_attribute
 361 | #define JEBP__HAS_ATTRIBUTE __has_attribute
 362 | #else // __has_attribute
 363 | // We don't add GCC version checks since, unlike __has_builtin, __has_attribute
 364 | // has been out for so long that it's more likely that the compiler supports it.
 365 | #define JEBP__HAS_ATTRIBUTE(attr) 0
 366 | #endif // __has_attribute
 367 | #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L // C11 keyword
 368 | #define JEBP__NORETURN _Noreturn
 369 | #elif JEBP__HAS_ATTRIBUTE(noreturn)
 370 | #define JEBP__NORETURN __attribute__((noreturn))
 371 | #elif defined(_MSC_VER)
 372 | #define JEBP__NORETURN __declspec(noreturn)
 373 | #else // no known spelling: expands to nothing
 374 | #define JEBP__NORETURN
 375 | #endif
 376 | #if JEBP__HAS_ATTRIBUTE(always_inline)
 377 | #define JEBP__ALWAYS_INLINE __attribute__((always_inline))
 378 | #elif defined(_MSC_VER)
 379 | #define JEBP__ALWAYS_INLINE __forceinline
 380 | #else // no known spelling: plain `static inline` is used
 381 | #define JEBP__ALWAYS_INLINE
 382 | #endif
 383 | #define JEBP__INLINE static inline JEBP__ALWAYS_INLINE
 384 | 
 385 | #ifdef __has_builtin
 386 | #define JEBP__HAS_BUILTIN __has_builtin
 387 | #else // __has_builtin
 388 | #define JEBP__HAS_BUILTIN(builtin)                                             \
 389 |     JEBP__VERSION##builtin != 0 && JEBP__GNU_VERSION >= JEBP__VERSION##builtin
 390 | // I believe this was added earlier but GCC 4.3 is the first time it was
 391 | // mentioned in the changelog and manual.
 392 | #define JEBP__VERSION__builtin_bswap32 403
 393 | #endif // __has_builtin
 394 | #if JEBP__HAS_BUILTIN(__builtin_bswap32)
 395 | #define JEBP__SWAP32(value) __builtin_bswap32(value)
 396 | #elif defined(_MSC_VER)
 397 | #define JEBP__SWAP32(value) _byteswap_ulong(value)
 398 | #endif
 399 | 
 400 | // We don't do any SIMD runtime detection since that causes a lot of
 401 | // heavily-documented issues that I won't go into here. Instead, if the compiler
 402 | // supports it (and requests it) we will use it. It helps that both x86-64 and
 403 | // AArch64 always support the SIMD from their 32-bit counterparts.
 404 | #if defined(__i386) || defined(__i386__) || defined(_M_IX86) // x86-32
 405 | #define JEBP__ARCH_X86
 406 | #if defined(__SSE2__) || _M_IX86_FP == 2 // SSE2 is optional on x86-32
 407 | #define JEBP__SIMD_SSE2
 408 | #endif
 409 | #elif defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) // x86-64
 410 | #define JEBP__ARCH_X86
 411 | #define JEBP__SIMD_SSE2
 412 | #elif defined(__arm) || defined(__arm__) || defined(_M_ARM) // AArch32
 413 | #define JEBP__ARCH_ARM
 414 | #if defined(__ARM_NEON) || defined(_MSC_VER)
 415 | // According to the following article, MSVC requires Neon support
 416 | // https://docs.microsoft.com/en-us/cpp/build/overview-of-arm-abi-conventions
 417 | #define JEBP__SIMD_NEON
 418 | #endif
 419 | #elif defined(__aarch64) || defined(__aarch64__) || defined(_M_ARM64)
 420 | #define JEBP__ARCH_ARM
 421 | #define JEBP__SIMD_NEON
 422 | #define JEBP__SIMD_NEON64
 423 | #endif
 424 | 
 425 | #if defined(JEBP__ARCH_X86)
 426 | // x86 is always little-endian
 427 | #define JEBP__LITTLE_ENDIAN
 428 | #elif defined(JEBP__ARCH_ARM) && defined(__ARM_BIG_ENDIAN)
 429 | // The ACLE big-endian define overrules every other check: stay undefined
 430 | #elif defined(JEBP__ARCH_ARM) && (defined(__ARM_ACLE) || defined(_MSC_VER))
 431 | // ACLE without big-endian must be little-endian; MSVC only supports little
 432 | #define JEBP__LITTLE_ENDIAN
 433 | #elif defined(__LITTLE_ENDIAN__) ||                                            \
 434 |     (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
 435 | // defined() guard: undefined macros are 0 in #if, so 0 == 0 would match
 436 | #define JEBP__LITTLE_ENDIAN
 437 | #endif
 438 | 
 439 | #ifdef JEBP_NO_SIMD // user opted out: forget whatever was detected
 440 | #undef JEBP__SIMD_SSE2
 441 | #undef JEBP__SIMD_NEON // NOTE(review): JEBP__SIMD_NEON64 stays set - confirm
 442 | #endif // JEBP_NO_SIMD
 443 | #ifdef JEBP__SIMD_SSE2
 444 | #include <emmintrin.h>
 445 | #endif // JEBP__SIMD_SSE2
 446 | #ifdef JEBP__SIMD_NEON
 447 | #include <arm_neon.h>
 448 | #endif // JEBP__SIMD_NEON
 449 | 
 450 | /**
 451 |  * Common utilities
 452 |  */
 453 | // TODO: Maybe we should have a logging flag and add custom logs with more
 454 | //       information to each error (and maybe other stuff like allocations)
 455 | #define JEBP__MIN(a, b) ((a) < (b) ? (a) : (b)) // arguments evaluated twice
 456 | #define JEBP__MAX(a, b) ((a) > (b) ? (a) : (b)) // arguments evaluated twice
 457 | #define JEBP__ABS(a) ((a) < 0 ? -(a) : (a))
 458 | #define JEBP__AVG(a, b) (((a) + (b)) / 2)
 459 | #define JEBP__CEIL_SHIFT(a, b) (((a) + (1 << (b)) - 1) >> (b)) // ceil(a / 2^b)
 460 | #define JEBP__CLAMP(x, min, max) JEBP__MIN(JEBP__MAX(x, min), max)
 461 | #define JEBP__CLAMP_UBYTE(x) JEBP__CLAMP(x, 0, 255)
 462 | #define JEBP__CLEAR(ptr, size) memset(ptr, 0, size) // zero-fill `size` bytes
 463 | 
 464 | // A simple utility that updates an error pointer if it currently does not have
 465 | // an error
 466 | JEBP__INLINE jebp_error_t jebp__error(jebp_error_t *err, jebp_error_t error) {
 467 |     if (*err == JEBP_OK) {
 468 |         *err = error;
 469 |     }
 470 |     return *err;
 471 | }
 472 | 
 473 | // Allocates `pixels` using overflow-checked size_t math (VP8L-only for now)
 474 | // TODO: after VP8(no-L) support is added, make it an error to remove both
 475 | //       VP8 and VP8L
 476 | #ifndef JEBP_NO_VP8L
 477 | static jebp_error_t jebp__alloc_image(jebp_image_t *image) {
 478 |     size_t w = (size_t)image->width, h = (size_t)image->height;
 479 |     image->pixels = NULL; // stays NULL if the byte count would wrap around
 480 |     if (h == 0 || w <= (size_t)-1 / sizeof(jebp_color_t) / h) {
 481 |         image->pixels = JEBP_ALLOC(w * h * sizeof(jebp_color_t));
 482 |     }
 483 |     return image->pixels == NULL ? JEBP_ERROR_NOMEM : JEBP_OK;
 484 | }
 485 | #endif // JEBP_NO_VP8L
 486 | 
 487 | /**
 488 |  * Reader abstraction
 489 |  */
 490 | #define JEBP__BUFFER_SIZE 4096 // size in bytes of the file read buffer
 491 | 
 492 | typedef struct jebp__reader_t {
 493 |     size_t nb_bytes;         // number of unread bytes available at `bytes`
 494 |     const jebp_ubyte *bytes; // next unread byte
 495 | #ifndef JEBP_NO_STDIO
 496 |     FILE *file;   // source file, or NULL when reading from memory
 497 |     void *buffer; // JEBP__BUFFER_SIZE-byte block that file reads refill
 498 | #endif // JEBP_NO_STDIO
 499 | } jebp__reader_t;
 500 | 
 501 | static void jebp__init_memory(jebp__reader_t *reader, size_t size,
 502 |                               const void *data) {
 503 | #ifndef JEBP_NO_STDIO
 504 |     reader->file = NULL; // no file: this reader is backed by memory only
 505 | #endif // JEBP_NO_STDIO
 506 |     reader->bytes = (const jebp_ubyte *)data;
 507 |     reader->nb_bytes = size;
 508 | }
 509 | 
 510 | #ifndef JEBP_NO_STDIO
 511 | static jebp_error_t jebp__open_file(jebp__reader_t *reader, const char *path) {
 512 |     // Start with nothing buffered so the first read refills from the file
 513 |     reader->nb_bytes = 0;
 514 |     if ((reader->file = fopen(path, "rb")) == NULL) {
 515 |         return JEBP_ERROR_IO;
 516 |     }
 517 |     if ((reader->buffer = JEBP_ALLOC(JEBP__BUFFER_SIZE)) != NULL) {
 518 |         return JEBP_OK;
 519 |     }
 520 |     // No memory for the read buffer: release the file opened above
 521 |     fclose(reader->file);
 522 |     return JEBP_ERROR_NOMEM;
 523 | }
 524 | 
 525 | static void jebp__close_file(jebp__reader_t *reader) {
 526 |     JEBP_FREE(reader->buffer); // the read buffer of a file-backed reader
 527 |     fclose(reader->file);      // the result of fclose is ignored
 528 | }
 529 | #endif // JEBP_NO_STDIO
 530 | 
 531 | // Ensures at least one unread byte is buffered, refilling from the file if any
 532 | static jebp_error_t jebp__buffer_bytes(jebp__reader_t *reader) {
 533 |     if (reader->nb_bytes != 0) {
 534 |         return JEBP_OK; // unread bytes remain, nothing to refill
 535 |     }
 536 | #ifndef JEBP_NO_STDIO
 537 |     FILE *file = reader->file;
 538 |     if (file != NULL) {
 539 |         // File-backed reader: load the next block into the read buffer
 540 |         reader->bytes = reader->buffer;
 541 |         reader->nb_bytes = fread(reader->buffer, 1, JEBP__BUFFER_SIZE, file);
 542 |         if (ferror(file)) {
 543 |             return JEBP_ERROR_IO;
 544 |         }
 545 |     }
 546 | #endif // JEBP_NO_STDIO
 547 |     // Still empty: a memory reader ran out, or the file had nothing more
 548 |     return reader->nb_bytes == 0 ? JEBP_ERROR_EOF : JEBP_OK;
 549 | }
 550 | 
 551 | // Copies the next `size` bytes to `data`; a NULL `data` just skips them
 552 | // TODO: Most reads are only a few bytes so maybe I should optimize for that
 553 | static jebp_error_t jebp__read_bytes(jebp__reader_t *reader, size_t size,
 554 |                                      void *data) {
 555 |     jebp_ubyte *out = data;
 556 |     while (size > 0) {
 557 |         jebp_error_t err = jebp__buffer_bytes(reader);
 558 |         if (err != JEBP_OK) {
 559 |             return err;
 560 |         }
 561 |         size_t chunk = size < reader->nb_bytes ? size : reader->nb_bytes;
 562 |         if (out != NULL) {
 563 |             out = (jebp_ubyte *)memcpy(out, reader->bytes, chunk) + chunk;
 564 |         }
 565 |         reader->bytes += chunk;
 566 |         reader->nb_bytes -= chunk;
 567 |         size -= chunk;
 568 |     }
 569 |     return JEBP_OK;
 570 | }
 571 | 
 572 | // 8-bit uint reading is currently only used by the bit-reader
 573 | #ifndef JEBP_NO_VP8L
 574 | static jebp_ubyte jebp__read_uint8(jebp__reader_t *reader, jebp_error_t *err) {
 575 |     // A pending error short-circuits; otherwise make sure a byte is buffered
 576 |     if (*err != JEBP_OK || (*err = jebp__buffer_bytes(reader)) != JEBP_OK) {
 577 |         return 0;
 578 |     }
 579 |     jebp_ubyte value = reader->bytes[0];
 580 |     reader->bytes += 1;
 581 |     reader->nb_bytes -= 1;
 582 |     return value;
 583 | }
 584 | #endif // JEBP_NO_VP8L
 585 | 
 586 | static jebp_uint jebp__read_uint32(jebp__reader_t *reader, jebp_error_t *err) {
 587 |     if (*err != JEBP_OK) {
 588 |         return 0; // an earlier error is pending: leave the reader untouched
 589 |     }
 590 | #ifdef JEBP__LITTLE_ENDIAN
 591 |     jebp_uint value = 0;
 592 |     *err = jebp__read_bytes(reader, 4, &value); // stream order == host order
 593 |     return value;
 594 | #else  // JEBP__LITTLE_ENDIAN
 595 |     jebp_ubyte bytes[4] = {0}; // zeroed so a failed read yields defined values
 596 |     *err = jebp__read_bytes(reader, 4, bytes);
 597 |     return (jebp_uint)bytes[0] | ((jebp_uint)bytes[1] << 8) |
 598 |            ((jebp_uint)bytes[2] << 16) | ((jebp_uint)bytes[3] << 24);
 599 | #endif // JEBP__LITTLE_ENDIAN
 600 | }
 601 | 
 602 | /**
 603 |  * RIFF container
 604 |  */
 605 | #define JEBP__RIFF_TAG 0x46464952 // ASCII "RIFF" read as little-endian
 606 | #define JEBP__WEBP_TAG 0x50424557 // ASCII "WEBP" read as little-endian
 607 | 
 608 | typedef struct jebp__chunk_t {
 609 |     jebp_uint tag;  // 32-bit chunk tag
 610 |     jebp_uint size; // chunk size as read, rounded up to an even number
 611 | } jebp__chunk_t;
 612 | 
 613 | typedef struct jebp__riff_reader_t {
 614 |     jebp__reader_t *reader; // byte source the chunks are read from
 615 |     jebp__chunk_t header;   // RIFF chunk; `size` shrinks as chunks are read
 616 | } jebp__riff_reader_t;
 617 | 
 618 | static jebp_error_t jebp__read_chunk(jebp__riff_reader_t *riff,
 619 |                                      jebp__chunk_t *chunk) {
 620 |     jebp_error_t status = JEBP_OK;
 621 |     chunk->tag = jebp__read_uint32(riff->reader, &status);
 622 |     jebp_uint size = jebp__read_uint32(riff->reader, &status);
 623 |     chunk->size = size + (size & 1); // pad odd sizes up to the next even size
 624 |     return status;
 625 | }
 626 | 
 627 | static jebp_error_t jebp__read_riff_header(jebp__riff_reader_t *riff,
 628 |                                            jebp__reader_t *reader) {
 629 |     riff->reader = reader;
 630 |     jebp_error_t err = jebp__read_chunk(riff, &riff->header);
 631 |     if (err != JEBP_OK) {
 632 |         return err;
 633 |     }
 634 |     if (riff->header.tag != JEBP__RIFF_TAG) {
 635 |         return JEBP_ERROR_INVDATA_HEADER; // outer chunk is not "RIFF"
 636 |     }
 637 |     if (jebp__read_uint32(reader, &err) == JEBP__WEBP_TAG) {
 638 |         return err;
 639 |     }
 640 |     return jebp__error(&err, JEBP_ERROR_INVDATA_HEADER); // read errors win
 641 | }
 642 | 
 643 | static jebp_error_t jebp__read_riff_chunk(jebp__riff_reader_t *riff,
 644 |                                           jebp__chunk_t *chunk) {
 645 |     jebp_error_t err = jebp__read_chunk(riff, chunk);
 646 |     if (err == JEBP_OK) {
 647 |         if (chunk->size <= riff->header.size) {
 648 |             riff->header.size -= chunk->size; // charge it to the RIFF size
 649 |         } else {
 650 |             err = JEBP_ERROR_INVDATA; // chunk exceeds what RIFF has left
 651 |         }
 652 |     }
 653 |     return err;
 654 | }
 655 | 
 656 | /**
 657 |  * Bit reader
 658 |  */
 659 | #ifndef JEBP_NO_VP8L
 660 | typedef struct jebp__bit_reader_t {
 661 |     jebp__reader_t *reader; // byte source the bits are pulled from
 662 |     size_t nb_bytes;        // bytes this bit reader may still take from it
 663 |     jebp_int nb_bits;       // number of buffered bits held in `bits`
 664 |     jebp_uint bits;         // buffered bits, the next one in the lowest bit
 665 | } jebp__bit_reader_t;
 666 | 
 667 | static void jepb__init_bit_reader(jebp__bit_reader_t *bits,
 668 |                                   jebp__reader_t *reader, size_t size) {
 669 |     bits->bits = 0; // the bit buffer starts out empty
 670 |     bits->nb_bits = 0;
 671 |     bits->nb_bytes = size; // byte budget this bit reader may consume
 672 |     bits->reader = reader;
 673 | }
 674 | 
 675 | // buffer/peek/skip should be used together to optimize bit-reading
 676 | static jebp_error_t jebp__buffer_bits(jebp__bit_reader_t *bits, jebp_int size) {
 677 |     jebp_error_t err = JEBP_OK;
 678 |     for (; bits->nb_bits < size && bits->nb_bytes > 0; bits->nb_bytes -= 1) {
 679 |         jebp_uint byte = jebp__read_uint8(bits->reader, &err); // not int
 680 |         bits->bits |= byte << bits->nb_bits; // unsigned, so no shift UB
 681 |         bits->nb_bits += 8;
 682 |     }
 683 |     return err;
 684 | }
 685 | 
 686 | JEBP__INLINE jebp_int jepb__peek_bits(jebp__bit_reader_t *bits, jebp_int size) {
 687 |     return bits->bits & (((jebp_uint)1 << size) - 1); // unsigned low-bit mask
 688 | }
 689 | 
 690 | JEBP__INLINE jebp_error_t jebp__skip_bits(jebp__bit_reader_t *bits,
 691 |                                           jebp_int size) {
 692 |     if (bits->nb_bits >= size) {
 693 |         bits->bits >>= size; // discard the consumed low bits
 694 |         bits->nb_bits -= size;
 695 |         return JEBP_OK;
 696 |     }
 697 |     return JEBP_ERROR_INVDATA; // fewer bits are buffered than requested
 698 | }
 699 | 
 700 | static jebp_uint jebp__read_bits(jebp__bit_reader_t *bits, jebp_int size,
 701 |                                  jebp_error_t *err) {
 702 |     // Do nothing once an error is pending; otherwise top up the bit buffer
 703 |     if (*err != JEBP_OK || (*err = jebp__buffer_bits(bits, size)) != JEBP_OK) {
 704 |         return 0;
 705 |     }
 706 |     // Peek first: skipping shifts the requested bits out of the buffer
 707 |     jebp_uint result = jepb__peek_bits(bits, size);
 708 |     *err = jebp__skip_bits(bits, size);
 709 |     if (*err != JEBP_OK) {
 710 |         result = 0;
 711 |     }
 712 |     return result;
 713 | }
 714 | 
 715 | /**
 716 |  * Huffman coding
 717 |  */
 718 | #define JEBP__MAX_HUFFMAN_LENGTH 15 // longest code length handled, in bits
 719 | #define JEBP__MAX_PRIMARY_LENGTH 8  // bits used to index the primary table
 720 | #define JEBP__MAX_SECONDARY_LENGTH                                             \
 721 |     (JEBP__MAX_HUFFMAN_LENGTH - JEBP__MAX_PRIMARY_LENGTH)
 722 | #define JEBP__NB_PRIMARY_HUFFMANS (1 << JEBP__MAX_PRIMARY_LENGTH) // 256
 723 | #define JEBP__NO_HUFFMAN_SYMBOL 0xffff // marks a table entry with no code
 724 | 
 725 | #define JEBP__NB_META_SYMBOLS 19
 726 | #define JEBP__NB_COLOR_SYMBOLS 256
 727 | #define JEBP__NB_LENGTH_SYMBOLS 24
 728 | #define JEBP__NB_DIST_SYMBOLS 40
 729 | #define JEBP__NB_MAIN_SYMBOLS (JEBP__NB_COLOR_SYMBOLS + JEBP__NB_LENGTH_SYMBOLS)
 730 | 
 731 | // The huffman decoding is done in one or two steps, both using a lookup table.
 732 | // These tables are called the "primary" table and "secondary" tables. First
 733 | // 8-bits are peeked from the stream to index the primary table. If the symbol
 734 | // is in this table (indicated by length <= 8) then the symbol from that is used
 735 | // and the length is used to skip that many bits. Codes which are smaller than
 736 | // 8-bits are represented by filling the table such that any index with a prefix
 737 | // of the given code will have the same entry. If the symbol requires more bits
 738 | // (indicated by length > 8) then the symbol is used as an offset pointing to
 739 | // the secondary table which has an index size of (length - 8) bits.
 740 | typedef struct jebp__huffman_t {
 741 |     // <= 8: length is the number of bits actually used, and symbol is the
 742 |     //       decoded symbol or `JEBP__NO_HUFFMAN_SYMBOL` for an invalid code.
 743 |     // >  8: length is the maximum number of bits for any code with this prefix,
 744 |     //       and symbol is the offset in the array to the secondary table.
 745 |     jebp_short length;
 746 |     jebp_ushort symbol;
 747 | } jebp__huffman_t;
 748 | 
 749 | typedef struct jebp__huffman_group_t { // one table pointer per symbol kind
 750 |     jebp__huffman_t *main;  // presumably JEBP__NB_MAIN_SYMBOLS codes - confirm
 751 |     jebp__huffman_t *red;
 752 |     jebp__huffman_t *blue;
 753 |     jebp__huffman_t *alpha;
 754 |     jebp__huffman_t *dist;  // presumably JEBP__NB_DIST_SYMBOLS codes - confirm
 755 | } jebp__huffman_group_t;
 756 | 
 757 | static const jebp_byte jebp__meta_length_order[JEBP__NB_META_SYMBOLS];
 758 | 
 759 | // Adds one to a bit-reversed code of `length` bits (carry moves from the top
 760 | // bit downwards); returns 1 without changing `*code` on overflow, else 0
 761 | JEBP__INLINE jebp_int jebp__increment_code(jebp_int *code, jebp_int length) {
 762 |     jebp_int bit = 1 << (length - 1);
 763 |     while ((*code & bit) != 0) {
 764 |         bit >>= 1;
 765 |     }
 766 |     if (bit != 0) {
 767 |         *code = (*code & (bit - 1)) + bit;
 768 |     }
 769 |     return bit == 0;
 770 | }
 771 | 
 772 | // This function is a bit confusing so I have attempted to document it well
 773 | static jebp_error_t jebp__alloc_huffman(jebp__huffman_t **huffmans,
 774 |                                         jebp_int nb_lengths,
 775 |                                         const jebp_byte *lengths) {
 776 |     // Stack allocate the primary table and set it all to invalid values
 777 |     jebp__huffman_t primary[JEBP__NB_PRIMARY_HUFFMANS];
 778 |     for (jebp_int i = 0; i < JEBP__NB_PRIMARY_HUFFMANS; i += 1) {
 779 |         primary[i].symbol = JEBP__NO_HUFFMAN_SYMBOL;
 780 |     }
 781 | 
 782 |     // Fill in the 8-bit codes in the primary table
 783 |     jebp_int len = 1;
 784 |     jebp_int code = 0;
 785 |     jebp_int overflow = 0;
 786 |     jebp_ushort symbol = JEBP__NO_HUFFMAN_SYMBOL;
 787 |     jebp_int nb_symbols = 0;
 788 |     for (; len <= JEBP__MAX_PRIMARY_LENGTH; len += 1) {
 789 |         for (jebp_int i = 0; i < nb_lengths; i += 1) {
 790 |             if (lengths[i] != len) {
 791 |                 continue;
 792 |             }
 793 |             if (overflow) {
 794 |                 // Fail now if the last increment overflowed
 795 |                 return JEBP_ERROR_INVDATA;
 796 |             }
 797 |             for (jebp_int c = code; c < JEBP__NB_PRIMARY_HUFFMANS;
 798 |                  c += 1 << len) {
 799 |                 primary[c].length = len;
 800 |                 primary[c].symbol = i;
 801 |             }
 802 |             overflow = jebp__increment_code(&code, len);
 803 |             symbol = i;
 804 |             nb_symbols += 1;
 805 |         }
 806 |     }
 807 | 
 808 |     // Fill in the secondary table lengths in the primary table
 809 |     jebp_int secondary_code = code;
 810 |     for (; len <= JEBP__MAX_HUFFMAN_LENGTH; len += 1) {
 811 |         for (jebp_int i = 0; i < nb_lengths; i += 1) {
 812 |             if (lengths[i] != len) {
 813 |                 continue;
 814 |             }
 815 |             if (overflow) {
 816 |                 return JEBP_ERROR_INVDATA;
 817 |             }
 818 |             jebp_int prefix = code & (JEBP__NB_PRIMARY_HUFFMANS - 1);
 819 |             primary[prefix].length = len;
 820 |             overflow = jebp__increment_code(&code, len);
 821 |             symbol = i;
 822 |             nb_symbols += 1;
 823 |         }
 824 |     }
 825 | 
 826 |     // Calculate the total no. of huffman entries and fill in the secondary
 827 |     // table offsets
 828 |     jebp_int nb_huffmans = JEBP__NB_PRIMARY_HUFFMANS;
 829 |     for (jebp_int i = 0; i < JEBP__NB_PRIMARY_HUFFMANS; i += 1) {
 830 |         if (nb_symbols <= 1) {
 831 |             // Special case: if there is only one symbol, use this iteration to
 832 |             //               instead fill the primary table with 0-length
 833 |             //               entries
 834 |             primary[i].length = 0;
 835 |             primary[i].symbol = symbol;
 836 |             continue;
 837 |         }
 838 |         jebp_int suffix_length = primary[i].length - JEBP__MAX_PRIMARY_LENGTH;
 839 |         if (suffix_length > 0) {
 840 |             primary[i].symbol = nb_huffmans;
 841 |             nb_huffmans += 1 << suffix_length;
 842 |         }
 843 |     }
 844 | 
 845 |     // Allocate, copy over the primary table, and assign the rest to invalid
 846 |     // values
 847 |     *huffmans = JEBP_ALLOC(nb_huffmans * sizeof(jebp__huffman_t));
 848 |     if (*huffmans == NULL) {
 849 |         return JEBP_ERROR_NOMEM;
 850 |     }
 851 |     memcpy(*huffmans, primary, sizeof(primary));
 852 |     if (nb_huffmans == JEBP__NB_PRIMARY_HUFFMANS) {
 853 |         // Special case: we can stop here if we don't have to fill any secondary
 854 |         //               tables
 855 |         return JEBP_OK;
 856 |     }
 857 |     for (jebp_int i = JEBP__NB_PRIMARY_HUFFMANS; i < nb_huffmans; i += 1) {
 858 |         (*huffmans)[i].symbol = JEBP__NO_HUFFMAN_SYMBOL;
 859 |     }
 860 | 
 861 |     // Fill in the secondary tables
 862 |     len = JEBP__MAX_PRIMARY_LENGTH + 1;
 863 |     code = secondary_code;
 864 |     for (; len <= JEBP__MAX_HUFFMAN_LENGTH; len += 1) {
 865 |         for (jebp_int i = 0; i < nb_lengths; i += 1) {
 866 |             if (lengths[i] != len) {
 867 |                 continue;
 868 |             }
 869 |             jebp_int prefix = code & (JEBP__NB_PRIMARY_HUFFMANS - 1);
 870 |             jebp_int nb_secondary_huffmans = 1 << primary[prefix].length;
 871 |             jebp__huffman_t *secondary = *huffmans + primary[prefix].symbol;
 872 |             for (jebp_int c = code; c < nb_secondary_huffmans; c += 1 << len) {
 873 |                 secondary[c >> JEBP__MAX_PRIMARY_LENGTH].length = len;
 874 |                 secondary[c >> JEBP__MAX_PRIMARY_LENGTH].symbol = i;
 875 |             }
 876 |             jebp__increment_code(&code, len);
 877 |         }
 878 |     }
 879 |     return JEBP_OK;
 880 | }
 881 | 
 882 | static jebp_int jebp__read_symbol(jebp__huffman_t *huffmans,
 883 |                                   jebp__bit_reader_t *bits, jebp_error_t *err) {
 884 |     if (*err != JEBP_OK) {
 885 |         return 0;
 886 |     }
 887 |     if ((*err = jebp__buffer_bits(bits, JEBP__MAX_HUFFMAN_LENGTH)) != JEBP_OK) {
 888 |         return 0;
 889 |     }
 890 |     jebp_int code = jepb__peek_bits(bits, JEBP__MAX_PRIMARY_LENGTH);
 891 |     if (huffmans[code].symbol == JEBP__NO_HUFFMAN_SYMBOL) {
 892 |         *err = JEBP_ERROR_INVDATA;
 893 |         return 0;
 894 |     }
 895 |     jebp_int length = huffmans[code].length;
 896 |     jebp_int skip = JEBP__MIN(length, JEBP__MAX_PRIMARY_LENGTH);
 897 |     if ((*err = jebp__skip_bits(bits, skip)) != JEBP_OK) {
 898 |         return 0;
 899 |     }
 900 |     if (skip == length) {
 901 |         return huffmans[code].symbol;
 902 |     }
 903 | 
 904 |     huffmans += huffmans[code].symbol;
 905 |     code = jepb__peek_bits(bits, length - skip);
 906 |     if (huffmans[code].symbol == JEBP__NO_HUFFMAN_SYMBOL) {
 907 |         *err = JEBP_ERROR_INVDATA;
 908 |         return 0;
 909 |     }
 910 |     if ((*err = jebp__skip_bits(bits, huffmans[code].length - skip)) !=
 911 |         JEBP_OK) {
 912 |         return 0;
 913 |     }
 914 |     return huffmans[code].symbol;
 915 | }
 916 | 
 917 | static jebp_error_t jebp__read_huffman(jebp__huffman_t **huffmans,
 918 |                                        jebp__bit_reader_t *bits,
 919 |                                        jebp_int nb_lengths,
 920 |                                        jebp_byte *lengths) {
 921 |     // This part of the spec is INCREDIBLY wrong and partly missing
 922 |     jebp_error_t err = JEBP_OK;
 923 |     JEBP__CLEAR(lengths, nb_lengths);
 924 | 
 925 |     if (jebp__read_bits(bits, 1, &err)) {
 926 |         // simple length storage with only 1 (first) or 2 (second) symbols, both
 927 |         // with a length of 1
 928 |         jebp_int has_second = jebp__read_bits(bits, 1, &err);
 929 |         jebp_int first_bits = jebp__read_bits(bits, 1, &err) ? 8 : 1;
 930 |         jebp_int first = jebp__read_bits(bits, first_bits, &err);
 931 |         if (first >= nb_lengths) {
 932 |             return jebp__error(&err, JEBP_ERROR_INVDATA);
 933 |         }
 934 |         lengths[first] = 1;
 935 |         if (has_second) {
 936 |             jebp_int second = jebp__read_bits(bits, 8, &err);
 937 |             if (second >= nb_lengths) {
 938 |                 return jebp__error(&err, JEBP_ERROR_INVDATA);
 939 |             }
 940 |             lengths[second] = 1;
 941 |         }
 942 | 
 943 |     } else {
 944 |         jebp_byte meta_lengths[JEBP__NB_META_SYMBOLS] = {0};
 945 |         jebp_int nb_meta_lengths = jebp__read_bits(bits, 4, &err) + 4;
 946 |         for (jebp_int i = 0; i < nb_meta_lengths; i += 1) {
 947 |             meta_lengths[jebp__meta_length_order[i]] =
 948 |                 jebp__read_bits(bits, 3, &err);
 949 |         }
 950 |         if (err != JEBP_OK) {
 951 |             return err;
 952 |         }
 953 |         jebp__huffman_t *meta_huffmans;
 954 |         if ((err = jebp__alloc_huffman(&meta_huffmans, JEBP__NB_META_SYMBOLS,
 955 |                                        meta_lengths)) != JEBP_OK) {
 956 |             return err;
 957 |         }
 958 | 
 959 |         jebp_int nb_meta_symbols = nb_lengths;
 960 |         if (jebp__read_bits(bits, 1, &err)) {
 961 |             // limit codes
 962 |             jebp_int symbols_bits = jebp__read_bits(bits, 3, &err) * 2 + 2;
 963 |             nb_meta_symbols = jebp__read_bits(bits, symbols_bits, &err) + 2;
 964 |         }
 965 | 
 966 |         jebp_int prev_length = 8;
 967 |         for (jebp_int i = 0; i < nb_lengths && nb_meta_symbols > 0;
 968 |              nb_meta_symbols -= 1) {
 969 |             jebp_int symbol = jebp__read_symbol(meta_huffmans, bits, &err);
 970 |             jebp_int length;
 971 |             jebp_int repeat;
 972 |             switch (symbol) {
 973 |             case 16:
 974 |                 length = prev_length;
 975 |                 repeat = jebp__read_bits(bits, 2, &err) + 3;
 976 |                 break;
 977 |             case 17:
 978 |                 length = 0;
 979 |                 repeat = jebp__read_bits(bits, 3, &err) + 3;
 980 |                 break;
 981 |             case 18:
 982 |                 length = 0;
 983 |                 repeat = jebp__read_bits(bits, 7, &err) + 11;
 984 |                 break;
 985 |             default:
 986 |                 prev_length = symbol;
 987 |                 /* fallthrough */
 988 |             case 0:
 989 |                 // We don't ever repeat 0 values.
 990 |                 lengths[i++] = symbol;
 991 |                 continue;
 992 |             }
 993 |             if (i + repeat > nb_lengths) {
 994 |                 jebp__error(&err, JEBP_ERROR_INVDATA);
 995 |                 break;
 996 |             }
 997 |             for (jebp_int j = 0; j < repeat; j += 1) {
 998 |                 lengths[i++] = length;
 999 |             }
1000 |         }
1001 |         JEBP_FREE(meta_huffmans);
1002 |     }
1003 | 
1004 |     if (err != JEBP_OK) {
1005 |         return err;
1006 |     }
1007 |     return jebp__alloc_huffman(huffmans, nb_lengths, lengths);
1008 | }
1009 | 
1010 | static jebp_error_t jebp__read_huffman_group(jebp__huffman_group_t *group,
1011 |                                              jebp__bit_reader_t *bits,
1012 |                                              jebp_int nb_main_symbols,
1013 |                                              jebp_byte *lengths) {
1014 |     jebp_error_t err;
1015 |     if ((err = jebp__read_huffman(&group->main, bits, nb_main_symbols,
1016 |                                   lengths)) != JEBP_OK) {
1017 |         return err;
1018 |     }
1019 |     if ((err = jebp__read_huffman(&group->red, bits, JEBP__NB_COLOR_SYMBOLS,
1020 |                                   lengths)) != JEBP_OK) {
1021 |         return err;
1022 |     }
1023 |     if ((err = jebp__read_huffman(&group->blue, bits, JEBP__NB_COLOR_SYMBOLS,
1024 |                                   lengths)) != JEBP_OK) {
1025 |         return err;
1026 |     }
1027 |     if ((err = jebp__read_huffman(&group->alpha, bits, JEBP__NB_COLOR_SYMBOLS,
1028 |                                   lengths)) != JEBP_OK) {
1029 |         return err;
1030 |     }
1031 |     if ((err = jebp__read_huffman(&group->dist, bits, JEBP__NB_DIST_SYMBOLS,
1032 |                                   lengths)) != JEBP_OK) {
1033 |         return err;
1034 |     }
1035 |     return JEBP_OK;
1036 | }
1037 | 
1038 | static void jebp__free_huffman_group(jebp__huffman_group_t *group) {
1039 |     JEBP_FREE(group->main);
1040 |     JEBP_FREE(group->red);
1041 |     JEBP_FREE(group->blue);
1042 |     JEBP_FREE(group->alpha);
1043 |     JEBP_FREE(group->dist);
1044 | }
1045 | 
1046 | /**
1047 |  * Color cache
1048 |  */
// Cache of recently decoded colors, indexed by a hash of the color.
typedef struct jebp__colcache_t {
    // Number of bits of the hash that are used as index, or 0 if there is no
    // color cache. `jebp__read_colcache` only accepts values from 1 to 11.
    jebp_int bits;
    // Array of `1 << bits` colors; only allocated when `bits` is not 0
    jebp_color_t *colors;
} jebp__colcache_t;
1053 | 
1054 | static jebp_error_t jebp__read_colcache(jebp__colcache_t *colcache,
1055 |                                         jebp__bit_reader_t *bits) {
1056 |     jebp_error_t err = JEBP_OK;
1057 |     if (!jebp__read_bits(bits, 1, &err)) {
1058 |         // no color cache
1059 |         colcache->bits = 0;
1060 |         return err;
1061 |     }
1062 |     colcache->bits = jebp__read_bits(bits, 4, &err);
1063 |     if (err != JEBP_OK || colcache->bits < 1 || colcache->bits > 11) {
1064 |         return jebp__error(&err, JEBP_ERROR_INVDATA);
1065 |     }
1066 | 
1067 |     size_t colcache_size = ((size_t)1 << colcache->bits) * sizeof(jebp_color_t);
1068 |     colcache->colors = JEBP_ALLOC(colcache_size);
1069 |     if (colcache->colors == NULL) {
1070 |         return JEBP_ERROR_NOMEM;
1071 |     }
1072 |     JEBP__CLEAR(colcache->colors, colcache_size);
1073 |     return JEBP_OK;
1074 | }
1075 | 
1076 | static void jebp__free_colcache(jebp__colcache_t *colcache) {
1077 |     if (colcache->bits > 0) {
1078 |         JEBP_FREE(colcache->colors);
1079 |     }
1080 | }
1081 | 
1082 | static void jebp__colcache_insert(jebp__colcache_t *colcache,
1083 |                                   jebp_color_t *color) {
1084 |     if (colcache->bits == 0) {
1085 |         return;
1086 |     }
1087 | #if defined(JEBP__LITTLE_ENDIAN) && defined(JEBP__SWAP32)
1088 |     jebp_uint hash = *(jebp_uint *)color; // ABGR due to little-endian
1089 |     hash = JEBP__SWAP32(hash);            // RGBA
1090 |     hash = (hash >> 8) | (hash << 24);    // ARGB
1091 | #else
1092 |     jebp_uint hash = ((jebp_uint)color->a << 24) | ((jebp_uint)color->r << 16) |
1093 |                      ((jebp_uint)color->g << 8) | (jebp_uint)color->b;
1094 | #endif
1095 |     hash = (0x1e35a7bd * hash) >> (32 - colcache->bits);
1096 |     colcache->colors[hash] = *color;
1097 | }
1098 | 
1099 | /**
1100 |  * VP8L image
1101 |  */
1102 | #define JEBP__NB_VP8L_OFFSETS 120
1103 | 
// A lower-resolution image in which every pixel describes a square block of
// the main image. `jebp__read_subimage` passes this struct to
// `jebp__read_vp8l_image` through a cast to `jebp_image_t *`, so the leading
// fields presumably have to match the layout of `jebp_image_t` - confirm
// before reordering them.
typedef struct jebp__subimage_t {
    jebp_int width;
    jebp_int height;
    jebp_color_t *pixels;
    // Main-image coordinates are shifted right by this many bits to get the
    // coordinates of the matching sub-image pixel
    jebp_int block_bits;
} jebp__subimage_t;
1110 | 
// (x, y) offset pairs used for the short back-reference distance codes, see
// `jebp__read_vp8l_image`. Declared without an initializer so that it can be
// used below; presumably the values are given further down in the file -
// confirm.
static const jebp_byte jebp__vp8l_offsets[JEBP__NB_VP8L_OFFSETS][2];
1112 | 
1113 | JEBP__INLINE jebp_int jebp__read_vp8l_extrabits(jebp__bit_reader_t *bits,
1114 |                                                 jebp_int symbol,
1115 |                                                 jebp_error_t *err) {
1116 |     if (*err != JEBP_OK) {
1117 |         return 1;
1118 |     }
1119 |     if (symbol < 4) {
1120 |         return symbol + 1;
1121 |     }
1122 |     jebp_int extrabits = symbol / 2 - 1;
1123 |     symbol = ((symbol % 2 + 2) << extrabits) + 1;
1124 |     return symbol + jebp__read_bits(bits, extrabits, err);
1125 | }
1126 | 
1127 | static jebp_error_t jebp__read_vp8l_image(jebp_image_t *image,
1128 |                                           jebp__bit_reader_t *bits,
1129 |                                           jebp__colcache_t *colcache,
1130 |                                           jebp__subimage_t *huffman_image) {
1131 |     jebp_error_t err;
1132 |     jebp_int nb_groups = 1;
1133 |     jebp__huffman_group_t *groups = &(jebp__huffman_group_t){0};
1134 |     if (huffman_image != NULL) {
1135 |         for (jebp_int i = 0; i < huffman_image->width * huffman_image->height;
1136 |              i += 1) {
1137 |             jebp_color_t *huffman = &huffman_image->pixels[i];
1138 |             if (huffman->r != 0) {
1139 |                 // Currently only 256 huffman groups are supported
1140 |                 return JEBP_ERROR_NOSUP;
1141 |             }
1142 |             nb_groups = JEBP__MAX(nb_groups, huffman->g + 1);
1143 |             huffman += 1;
1144 |         }
1145 |         if (nb_groups > 1) {
1146 |             groups = JEBP_ALLOC(nb_groups * sizeof(jebp__huffman_group_t));
1147 |             if (groups == NULL) {
1148 |                 return JEBP_ERROR_NOMEM;
1149 |             }
1150 |         }
1151 |     }
1152 | 
1153 |     jebp_int nb_main_symbols = JEBP__NB_MAIN_SYMBOLS;
1154 |     if (colcache->bits > 0) {
1155 |         nb_main_symbols += 1 << colcache->bits;
1156 |     }
1157 |     jebp_byte *lengths = JEBP_ALLOC(nb_main_symbols);
1158 |     if (lengths == NULL) {
1159 |         err = JEBP_ERROR_NOMEM;
1160 |         goto free_groups;
1161 |     }
1162 |     jebp_int nb_read_groups = 0;
1163 |     for (; nb_read_groups < nb_groups; nb_read_groups += 1) {
1164 |         if ((err = jebp__read_huffman_group(&groups[nb_read_groups], bits,
1165 |                                             nb_main_symbols, lengths)) !=
1166 |             JEBP_OK) {
1167 |             break;
1168 |         }
1169 |     }
1170 |     JEBP_FREE(lengths);
1171 |     if (err != JEBP_OK) {
1172 |         goto free_read_groups;
1173 |     }
1174 |     if ((err = jebp__alloc_image(image)) != JEBP_OK) {
1175 |         goto free_read_groups;
1176 |     }
1177 | 
1178 |     jebp_color_t *pixel = image->pixels;
1179 |     jebp_color_t *end = pixel + image->width * image->height;
1180 |     jebp_int x = 0;
1181 |     for (jebp_int y = 0; y < image->height;) {
1182 |         jebp_color_t *huffman_row = NULL;
1183 |         if (huffman_image != NULL) {
1184 |             huffman_row =
1185 |                 &huffman_image->pixels[(y >> huffman_image->block_bits) *
1186 |                                        huffman_image->width];
1187 |         }
1188 |         do {
1189 |             jebp__huffman_group_t *group;
1190 |             if (huffman_image == NULL) {
1191 |                 group = groups;
1192 |             } else {
1193 |                 jebp_color_t *huffman =
1194 |                     &huffman_row[x >> huffman_image->block_bits];
1195 |                 group = &groups[huffman->g];
1196 |             }
1197 | 
1198 |             jebp_int main = jebp__read_symbol(group->main, bits, &err);
1199 |             if (main < JEBP__NB_COLOR_SYMBOLS) {
1200 |                 pixel->g = main;
1201 |                 pixel->r = jebp__read_symbol(group->red, bits, &err);
1202 |                 pixel->b = jebp__read_symbol(group->blue, bits, &err);
1203 |                 pixel->a = jebp__read_symbol(group->alpha, bits, &err);
1204 |                 jebp__colcache_insert(colcache, pixel++);
1205 |                 x += 1;
1206 |             } else if (main >= JEBP__NB_MAIN_SYMBOLS) {
1207 |                 *(pixel++) = colcache->colors[main - JEBP__NB_MAIN_SYMBOLS];
1208 |                 x += 1;
1209 |             } else {
1210 |                 jebp_int length = jebp__read_vp8l_extrabits(
1211 |                     bits, main - JEBP__NB_COLOR_SYMBOLS, &err);
1212 |                 jebp_int dist = jebp__read_symbol(group->dist, bits, &err);
1213 |                 dist = jebp__read_vp8l_extrabits(bits, dist, &err);
1214 |                 if (dist > JEBP__NB_VP8L_OFFSETS) {
1215 |                     dist -= JEBP__NB_VP8L_OFFSETS;
1216 |                 } else {
1217 |                     const jebp_byte *offset = jebp__vp8l_offsets[dist - 1];
1218 |                     dist = offset[1] * image->width + offset[0];
1219 |                     dist = JEBP__MAX(dist, 1);
1220 |                 }
1221 |                 jebp_color_t *repeat = pixel - dist;
1222 |                 if (repeat < image->pixels || pixel + length > end) {
1223 |                     jebp__error(&err, JEBP_ERROR_INVDATA);
1224 |                     break;
1225 |                 }
1226 |                 for (jebp_int i = 0; i < length; i += 1) {
1227 |                     jebp__colcache_insert(colcache, repeat);
1228 |                     *(pixel++) = *(repeat++);
1229 |                 }
1230 |                 x += length;
1231 |             }
1232 |         } while (x < image->width);
1233 |         y += x / image->width;
1234 |         x %= image->width;
1235 |     }
1236 | 
1237 |     if (err != JEBP_OK) {
1238 |         jebp_free_image(image);
1239 |     }
1240 | free_read_groups:
1241 |     for (nb_read_groups -= 1; nb_read_groups >= 0; nb_read_groups -= 1) {
1242 |         jebp__free_huffman_group(&groups[nb_read_groups]);
1243 |     }
1244 | free_groups:
1245 |     if (nb_groups > 1) {
1246 |         JEBP_FREE(groups);
1247 |     }
1248 |     return err;
1249 | }
1250 | 
1251 | static jebp_error_t jebp__read_subimage(jebp__subimage_t *subimage,
1252 |                                         jebp__bit_reader_t *bits,
1253 |                                         jebp_image_t *image) {
1254 |     jebp_error_t err = JEBP_OK;
1255 |     subimage->block_bits = jebp__read_bits(bits, 3, &err) + 2;
1256 |     subimage->width = JEBP__CEIL_SHIFT(image->width, subimage->block_bits);
1257 |     subimage->height = JEBP__CEIL_SHIFT(image->height, subimage->block_bits);
1258 |     if (err != JEBP_OK) {
1259 |         return err;
1260 |     }
1261 |     jebp__colcache_t colcache;
1262 |     if ((err = jebp__read_colcache(&colcache, bits)) != JEBP_OK) {
1263 |         return err;
1264 |     }
1265 |     err =
1266 |         jebp__read_vp8l_image((jebp_image_t *)subimage, bits, &colcache, NULL);
1267 |     jebp__free_colcache(&colcache);
1268 |     return err;
1269 | }
1270 | 
1271 | /**
1272 |  * VP8L predictions
1273 |  */
1274 | #define JEBP__NB_VP8L_PRED_TYPES 14
1275 | 
// Expands `body` four times, each in its own block, with `var` set to 0, 1, 2
// and 3 in turn. `var` is pasted in front of `= N;`, so it can either name an
// existing variable or be a declaration such as `jebp_int i`.
// I don't like the way it formats this
// clang-format off
#define JEBP__UNROLL4(var, body) \
    { var = 0; body } \
    { var = 1; body } \
    { var = 2; body } \
    { var = 3; body }
// clang-format on
1284 | 
// Signature shared by the prediction filters. Each filter adds its prediction
// to the `width` pixels starting at `pixel`, in place. `top` is the pixel that
// the filters treat as being directly above `pixel[0]`; filters also read the
// pixels next to `top` and the pixel before `pixel` (`pixel[-1]`).
typedef void (*jebp__vp8l_pred_t)(jebp_color_t *pixel, jebp_color_t *top,
                                  jebp_int width);
1287 | 
1288 | #ifdef JEBP__SIMD_SSE2
// Four SSE vectors, used to hold the intermediate result of each of the four
// steps of a filter where every pixel depends on the pixel before it. See
// `jebp__sse_flatten_px4` for how they are combined.
typedef struct jebp__m128x4i {
    __m128i v[4];
} jebp__m128x4i;
1292 | 
1293 | JEBP__INLINE __m128i jebp__sse_move_px1(__m128i v_dst, __m128i v_src) {
1294 |     __m128 v_dstf = _mm_castsi128_ps(v_dst);
1295 |     __m128 v_srcf = _mm_castsi128_ps(v_src);
1296 |     __m128 v_movf = _mm_move_ss(v_dstf, v_srcf);
1297 |     return _mm_castps_si128(v_movf);
1298 | }
1299 | 
1300 | JEBP__INLINE __m128i jebp__sse_avg_u8x16(__m128i v1, __m128i v2) {
1301 |     __m128i v_one = _mm_set1_epi8(1);
1302 |     __m128i v_avg = _mm_avg_epu8(v1, v2);
1303 |     // SSE2 `avg` rounds up, we have to check if a round-up occured (one of the
1304 |     // low bits was set but the other wasn't) and subtract 1 if so
1305 |     __m128i v_err = _mm_xor_si128(v1, v2);
1306 |     v_err = _mm_and_si128(v_err, v_one);
1307 |     return _mm_sub_epi8(v_avg, v_err);
1308 | }
1309 | 
1310 | JEBP__INLINE __m128i jebp__sse_avg2_u8x16(__m128i v1, __m128i v2, __m128i v3) {
1311 |     __m128i v_one = _mm_set1_epi8(1);
1312 |     // We can further optimise two avg calls but noting that the error will
1313 |     // propogate
1314 |     __m128i v_avg1 = _mm_avg_epu8(v1, v2);
1315 |     __m128i v_err1 = _mm_xor_si128(v1, v2);
1316 |     __m128i v_avg2 = _mm_avg_epu8(v_avg1, v3);
1317 |     __m128i v_err2 = _mm_xor_si128(v_avg1, v3);
1318 |     v_err2 = _mm_or_si128(v_err1, v_err2);
1319 |     v_err2 = _mm_and_si128(v_err2, v_one);
1320 |     return _mm_sub_epi8(v_avg2, v_err2);
1321 | }
1322 | 
1323 | JEBP__INLINE __m128i jebp__sse_flatten_px4(jebp__m128x4i v_pixel4) {
1324 |     __m128i v_pixello = jebp__sse_move_px1(v_pixel4.v[1], v_pixel4.v[0]);
1325 |     __m128i v_pixel3 = _mm_bsrli_si128(v_pixel4.v[3], 4);
1326 |     __m128i v_pixelhi = _mm_unpackhi_epi32(v_pixel4.v[2], v_pixel3);
1327 |     return _mm_unpacklo_epi64(v_pixello, v_pixelhi);
1328 | }
1329 | 
1330 | // Bit-select and accumulate, used by prediction filters 11-13
1331 | JEBP__INLINE __m128i jebp__sse_bsela_u8x16(__m128i v_acc, __m128i v_mask,
1332 |                                            __m128i v1, __m128i v0) {
1333 |     // This is faster than using and/andnot/or since SSE only supports two
1334 |     // operands so prefers chaining outputs
1335 |     __m128i v_sel = _mm_xor_si128(v0, v1);
1336 |     v_sel = _mm_and_si128(v_sel, v_mask);
1337 |     v_sel = _mm_xor_si128(v_sel, v0);
1338 |     return _mm_add_epi8(v_acc, v_sel);
1339 | }
1340 | #endif // JEBP__SIMD_SSE2
1341 | 
1342 | #ifdef JEBP__SIMD_NEON
// Loads the single pixel at `pixel` and duplicates it into all four 32-bit
// lanes of the result.
JEBP__INLINE uint8x16_t jebp__neon_load_px1(jebp_color_t *pixel) {
    uint8x16_t v_pixel = vreinterpretq_u8_u32(vld1q_dup_u32((uint32_t *)pixel));
#ifndef JEBP__LITTLE_ENDIAN
    // The pixel was loaded as one 32-bit value, so the bytes within each lane
    // are reversed when the target is not little-endian
    // NOTE(review): presumably this restores the in-memory byte order -
    // confirm on a big-endian target
    v_pixel = vrev32q_u8(v_pixel);
#endif // JEBP__LITTLE_ENDIAN
    return v_pixel;
}
1350 | 
// Combines four vectors into one by taking pixel (32-bit lane) `i` from vector
// `v_pixel4.val[i]`.
JEBP__INLINE uint8x16_t jebp__neon_flatten_px4(uint8x16x4_t v_pixel4) {
#ifdef JEBP__SIMD_NEON64
    // Single table lookup over the 64 bytes of the four vectors: bytes 0-3
    // (lane 0 of the first vector), 20-23 (lane 1 of the second), 40-43 (lane
    // 2 of the third) and 60-63 (lane 3 of the fourth)
    uint8x16_t v_table = vcombine_u8(vcreate_u8(0x1716151403020100),
                                     vcreate_u8(0x3f3e3d3c2b2a2928));
    return vqtbl4q_u8(v_pixel4, v_table);
#else  // JEBP__SIMD_NEON64
    // `v_mask1` selects the even lanes (0 and 2) and `v_mask2` selects the
    // lower half (lanes 0 and 1)
    uint8x16_t v_mask1 =
        vcombine_u8(vcreate_u8((uint32_t)-1), vcreate_u8((uint32_t)-1));
    uint8x16_t v_mask2 = vcombine_u8(vcreate_u8((uint64_t)-1), vcreate_u8(0));
    // Lanes 0 and 1 are only correct in `v_pixello`, lanes 2 and 3 only in
    // `v_pixelhi`; the final select keeps the correct half of each
    uint8x16_t v_pixello = vbslq_u8(v_mask1, v_pixel4.val[0], v_pixel4.val[1]);
    uint8x16_t v_pixelhi = vbslq_u8(v_mask1, v_pixel4.val[2], v_pixel4.val[3]);
    return vbslq_u8(v_mask2, v_pixello, v_pixelhi);
#endif // JEBP__SIMD_NEON64
}
1365 | 
1366 | JEBP__INLINE uint32x4_t jebp__neon_sad_px4(uint8x16_t v_pix1,
1367 |                                            uint8x16_t v_pix2) {
1368 |     uint8x16_t v_diff8 = vabdq_u8(v_pix1, v_pix2);
1369 |     uint16x8_t v_diff16 = vpaddlq_u8(v_diff8);
1370 |     return vpaddlq_u16(v_diff16);
1371 | }
1372 | #endif // JEBP__SIMD_NEON
1373 | 
1374 | JEBP__INLINE void jebp__vp8l_pred_black(jebp_color_t *pixel, jebp_int width) {
1375 |     jebp_int x = 0;
1376 | #if defined(JEBP__SIMD_SSE2)
1377 |     __m128i v_black = _mm_set1_epi32((int)0xff000000);
1378 |     for (; x + 4 <= width; x += 4) {
1379 |         __m128i v_pixel = _mm_loadu_si128((__m128i *)&pixel[x]);
1380 |         v_pixel = _mm_add_epi8(v_pixel, v_black);
1381 |         _mm_storeu_si128((__m128i *)&pixel[x], v_pixel);
1382 |     }
1383 | #elif defined(JEBP__SIMD_NEON)
1384 |     uint8x8_t v_black = vdup_n_u8(0xff);
1385 |     for (; x + 8 <= width; x += 8) {
1386 |         uint8x8x4_t v_pixel = vld4_u8((uint8_t *)&pixel[x]);
1387 |         v_pixel.val[3] = vadd_u8(v_pixel.val[3], v_black);
1388 |         vst4_u8((uint8_t *)&pixel[x], v_pixel);
1389 |     }
1390 | #endif
1391 |     for (; x < width; x += 1) {
1392 |         pixel[x].a += 0xff;
1393 |     }
1394 | }
1395 | 
1396 | static void jebp__vp8l_pred0(jebp_color_t *pixel, jebp_color_t *top,
1397 |                              jebp_int width) {
1398 |     (void)top;
1399 |     jebp__vp8l_pred_black(pixel, width);
1400 | }
1401 | 
// Adds the pixel to the left to each of the `width` pixels starting at
// `pixel`, channel by channel and wrapping around. Every pixel uses the
// already updated value of its left neighbour, so this is a running sum that
// starts from `pixel[-1]`, which has to be readable.
JEBP__INLINE void jebp__vp8l_pred_left(jebp_color_t *pixel, jebp_int width) {
    jebp_int x = 0;
#if defined(JEBP__SIMD_SSE2)
    // Holds the pixel to the left of the current block in its lowest lane,
    // with all other lanes zero. Only initialized if the loop below will run.
    __m128i v_left;
    if (width >= 4) {
        v_left = _mm_cvtsi32_si128(*(int *)&pixel[-1]);
    }
    for (; x + 4 <= width; x += 4) {
        __m128i v_pixel = _mm_loadu_si128((__m128i *)&pixel[x]);
        // Add the left pixel to the first pixel of the block
        v_pixel = _mm_add_epi8(v_pixel, v_left);
        // Running sum in two steps: add the block shifted up by one pixel,
        // then add the result shifted up by two pixels
        v_left = _mm_bslli_si128(v_pixel, 4);
        v_pixel = _mm_add_epi8(v_pixel, v_left);
        v_left = _mm_bslli_si128(v_pixel, 8);
        v_pixel = _mm_add_epi8(v_pixel, v_left);
        _mm_storeu_si128((__m128i *)&pixel[x], v_pixel);
        // The last pixel of this block is the left pixel of the next block
        v_left = _mm_bsrli_si128(v_pixel, 12);
    }
#elif defined(JEBP__SIMD_NEON)
    uint8x16_t v_zero = vdupq_n_u8(0);
    // Same scheme as the SSE2 version, using `vext` with a zero vector to
    // shift whole pixels up or down
    uint8x16_t v_left;
    if (width >= 4) {
        v_left = jebp__neon_load_px1(&pixel[-1]);
        // Keep the left pixel in the lowest lane only
        v_left = vextq_u8(v_left, v_zero, 12);
    }
    for (; x + 4 <= width; x += 4) {
        uint8x16_t v_pixel = vld1q_u8((uint8_t *)&pixel[x]);
        v_pixel = vaddq_u8(v_pixel, v_left);
        // Shift up by one pixel and add
        v_left = vextq_u8(v_zero, v_pixel, 12);
        v_pixel = vaddq_u8(v_pixel, v_left);
        // Shift up by two pixels and add
        v_left = vextq_u8(v_zero, v_pixel, 8);
        v_pixel = vaddq_u8(v_pixel, v_left);
        vst1q_u8((uint8_t *)&pixel[x], v_pixel);
        // Move the last pixel down to the lowest lane for the next block
        v_left = vextq_u8(v_pixel, v_zero, 12);
    }
#endif
    // Remaining pixels (or all of them without SIMD)
    for (; x < width; x += 1) {
        pixel[x].r += pixel[x - 1].r;
        pixel[x].g += pixel[x - 1].g;
        pixel[x].b += pixel[x - 1].b;
        pixel[x].a += pixel[x - 1].a;
    }
}
1444 | 
1445 | static void jebp__vp8l_pred1(jebp_color_t *pixel, jebp_color_t *top,
1446 |                              jebp_int width) {
1447 |     (void)top;
1448 |     jebp__vp8l_pred_left(pixel, width);
1449 | }
1450 | 
1451 | JEBP__INLINE void jebp__vp8l_pred_top(jebp_color_t *pixel, jebp_color_t *top,
1452 |                                       jebp_int width) {
1453 |     jebp_int x = 0;
1454 | #if defined(JEBP__SIMD_SSE2)
1455 |     for (; x + 4 <= width; x += 4) {
1456 |         __m128i v_pixel = _mm_loadu_si128((__m128i *)&pixel[x]);
1457 |         __m128i v_top = _mm_loadu_si128((__m128i *)&top[x]);
1458 |         v_pixel = _mm_add_epi8(v_pixel, v_top);
1459 |         _mm_storeu_si128((__m128i *)&pixel[x], v_pixel);
1460 |     }
1461 | #elif defined(JEBP__SIMD_NEON)
1462 |     for (; x + 4 <= width; x += 4) {
1463 |         uint8x16_t v_pixel = vld1q_u8((uint8_t *)&pixel[x]);
1464 |         uint8x16_t v_top = vld1q_u8((uint8_t *)&top[x]);
1465 |         v_pixel = vaddq_u8(v_pixel, v_top);
1466 |         vst1q_u8((uint8_t *)&pixel[x], v_pixel);
1467 |     }
1468 | #endif
1469 |     for (; x < width; x += 1) {
1470 |         pixel[x].r += top[x].r;
1471 |         pixel[x].g += top[x].g;
1472 |         pixel[x].b += top[x].b;
1473 |         pixel[x].a += top[x].a;
1474 |     }
1475 | }
1476 | 
1477 | static void jebp__vp8l_pred2(jebp_color_t *pixel, jebp_color_t *top,
1478 |                              jebp_int width) {
1479 |     jebp__vp8l_pred_top(pixel, top, width);
1480 | }
1481 | 
1482 | static void jebp__vp8l_pred3(jebp_color_t *pixel, jebp_color_t *top,
1483 |                              jebp_int width) {
1484 |     jebp__vp8l_pred_top(pixel, &top[1], width);
1485 | }
1486 | 
1487 | static void jebp__vp8l_pred4(jebp_color_t *pixel, jebp_color_t *top,
1488 |                              jebp_int width) {
1489 |     jebp__vp8l_pred_top(pixel, &top[-1], width);
1490 | }
1491 | 
// Prediction filter 5: the average of the left and top-right pixels, averaged
// again with the top pixel (both averages as computed by `JEBP__AVG`, per
// channel). The left pixel is the already updated `pixel[x - 1]`, so every
// pixel depends on the one before it.
static void jebp__vp8l_pred5(jebp_color_t *pixel, jebp_color_t *top,
                             jebp_int width) {
    jebp_int x = 0;
#if defined(JEBP__SIMD_SSE2)
    // Only initialized if the loop below will run
    __m128i v_left;
    __m128i v_top;
    if (width >= 4) {
        v_left = _mm_cvtsi32_si128(*(int *)&pixel[-1]);
        v_top = _mm_loadu_si128((__m128i *)top);
    }
    for (; x + 4 <= width; x += 4) {
        __m128i v_pixel = _mm_loadu_si128((__m128i *)&pixel[x]);
        // NOTE(review): this loads top[x + 4] to top[x + 7], so those pixels
        // have to be readable even for the last block - confirm with callers
        __m128i v_next = _mm_loadu_si128((__m128i *)&top[x + 4]);
        // Top-right pixels, top[x + 1] to top[x + 4]: replace the first top
        // pixel with the first pixel of the next block, then rotate down by
        // one lane
        __m128i v_tr = jebp__sse_move_px1(v_top, v_next);
        v_tr = _mm_shuffle_epi32(v_tr, _MM_SHUFFLE(0, 3, 2, 1));
        jebp__m128x4i v_pixel4;
        // Each step computes all four lanes but only lane `i` has a valid
        // left pixel. Rotating the result up by one lane hands that pixel to
        // lane `i + 1` of the next step (and to lane 0 for the next block).
        JEBP__UNROLL4(jebp_int i, {
            __m128i v_avg = jebp__sse_avg2_u8x16(v_left, v_tr, v_top);
            v_pixel4.v[i] = _mm_add_epi8(v_pixel, v_avg);
            v_left = _mm_shuffle_epi32(v_pixel4.v[i], _MM_SHUFFLE(2, 1, 0, 3));
        })
        // Collect the valid lane of each step
        v_pixel = jebp__sse_flatten_px4(v_pixel4);
        _mm_storeu_si128((__m128i *)&pixel[x], v_pixel);
        v_top = v_next;
    }
#elif defined(JEBP__SIMD_NEON)
    // Same scheme as the SSE2 version. Only initialized if the loop below
    // will run.
    uint8x16_t v_left;
    uint8x16_t v_top;
    if (width >= 4) {
        v_left = jebp__neon_load_px1(&pixel[-1]);
        v_top = vld1q_u8((uint8_t *)top);
    }
    for (; x + 4 <= width; x += 4) {
        uint8x16_t v_pixel = vld1q_u8((uint8_t *)&pixel[x]);
        // NOTE(review): this loads top[x + 4] to top[x + 7], so those pixels
        // have to be readable even for the last block - confirm with callers
        uint8x16_t v_next = vld1q_u8((uint8_t *)&top[x + 4]);
        // Top-right pixels, top[x + 1] to top[x + 4]
        uint8x16_t v_tr = vextq_u8(v_top, v_next, 4);
        uint8x16x4_t v_pixel4;
        // Lane `i` is the valid one in step `i`; the rotate passes it on as
        // the left pixel of the next lane
        JEBP__UNROLL4(jebp_int i, {
            uint8x16_t v_avg = vhaddq_u8(v_left, v_tr);
            v_avg = vhaddq_u8(v_avg, v_top);
            v_pixel4.val[i] = vaddq_u8(v_pixel, v_avg);
            v_left = vextq_u8(v_pixel4.val[i], v_pixel4.val[i], 12);
        })
        v_pixel = jebp__neon_flatten_px4(v_pixel4);
        vst1q_u8((uint8_t *)&pixel[x], v_pixel);
        v_top = v_next;
    }
#endif
    // Remaining pixels (or all of them without SIMD)
    for (; x < width; x += 1) {
        pixel[x].r +=
            JEBP__AVG(JEBP__AVG(pixel[x - 1].r, top[x + 1].r), top[x].r);
        pixel[x].g +=
            JEBP__AVG(JEBP__AVG(pixel[x - 1].g, top[x + 1].g), top[x].g);
        pixel[x].b +=
            JEBP__AVG(JEBP__AVG(pixel[x - 1].b, top[x + 1].b), top[x].b);
        pixel[x].a +=
            JEBP__AVG(JEBP__AVG(pixel[x - 1].a, top[x + 1].a), top[x].a);
    }
}
1551 | 
1552 | JEBP__INLINE void jebp__vp8l_pred_avgtl(jebp_color_t *pixel, jebp_color_t *top,
1553 |                                         jebp_int width) {
1554 |     jebp_int x = 0;
1555 | #if defined(JEBP__SIMD_SSE2)
1556 |     __m128i v_left;
1557 |     if (width >= 4) {
1558 |         v_left = _mm_cvtsi32_si128(*(int *)&pixel[-1]);
1559 |     }
1560 |     for (; x + 4 <= width; x += 4) {
1561 |         __m128i v_pixel = _mm_loadu_si128((__m128i *)&pixel[x]);
1562 |         __m128i v_top = _mm_loadu_si128((__m128i *)&top[x]);
1563 |         jebp__m128x4i v_pixel4;
1564 |         JEBP__UNROLL4(jebp_int i, {
1565 |             __m128i v_avg = jebp__sse_avg_u8x16(v_left, v_top);
1566 |             v_pixel4.v[i] = _mm_add_epi8(v_pixel, v_avg);
1567 |             v_left = _mm_shuffle_epi32(v_pixel4.v[i], _MM_SHUFFLE(2, 1, 0, 3));
1568 |         })
1569 |         v_pixel = jebp__sse_flatten_px4(v_pixel4);
1570 |         _mm_storeu_si128((__m128i *)&pixel[x], v_pixel);
1571 |     }
1572 | #elif defined(JEBP__SIMD_NEON)
1573 |     uint8x16_t v_left;
1574 |     if (width >= 4) {
1575 |         v_left = jebp__neon_load_px1(&pixel[-1]);
1576 |     }
1577 |     for (; x + 4 <= width; x += 4) {
1578 |         uint8x16_t v_pixel = vld1q_u8((uint8_t *)&pixel[x]);
1579 |         uint8x16_t v_top = vld1q_u8((uint8_t *)&top[x]);
1580 |         uint8x16x4_t v_pixel4;
1581 |         JEBP__UNROLL4(jebp_int i, {
1582 |             uint8x16_t v_avg = vhaddq_u8(v_left, v_top);
1583 |             v_pixel4.val[i] = vaddq_u8(v_pixel, v_avg);
1584 |             v_left = vextq_u8(v_pixel4.val[i], v_pixel4.val[i], 12);
1585 |         })
1586 |         v_pixel = jebp__neon_flatten_px4(v_pixel4);
1587 |         vst1q_u8((uint8_t *)&pixel[x], v_pixel);
1588 |     }
1589 | #endif
1590 |     for (; x < width; x += 1) {
1591 |         pixel[x].r += JEBP__AVG(pixel[x - 1].r, top[x].r);
1592 |         pixel[x].g += JEBP__AVG(pixel[x - 1].g, top[x].g);
1593 |         pixel[x].b += JEBP__AVG(pixel[x - 1].b, top[x].b);
1594 |         pixel[x].a += JEBP__AVG(pixel[x - 1].a, top[x].a);
1595 |     }
1596 | }
1597 | 
1598 | static void jebp__vp8l_pred6(jebp_color_t *pixel, jebp_color_t *top,
1599 |                              jebp_int width) {
1600 |     jebp__vp8l_pred_avgtl(pixel, &top[-1], width);
1601 | }
1602 | 
/**
 * Predictor 7: forwards the row to jebp__vp8l_pred_avgtl with `top` unshifted,
 * so every pixel is predicted from the average of its left neighbour and the
 * pixel directly above it.
 */
1603 | static void jebp__vp8l_pred7(jebp_color_t *pixel, jebp_color_t *top,
1604 |                              jebp_int width) {
1605 |     jebp__vp8l_pred_avgtl(pixel, top, width);
1606 | }
1607 | 
1608 | JEBP__INLINE void jebp__vp8l_pred_avgtr(jebp_color_t *pixel, jebp_color_t *top,
1609 |                                         jebp_int width) {
1610 |     jebp_int x = 0;
1611 | #if defined(JEBP__SIMD_SSE2)
1612 |     __m128i v_top;
1613 |     if (width >= 4) {
1614 |         v_top = _mm_loadu_si128((__m128i *)top);
1615 |     }
1616 |     for (; x + 4 <= width; x += 4) {
1617 |         __m128i v_pixel = _mm_loadu_si128((__m128i *)&pixel[x]);
1618 |         __m128i v_next = _mm_loadu_si128((__m128i *)&top[x + 4]);
1619 |         __m128i v_tr = jebp__sse_move_px1(v_top, v_next);
1620 |         v_tr = _mm_shuffle_epi32(v_tr, _MM_SHUFFLE(0, 3, 2, 1));
1621 |         v_tr = jebp__sse_avg_u8x16(v_top, v_tr);
1622 |         v_pixel = _mm_add_epi8(v_pixel, v_tr);
1623 |         _mm_storeu_si128((__m128i *)&pixel[x], v_pixel);
1624 |         v_top = v_next;
1625 |     }
1626 | #elif defined(JEBP__SIMD_NEON)
1627 |     uint8x16_t v_top;
1628 |     if (width >= 4) {
1629 |         v_top = vld1q_u8((uint8_t *)top);
1630 |     }
1631 |     for (; x + 4 <= width; x += 4) {
1632 |         uint8x16_t v_pixel = vld1q_u8((uint8_t *)&pixel[x]);
1633 |         uint8x16_t v_next = vld1q_u8((uint8_t *)&top[x + 4]);
1634 |         uint8x16_t v_tr = vextq_u8(v_top, v_next, 4);
1635 |         v_tr = vhaddq_u8(v_top, v_tr);
1636 |         v_pixel = vaddq_u8(v_pixel, v_tr);
1637 |         vst1q_u8((uint8_t *)&pixel[x], v_pixel);
1638 |         v_top = v_next;
1639 |     }
1640 | #endif
1641 |     for (; x < width; x += 1) {
1642 |         pixel[x].r += JEBP__AVG(top[x].r, top[x + 1].r);
1643 |         pixel[x].g += JEBP__AVG(top[x].g, top[x + 1].g);
1644 |         pixel[x].b += JEBP__AVG(top[x].b, top[x + 1].b);
1645 |         pixel[x].a += JEBP__AVG(top[x].a, top[x + 1].a);
1646 |     }
1647 | }
1648 | 
1649 | static void jebp__vp8l_pred8(jebp_color_t *pixel, jebp_color_t *top,
1650 |                              jebp_int width) {
1651 |     jebp__vp8l_pred_avgtr(pixel, &top[-1], width);
1652 | }
1653 | 
/**
 * Predictor 9: forwards the row to jebp__vp8l_pred_avgtr with `top` unshifted,
 * so every pixel is predicted from the average of the pixel directly above it
 * and that pixel's right-hand neighbour.
 */
1654 | static void jebp__vp8l_pred9(jebp_color_t *pixel, jebp_color_t *top,
1655 |                              jebp_int width) {
1656 |     jebp__vp8l_pred_avgtr(pixel, top, width);
1657 | }
1658 | 
/**
 * Predictor 10: for every x in [0, width) adds
 *   avg(avg(pixel[x - 1], top[x - 1]), avg(top[x], top[x + 1]))
 * to each channel of pixel[x], in place (the scalar loop at the end is the
 * reference form). The SSE2 and NEON paths consume four pixels per iteration
 * and leave the remaining width % 4 pixels to the scalar loop.
 */
1659 | static void jebp__vp8l_pred10(jebp_color_t *pixel, jebp_color_t *top,
1660 |                               jebp_int width) {
1661 |     jebp_int x = 0;
1662 | #if defined(JEBP__SIMD_SSE2)
1663 |     __m128i v_left;
1664 |     __m128i v_tl;
1665 |     __m128i v_top;
1666 |     if (width >= 4) {
           // Seed the running neighbours from the pixels just before the span
1667 |         v_left = _mm_cvtsi32_si128(*(int *)&pixel[-1]);
1668 |         v_tl = _mm_cvtsi32_si128(*(int *)&top[-1]);
1669 |         v_top = _mm_loadu_si128((__m128i *)top);
1670 |     }
1671 |     for (; x + 4 <= width; x += 4) {
1672 |         __m128i v_pixel = _mm_loadu_si128((__m128i *)&pixel[x]);
1673 |         __m128i v_next = _mm_loadu_si128((__m128i *)&top[x + 4]);
           // Top-left neighbours: the top vector rotated up one pixel, with the
           // carried-over pixel merged in by jebp__sse_move_px1 (defined
           // elsewhere; presumably it replaces the first lane -- confirm)
1674 |         __m128i v_rot = _mm_shuffle_epi32(v_top, _MM_SHUFFLE(2, 1, 0, 3));
1675 |         v_tl = jebp__sse_move_px1(v_rot, v_tl);
           // Top-right neighbours, then avg(top, top-right); this half of the
           // prediction does not depend on the current row
1676 |         __m128i v_tr = jebp__sse_move_px1(v_top, v_next);
1677 |         v_tr = _mm_shuffle_epi32(v_tr, _MM_SHUFFLE(0, 3, 2, 1));
1678 |         v_tr = jebp__sse_avg_u8x16(v_top, v_tr);
1679 |         jebp__m128x4i v_pixel4;
           // The left neighbour is the previous output, so the four pixels are
           // resolved in four passes, each rotating its result into v_left
1680 |         JEBP__UNROLL4(jebp_int i, {
1681 |             __m128i v_avg = jebp__sse_avg2_u8x16(v_left, v_tl, v_tr);
1682 |             v_pixel4.v[i] = _mm_add_epi8(v_pixel, v_avg);
1683 |             v_left = _mm_shuffle_epi32(v_pixel4.v[i], _MM_SHUFFLE(2, 1, 0, 3));
1684 |         })
1685 |         v_pixel = jebp__sse_flatten_px4(v_pixel4);
1686 |         _mm_storeu_si128((__m128i *)&pixel[x], v_pixel);
           // Carry state into the next group of four pixels
1687 |         v_tl = v_rot;
1688 |         v_top = v_next;
1689 |     }
1690 | #elif defined(JEBP__SIMD_NEON)
1691 |     uint8x16_t v_left;
1692 |     uint8x16_t v_tl;
1693 |     uint8x16_t v_top;
1694 |     if (width >= 4) {
           // Seed the running neighbours from the pixels just before the span
1695 |         v_left = jebp__neon_load_px1(&pixel[-1]);
1696 |         v_tl = jebp__neon_load_px1(&top[-1]);
1697 |         v_top = vld1q_u8((uint8_t *)top);
1698 |     }
1699 |     for (; x + 4 <= width; x += 4) {
1700 |         uint8x16_t v_pixel = vld1q_u8((uint8_t *)&pixel[x]);
1701 |         uint8x16_t v_next = vld1q_u8((uint8_t *)&top[x + 4]);
           // Top-left: last pixel of the previous vector followed by the first
           // three pixels of the current top vector
1702 |         v_tl = vextq_u8(v_tl, v_top, 12);
           // Top-right: top[x + 1 .. x + 4], then avg(top, top-right)
1703 |         uint8x16_t v_tr = vextq_u8(v_top, v_next, 4);
1704 |         v_tr = vhaddq_u8(v_top, v_tr);
1705 |         uint8x16x4_t v_pixel4;
           // The left neighbour is the previous output, so the four pixels are
           // resolved in four passes, each rotating its result into v_left
1706 |         JEBP__UNROLL4(jebp_int i, {
1707 |             uint8x16_t v_avg = vhaddq_u8(v_left, v_tl);
1708 |             v_avg = vhaddq_u8(v_avg, v_tr);
1709 |             v_pixel4.val[i] = vaddq_u8(v_pixel, v_avg);
1710 |             v_left = vextq_u8(v_pixel4.val[i], v_pixel4.val[i], 12);
1711 |         })
1712 |         v_pixel = jebp__neon_flatten_px4(v_pixel4);
1713 |         vst1q_u8((uint8_t *)&pixel[x], v_pixel);
           // Carry state into the next group of four pixels
1714 |         v_tl = v_top;
1715 |         v_top = v_next;
1716 |     }
1717 | #endif
       // Scalar path: whole row without SIMD, otherwise the leftover pixels
1718 |     for (; x < width; x += 1) {
1719 |         pixel[x].r += JEBP__AVG(JEBP__AVG(pixel[x - 1].r, top[x - 1].r),
1720 |                                 JEBP__AVG(top[x].r, top[x + 1].r));
1721 |         pixel[x].g += JEBP__AVG(JEBP__AVG(pixel[x - 1].g, top[x - 1].g),
1722 |                                 JEBP__AVG(top[x].g, top[x + 1].g));
1723 |         pixel[x].b += JEBP__AVG(JEBP__AVG(pixel[x - 1].b, top[x - 1].b),
1724 |                                 JEBP__AVG(top[x].b, top[x + 1].b));
1725 |         pixel[x].a += JEBP__AVG(JEBP__AVG(pixel[x - 1].a, top[x - 1].a),
1726 |                                 JEBP__AVG(top[x].a, top[x + 1].a));
1727 |     }
1728 | }
1729 | 
1730 | JEBP__INLINE jebp_int jebp__vp8l_pred_dist(jebp_color_t *pix1,
1731 |                                            jebp_color_t *pix2) {
1732 |     return JEBP__ABS(pix1->r - pix2->r) + JEBP__ABS(pix1->g - pix2->g) +
1733 |            JEBP__ABS(pix1->b - pix2->b) + JEBP__ABS(pix1->a - pix2->a);
1734 | }
1735 | 
/**
 * Predictor 11 (select): for every x in [0, width) the pixel is predicted from
 * either its left or its top neighbour. With TL = top[x - 1], the left
 * neighbour is chosen when dist(TL, top[x]) < dist(TL, pixel[x - 1]) and the
 * top neighbour otherwise, where dist is jebp__vp8l_pred_dist (see the scalar
 * loop at the end). The SSE2 and NEON paths consume four pixels per iteration
 * and leave the remaining width % 4 pixels to the scalar loop.
 */
1736 | static void jebp__vp8l_pred11(jebp_color_t *pixel, jebp_color_t *top,
1737 |                               jebp_int width) {
1738 |     jebp_int x = 0;
1739 | #if defined(JEBP__SIMD_SSE2)
1740 |     __m128i v_left;
1741 |     __m128i v_tl;
1742 |     if (width >= 4) {
           // Seed the running neighbours from the pixels just before the span
1743 |         v_left = _mm_cvtsi32_si128(*(int *)&pixel[-1]);
1744 |         v_tl = _mm_cvtsi32_si128(*(int *)&top[-1]);
1745 |     }
1746 |     for (; x + 4 <= width; x += 4) {
1747 |         __m128i v_ldist, v_tdist, v_cmp, v_pixello, v_pixelhi;
1748 |         __m128i v_pixel = _mm_loadu_si128((__m128i *)&pixel[x]);
1749 |         __m128i v_top = _mm_loadu_si128((__m128i *)&top[x]);
           // Top-left neighbours: the top vector rotated up one pixel, with the
           // carried-over pixel merged in by jebp__sse_move_px1 (defined
           // elsewhere; presumably it replaces the first lane -- confirm)
1750 |         __m128i v_rot = _mm_shuffle_epi32(v_top, _MM_SHUFFLE(2, 1, 0, 3));
1751 |         v_tl = jebp__sse_move_px1(v_rot, v_tl);
           // _mm_sad_epu8 sums absolute byte differences per 64-bit half, so
           // pixels are handled as two pairs (low half, then high half), each
           // pixel duplicated to fill a half. The distance against the top
           // row is computed once per pair; the distance against the left
           // neighbour is recomputed after every pixel because it depends on
           // the previous output.
1752 |         // Pixel 0
1753 |         // This does double the SAD result but if both distances are doubled the
1754 |         // comparison should still be the same
1755 |         __m128i v_tllo = _mm_unpacklo_epi32(v_tl, v_tl);
1756 |         __m128i v_toplo = _mm_unpacklo_epi32(v_top, v_top);
1757 |         v_ldist = _mm_sad_epu8(v_tllo, v_toplo);
1758 |         v_tdist = _mm_unpacklo_epi32(v_left, v_left);
1759 |         v_tdist = _mm_sad_epu8(v_tllo, v_tdist);
1760 |         v_cmp = _mm_cmplt_epi32(v_ldist, v_tdist);
           // jebp__sse_bsela_u8x16 is defined elsewhere; presumably it adds to
           // v_pixel the bytes of v_left where v_cmp is set and of v_top
           // elsewhere -- confirm against its definition
1761 |         v_pixello = jebp__sse_bsela_u8x16(v_pixel, v_cmp, v_left, v_top);
1762 |         v_left = _mm_bslli_si128(v_pixello, 4);
1763 |         // Pixel 1
1764 |         v_tdist = _mm_unpacklo_epi32(v_left, v_left);
1765 |         v_tdist = _mm_sad_epu8(v_tllo, v_tdist);
1766 |         v_cmp = _mm_cmplt_epi32(v_ldist, v_tdist);
1767 |         v_cmp = _mm_bsrli_si128(v_cmp, 4);
1768 |         v_pixello = jebp__sse_bsela_u8x16(v_pixel, v_cmp, v_left, v_top);
1769 |         v_pixello = _mm_unpacklo_epi32(v_left, v_pixello);
1770 |         v_left = _mm_bsrli_si128(v_pixello, 4);
1771 |         // Pixel 2
1772 |         __m128i v_tlhi = _mm_shuffle_epi32(v_tl, _MM_SHUFFLE(2, 2, 3, 3));
1773 |         __m128i v_tophi = _mm_shuffle_epi32(v_top, _MM_SHUFFLE(2, 2, 3, 3));
1774 |         v_ldist = _mm_sad_epu8(v_tlhi, v_tophi);
1775 |         v_tdist = _mm_shuffle_epi32(v_left, _MM_SHUFFLE(2, 2, 3, 3));
1776 |         v_tdist = _mm_sad_epu8(v_tlhi, v_tdist);
1777 |         v_cmp = _mm_cmplt_epi32(v_ldist, v_tdist);
1778 |         v_pixelhi = jebp__sse_bsela_u8x16(v_pixel, v_cmp, v_left, v_top);
1779 |         v_left = _mm_bslli_si128(v_pixelhi, 4);
1780 |         // Pixel 3
1781 |         v_tdist = _mm_shuffle_epi32(v_left, _MM_SHUFFLE(2, 2, 3, 3));
1782 |         v_tdist = _mm_sad_epu8(v_tlhi, v_tdist);
1783 |         v_cmp = _mm_cmplt_epi32(v_ldist, v_tdist);
1784 |         v_cmp = _mm_bslli_si128(v_cmp, 12);
1785 |         v_pixelhi = jebp__sse_bsela_u8x16(v_pixel, v_cmp, v_left, v_top);
1786 |         v_pixelhi = _mm_unpackhi_epi32(v_left, v_pixelhi);
           // Pixel 3's result becomes the left neighbour of the next group
1787 |         v_left = _mm_bsrli_si128(v_pixelhi, 12);
           // Recombine the two finished pairs into one vector of four pixels
1788 |         v_pixel = _mm_unpackhi_epi64(v_pixello, v_pixelhi);
1789 |         _mm_storeu_si128((__m128i *)&pixel[x], v_pixel);
1790 |         v_tl = v_rot;
1791 |     }
1792 | #elif defined(JEBP__SIMD_NEON)
1793 |     uint8x16_t v_left;
1794 |     uint8x16_t v_tl;
1795 |     if (width >= 4) {
           // Seed the running neighbours from the pixels just before the span
1796 |         v_left = jebp__neon_load_px1(&pixel[-1]);
1797 |         v_tl = jebp__neon_load_px1(&top[-1]);
1798 |     }
1799 |     for (; x + 4 <= width; x += 4) {
1800 |         uint8x16_t v_pixel = vld1q_u8((uint8_t *)&pixel[x]);
1801 |         uint8x16_t v_top = vld1q_u8((uint8_t *)&top[x]);
           // Top-left: last pixel of the previous vector followed by the first
           // three pixels of the current top vector
1802 |         v_tl = vextq_u8(v_tl, v_top, 12);
           // Distance against the top row is independent of the current row,
           // so it is computed once for all four pixels
1803 |         uint32x4_t v_ldist = jebp__neon_sad_px4(v_tl, v_top);
1804 |         uint8x16x4_t v_pixel4;
           // The left neighbour is the previous output, so the four pixels are
           // resolved in four passes, each rotating its result into v_left
1805 |         JEBP__UNROLL4(jebp_int i, {
1806 |             uint32x4_t v_tdist = jebp__neon_sad_px4(v_tl, v_left);
1807 |             uint32x4_t v_cmp = vcltq_u32(v_ldist, v_tdist);
1808 |             uint8x16_t v_pred = vbslq_u8((uint8x16_t)v_cmp, v_left, v_top);
1809 |             v_pixel4.val[i] = vaddq_u8(v_pixel, v_pred);
1810 |             v_left = vextq_u8(v_pixel4.val[i], v_pixel4.val[i], 12);
1811 |         })
1812 |         v_pixel = jebp__neon_flatten_px4(v_pixel4);
1813 |         vst1q_u8((uint8_t *)&pixel[x], v_pixel);
1814 |         v_tl = v_top;
1815 |     }
1816 | #endif
       // Scalar path: whole row without SIMD, otherwise the leftover pixels
1817 |     for (; x < width; x += 1) {
1818 |         jebp_int ldist = jebp__vp8l_pred_dist(&top[x - 1], &top[x]);
1819 |         jebp_int tdist = jebp__vp8l_pred_dist(&top[x - 1], &pixel[x - 1]);
1820 |         if (ldist < tdist) {
1821 |             jebp__vp8l_pred_left(&pixel[x], 1);
1822 |         } else {
1823 |             jebp__vp8l_pred_top(&pixel[x], &top[x], 1);
1824 |         }
1825 |     }
1826 | }
1827 | 
/**
 * Predictor 12: for every x in [0, width) adds
 *   clamp_to_ubyte(pixel[x - 1] + top[x] - top[x - 1])
 * to each channel of pixel[x], in place (the scalar loop at the end is the
 * reference form). The SSE2 and NEON paths consume four pixels per iteration
 * and leave the remaining width % 4 pixels to the scalar loop.
 */
1828 | static void jebp__vp8l_pred12(jebp_color_t *pixel, jebp_color_t *top,
1829 |                               jebp_int width) {
1830 |     jebp_int x = 0;
1831 | #if defined(JEBP__SIMD_SSE2)
1832 |     __m128i v_left;
1833 |     __m128i v_tl;
1834 |     if (width >= 4) {
           // Seed the running neighbours from the pixels just before the span
1835 |         v_left = _mm_cvtsi32_si128(*(int *)&pixel[-1]);
1836 |         v_tl = _mm_cvtsi32_si128(*(int *)&top[-1]);
1837 |     }
1838 |     for (; x + 4 <= width; x += 4) {
1839 |         __m128i v_pixel = _mm_loadu_si128((__m128i *)&pixel[x]);
1840 |         __m128i v_top = _mm_loadu_si128((__m128i *)&top[x]);
1841 |         __m128i v_rot = _mm_shuffle_epi32(v_top, _MM_SHUFFLE(2, 1, 0, 3));
1842 |         v_tl = jebp__sse_move_px1(v_rot, v_tl);
           // top - top_left as magnitude (v_diff) plus sign (v_pos is set where
           // top >= top_left); bytes are unsigned so the sign is kept apart
1843 |         __m128i v_max = _mm_max_epu8(v_top, v_tl);
1844 |         __m128i v_min = _mm_min_epu8(v_top, v_tl);
1845 |         __m128i v_diff = _mm_sub_epi8(v_max, v_min);
1846 |         __m128i v_pos = _mm_cmpeq_epi8(v_max, v_top);
1847 |         jebp__m128x4i v_pixel4;
1848 |         JEBP__UNROLL4(jebp_int i, {
               // Saturating add/subtract performs the clamp to [0, 255]
1849 |             __m128i v_add = _mm_adds_epu8(v_left, v_diff);
1850 |             __m128i v_sub = _mm_subs_epu8(v_left, v_diff);
               // jebp__sse_bsela_u8x16 is defined elsewhere; presumably it adds
               // to v_pixel v_add where v_pos is set and v_sub elsewhere
1851 |             v_pixel4.v[i] = jebp__sse_bsela_u8x16(v_pixel, v_pos, v_add, v_sub);
1852 |             v_left = _mm_shuffle_epi32(v_pixel4.v[i], _MM_SHUFFLE(2, 1, 0, 3));
1853 |         })
1854 |         v_pixel = jebp__sse_flatten_px4(v_pixel4);
1855 |         _mm_storeu_si128((__m128i *)&pixel[x], v_pixel);
1856 |         v_tl = v_rot;
1857 |     }
1858 | #elif defined(JEBP__SIMD_NEON)
1859 |     uint8x16_t v_left;
1860 |     uint8x16_t v_tl;
1861 |     if (width >= 4) {
           // Seed the running neighbours from the pixels just before the span
1862 |         v_left = jebp__neon_load_px1(&pixel[-1]);
1863 |         v_tl = jebp__neon_load_px1(&top[-1]);
1864 |     }
1865 |     for (; x + 4 <= width; x += 4) {
1866 |         uint8x16_t v_pixel = vld1q_u8((uint8_t *)&pixel[x]);
1867 |         uint8x16_t v_top = vld1q_u8((uint8_t *)&top[x]);
1868 |         v_tl = vextq_u8(v_tl, v_top, 12);
           // top - top_left as magnitude (v_diff) plus sign (v_neg is set where
           // top < top_left)
1869 |         uint8x16_t v_diff = vabdq_u8(v_top, v_tl);
1870 |         uint8x16_t v_neg = vcltq_u8(v_top, v_tl);
1871 |         uint8x16x4_t v_pixel4;
1872 |         JEBP__UNROLL4(jebp_int i, {
               // Saturating add/subtract performs the clamp to [0, 255]
1873 |             uint8x16_t v_add = vqaddq_u8(v_left, v_diff);
1874 |             uint8x16_t v_sub = vqsubq_u8(v_left, v_diff);
1875 |             uint8x16_t v_pred = vbslq_u8(v_neg, v_sub, v_add);
1876 |             v_pixel4.val[i] = vaddq_u8(v_pixel, v_pred);
1877 |             v_left = vextq_u8(v_pixel4.val[i], v_pixel4.val[i], 12);
1878 |         })
1879 |         v_pixel = jebp__neon_flatten_px4(v_pixel4);
1880 |         vst1q_u8((uint8_t *)&pixel[x], v_pixel);
1881 |         v_tl = v_top;
1882 |     }
1883 | #endif
       // Scalar path: whole row without SIMD, otherwise the leftover pixels
1884 |     for (; x < width; x += 1) {
1885 |         pixel[x].r +=
1886 |             JEBP__CLAMP_UBYTE(pixel[x - 1].r + top[x].r - top[x - 1].r);
1887 |         pixel[x].g +=
1888 |             JEBP__CLAMP_UBYTE(pixel[x - 1].g + top[x].g - top[x - 1].g);
1889 |         pixel[x].b +=
1890 |             JEBP__CLAMP_UBYTE(pixel[x - 1].b + top[x].b - top[x - 1].b);
1891 |         pixel[x].a +=
1892 |             JEBP__CLAMP_UBYTE(pixel[x - 1].a + top[x].a - top[x - 1].a);
1893 |     }
1894 | }
1895 | 
/**
 * Predictor 13: for every x in [0, width), with
 *   avg = avg(pixel[x - 1], top[x]),
 * adds clamp_to_ubyte(avg + (avg - top[x - 1]) / 2) to each channel of
 * pixel[x], in place (the scalar loop at the end is the reference form). The
 * SSE2 and NEON paths consume four pixels per iteration and leave the
 * remaining width % 4 pixels to the scalar loop.
 */
1896 | static void jebp__vp8l_pred13(jebp_color_t *pixel, jebp_color_t *top,
1897 |                               jebp_int width) {
1898 |     jebp_int x = 0;
1899 | #if defined(JEBP__SIMD_SSE2)
       // Clears the bit that a 16-bit right shift leaks into each low byte
1900 |     __m128i v_mask = _mm_set1_epi8(0x7f);
1901 |     __m128i v_left;
1902 |     __m128i v_tl;
1903 |     if (width >= 4) {
           // Seed the running neighbours from the pixels just before the span
1904 |         v_left = _mm_cvtsi32_si128(*(int *)&pixel[-1]);
1905 |         v_tl = _mm_cvtsi32_si128(*(int *)&top[-1]);
1906 |     }
1907 |     for (; x + 4 <= width; x += 4) {
1908 |         __m128i v_pixel = _mm_loadu_si128((__m128i *)&pixel[x]);
1909 |         __m128i v_top = _mm_loadu_si128((__m128i *)&top[x]);
1910 |         __m128i v_rot = _mm_shuffle_epi32(v_top, _MM_SHUFFLE(2, 1, 0, 3));
1911 |         v_tl = jebp__sse_move_px1(v_rot, v_tl);
1912 |         jebp__m128x4i v_pixel4;
1913 |         JEBP__UNROLL4(jebp_int i, {
1914 |             __m128i v_avg = jebp__sse_avg_u8x16(v_left, v_top);
               // |avg - top_left| with the sign kept apart in v_pos (set where
               // avg >= top_left)
1915 |             __m128i v_max = _mm_max_epu8(v_avg, v_tl);
1916 |             __m128i v_min = _mm_min_epu8(v_avg, v_tl);
1917 |             __m128i v_diff = _mm_sub_epi8(v_max, v_min);
               // Halve every byte: SSE2 has no 8-bit shift, so shift 16-bit
               // lanes and mask off the bit carried in from the upper byte.
               // Halving the magnitude rounds toward zero like the scalar `/ 2`
1918 |             v_diff = _mm_srli_epi16(v_diff, 1);
1919 |             v_diff = _mm_and_si128(v_diff, v_mask);
1920 |             __m128i v_pos = _mm_cmpeq_epi8(v_max, v_avg);
               // Saturating add/subtract performs the clamp to [0, 255]
1921 |             __m128i v_add = _mm_adds_epu8(v_avg, v_diff);
1922 |             __m128i v_sub = _mm_subs_epu8(v_avg, v_diff);
               // jebp__sse_bsela_u8x16 is defined elsewhere; presumably it adds
               // to v_pixel v_add where v_pos is set and v_sub elsewhere
1923 |             v_pixel4.v[i] = jebp__sse_bsela_u8x16(v_pixel, v_pos, v_add, v_sub);
1924 |             v_left = _mm_shuffle_epi32(v_pixel4.v[i], _MM_SHUFFLE(2, 1, 0, 3));
1925 |         })
1926 |         v_pixel = jebp__sse_flatten_px4(v_pixel4);
1927 |         _mm_storeu_si128((__m128i *)&pixel[x], v_pixel);
1928 |         v_tl = v_rot;
1929 |     }
1930 | #elif defined(JEBP__SIMD_NEON)
1931 |     uint8x16_t v_left;
1932 |     uint8x16_t v_tl;
1933 |     if (width >= 4) {
           // Seed the running neighbours from the pixels just before the span
1934 |         v_left = jebp__neon_load_px1(&pixel[-1]);
1935 |         v_tl = jebp__neon_load_px1(&top[-1]);
1936 |     }
1937 |     for (; x + 4 <= width; x += 4) {
1938 |         uint8x16_t v_pixel = vld1q_u8((uint8_t *)&pixel[x]);
1939 |         uint8x16_t v_top = vld1q_u8((uint8_t *)&top[x]);
1940 |         v_tl = vextq_u8(v_tl, v_top, 12);
1941 |         uint8x16x4_t v_pixel4;
1942 |         JEBP__UNROLL4(jebp_int i, {
1943 |             uint8x16_t v_avg = vhaddq_u8(v_left, v_top);
               // Half of |avg - top_left|, with the sign kept apart in v_neg
               // (set where avg < top_left)
1944 |             uint8x16_t v_diff = vabdq_u8(v_avg, v_tl);
1945 |             v_diff = vshrq_n_u8(v_diff, 1);
1946 |             uint8x16_t v_neg = vcltq_u8(v_avg, v_tl);
               // Saturating add/subtract performs the clamp to [0, 255]
1947 |             uint8x16_t v_add = vqaddq_u8(v_avg, v_diff);
1948 |             uint8x16_t v_sub = vqsubq_u8(v_avg, v_diff);
1949 |             uint8x16_t v_pred = vbslq_u8(v_neg, v_sub, v_add);
1950 |             v_pixel4.val[i] = vaddq_u8(v_pixel, v_pred);
1951 |             v_left = vextq_u8(v_pixel4.val[i], v_pixel4.val[i], 12);
1952 |         })
1953 |         v_pixel = jebp__neon_flatten_px4(v_pixel4);
1954 |         vst1q_u8((uint8_t *)&pixel[x], v_pixel);
1955 |         v_tl = v_top;
1956 |     }
1957 | #endif
       // Scalar path: whole row without SIMD, otherwise the leftover pixels
1958 |     for (; x < width; x += 1) {
1959 |         jebp_color_t avg = {JEBP__AVG(pixel[x - 1].r, top[x].r),
1960 |                             JEBP__AVG(pixel[x - 1].g, top[x].g),
1961 |                             JEBP__AVG(pixel[x - 1].b, top[x].b),
1962 |                             JEBP__AVG(pixel[x - 1].a, top[x].a)};
1963 |         pixel[x].r += JEBP__CLAMP_UBYTE(avg.r + (avg.r - top[x - 1].r) / 2);
1964 |         pixel[x].g += JEBP__CLAMP_UBYTE(avg.g + (avg.g - top[x - 1].g) / 2);
1965 |         pixel[x].b += JEBP__CLAMP_UBYTE(avg.b + (avg.b - top[x - 1].b) / 2);
1966 |         pixel[x].a += JEBP__CLAMP_UBYTE(avg.a + (avg.a - top[x - 1].a) / 2);
1967 |     }
1968 | }
1969 | 
1970 | static const jebp__vp8l_pred_t jebp__vp8l_preds[JEBP__NB_VP8L_PRED_TYPES] = {
1971 |     jebp__vp8l_pred0,  jebp__vp8l_pred1, jebp__vp8l_pred2,  jebp__vp8l_pred3,
1972 |     jebp__vp8l_pred4,  jebp__vp8l_pred5, jebp__vp8l_pred6,  jebp__vp8l_pred7,
1973 |     jebp__vp8l_pred8,  jebp__vp8l_pred9, jebp__vp8l_pred10, jebp__vp8l_pred11,
1974 |     jebp__vp8l_pred12, jebp__vp8l_pred13};
1975 | 
1976 | /**
1977 |  * VP8L transforms
1978 |  */
/**
 * Transform kinds. The numeric values are the 2-bit type field that
 * jebp__read_transform reads from the bitstream, so the order must not change.
 */
typedef enum jebp__transform_type_t {
    JEBP__TRANSFORM_PREDICT = 0,
    JEBP__TRANSFORM_COLOR = 1,
    JEBP__TRANSFORM_GREEN = 2,
    JEBP__TRANSFORM_PALETTE = 3,
    // Number of transform kinds, not a kind itself
    JEBP__NB_TRANSFORMS = 4
} jebp__transform_type_t;
1986 | 
/**
 * One transform read from the bitstream: its kind plus the sub-image that
 * parameterises it. jebp__read_transform fills `image` for every kind except
 * JEBP__TRANSFORM_GREEN, and jebp__free_transform releases it under the same
 * condition.
 */
1987 | typedef struct jebp__transform_t {
1988 |     jebp__transform_type_t type; // which transform to apply
1989 |     jebp__subimage_t image; // per-block data; not set for GREEN
1990 | } jebp__transform_t;
1991 | 
1992 | static jebp_error_t jebp__read_transform(jebp__transform_t *transform,
1993 |                                          jebp__bit_reader_t *bits,
1994 |                                          jebp_image_t *image) {
1995 |     jebp_error_t err = JEBP_OK;
1996 |     transform->type = jebp__read_bits(bits, 2, &err);
1997 |     if (err != JEBP_OK) {
1998 |         return err;
1999 |     }
2000 |     if (transform->type == JEBP__TRANSFORM_PALETTE) {
2001 |         // TODO: support palette images
2002 |         return JEBP_ERROR_NOSUP_PALETTE;
2003 |     } else if (transform->type != JEBP__TRANSFORM_GREEN) {
2004 |         err = jebp__read_subimage(&transform->image, bits, image);
2005 |     }
2006 |     return err;
2007 | }
2008 | 
2009 | static void jebp__free_transform(jebp__transform_t *transform) {
2010 |     if (transform->type != JEBP__TRANSFORM_GREEN) {
2011 |         jebp_free_image((jebp_image_t *)&transform->image);
2012 |     }
2013 | }
2014 | 
2015 | JEBP__INLINE jebp_error_t jebp__apply_predict_row(jebp_color_t *pixel,
2016 |                                                   jebp_color_t *top,
2017 |                                                   jebp_int width,
2018 |                                                   jebp_color_t *predict_pixel) {
2019 |     if (predict_pixel->g >= JEBP__NB_VP8L_PRED_TYPES) {
2020 |         return JEBP_ERROR_INVDATA;
2021 |     }
2022 |     jebp__vp8l_preds[predict_pixel->g](pixel, top, width);
2023 |     return JEBP_OK;
2024 | }
2025 | 
2026 | JEBP__INLINE jebp_error_t jebp__apply_predict_transform(
2027 |     jebp_image_t *image, jebp__subimage_t *predict_image) {
2028 |     jebp_error_t err;
2029 |     jebp_color_t *pixel = image->pixels;
2030 |     jebp_color_t *top = pixel;
2031 |     jebp_int predict_width = predict_image->width - 1;
2032 |     jebp_int block_size = 1 << predict_image->block_bits;
2033 |     jebp_int end_size =
2034 |         image->width - (predict_width << predict_image->block_bits);
2035 |     if (predict_width == 0) {
2036 |         // Special case: if there is only one block the first block which is
2037 |         //               shortened by one pixel (due to the left prediction)
2038 |         //               needs to be `end_size` and the proper end block then
2039 |         //               needs to be skipped.
2040 |         block_size = end_size;
2041 |         end_size = 0;
2042 |     }
2043 |     // Use opaque-black prediction for the top-left pixel
2044 |     jebp__vp8l_pred_black(pixel, 1);
2045 |     // Use left prediction for the top row
2046 |     jebp__vp8l_pred_left(pixel + 1, image->width - 1);
2047 |     pixel += image->width;
2048 |     for (jebp_int y = 1; y < image->height; y += 1) {
2049 |         jebp_color_t *predict_row =
2050 |             &predict_image->pixels[(y >> predict_image->block_bits) *
2051 |                                    predict_image->width];
2052 |         // Use top prediction for the left column
2053 |         jebp__vp8l_pred_top(pixel, top, 1);
2054 |         // Finish the rest of the first block
2055 |         if ((err = jebp__apply_predict_row(pixel + 1, top + 1, block_size - 1,
2056 |                                            predict_row)) != JEBP_OK) {
2057 |             return err;
2058 |         }
2059 |         pixel += block_size;
2060 |         top += block_size;
2061 |         for (jebp_int x = 1; x < predict_width; x += 1) {
2062 |             if ((err = jebp__apply_predict_row(pixel, top, block_size,
2063 |                                                &predict_row[x])) != JEBP_OK) {
2064 |                 return err;
2065 |             }
2066 |             pixel += block_size;
2067 |             top += block_size;
2068 |         }
2069 |         jebp__apply_predict_row(pixel, top, end_size,
2070 |                                 &predict_row[predict_width]);
2071 |         pixel += end_size;
2072 |         top += end_size;
2073 |     }
2074 |     return JEBP_OK;
2075 | }
2076 | 
/**
 * Undo the color transform on `width` pixels starting at `pixel`, in place,
 * using the coefficients packed in `color_pixel`. With every channel and
 * coefficient reinterpreted as a signed byte, each pixel becomes (see the
 * scalar loop at the end):
 *   r += (g * color_pixel->b) >> 5
 *   b += (g * color_pixel->g) >> 5
 *   b += (r * color_pixel->r) >> 5    (using the r just updated)
 * Green and alpha are left unchanged. The SSE2 path handles 4 pixels per
 * iteration and the NEON path 8; leftovers go through the scalar loop.
 */
2077 | JEBP__INLINE void jebp__apply_color_row(jebp_color_t *pixel, jebp_int width,
2078 |                                         jebp_color_t *color_pixel) {
2079 |     jebp_int x = 0;
2080 | #if defined(JEBP__SIMD_SSE2)
       // Sign-extended coefficients scaled by 8 (<< 8 then >> 5): multiplied
       // with a channel held in the high byte of a 16-bit lane (channel * 256),
       // _mm_mulhi_epi16 (product >> 16) yields (channel * coeff) >> 5
2081 |     jebp_ushort color_r = ((jebp_short)(color_pixel->r << 8) >> 5);
2082 |     jebp_ushort color_g = ((jebp_short)(color_pixel->g << 8) >> 5);
2083 |     jebp_ushort color_b = ((jebp_short)(color_pixel->b << 8) >> 5);
2084 |     __m128i v_color_bg = _mm_set1_epi32(color_b | ((jebp_uint)color_g << 16));
2085 |     __m128i v_color_r = _mm_set1_epi32(color_r);
2086 |     __m128i v_masklo = _mm_set1_epi16((short)0x00ff);
2087 |     __m128i v_maskhi = _mm_set1_epi16((short)0xff00);
2088 |     for (; x + 4 <= width; x += 4) {
2089 |         __m128i v_pixel = _mm_loadu_si128((__m128i *)&pixel[x]);
           // Keep the high byte of each 16-bit lane, then copy the even lanes
           // over the odd ones so both lanes of a pixel hold its green value
2090 |         __m128i v_green = _mm_and_si128(v_pixel, v_maskhi);
2091 |         v_green = _mm_shufflelo_epi16(v_green, _MM_SHUFFLE(2, 2, 0, 0));
2092 |         v_green = _mm_shufflehi_epi16(v_green, _MM_SHUFFLE(2, 2, 0, 0));
           // Green-driven deltas for both 16-bit lanes of every pixel at once
2093 |         __m128i v_bg = _mm_mulhi_epi16(v_green, v_color_bg);
2094 |         v_bg = _mm_and_si128(v_bg, v_masklo);
2095 |         v_pixel = _mm_add_epi8(v_pixel, v_bg);
           // Red-driven delta: move the already-updated low byte into the high
           // byte, multiply, then shift the result up into the pixel's other
           // 16-bit lane before adding
2096 |         __m128i v_red = _mm_slli_epi16(v_pixel, 8);
2097 |         v_red = _mm_mulhi_epi16(v_red, v_color_r);
2098 |         v_red = _mm_and_si128(v_red, v_masklo);
2099 |         v_red = _mm_slli_epi32(v_red, 16);
2100 |         v_pixel = _mm_add_epi8(v_pixel, v_red);
2101 |         _mm_storeu_si128((__m128i *)&pixel[x], v_pixel);
2102 |     }
2103 | #elif defined(JEBP__SIMD_NEON)
       // Broadcast the first three bytes of the coefficient pixel, one vector each
2104 |     int8x8x3_t v_color_pixel = vld3_dup_s8((jebp_byte *)color_pixel);
2105 |     for (; x + 8 <= width; x += 8) {
2106 |         int16x8_t v_mul;
2107 |         int8x8_t v_shr;
           // De-interleaving load: val[0..3] hold one channel each, 8 pixels wide
2108 |         int8x8x4_t v_pixel = vld4_s8((jebp_byte *)&pixel[x]);
2109 |         v_mul = vmull_s8(v_pixel.val[1], v_color_pixel.val[2]);
2110 |         v_shr = vshrn_n_s16(v_mul, 5);
2111 |         v_pixel.val[0] = vadd_s8(v_pixel.val[0], v_shr);
2112 |         v_mul = vmull_s8(v_pixel.val[1], v_color_pixel.val[1]);
2113 |         v_shr = vshrn_n_s16(v_mul, 5);
2114 |         v_pixel.val[2] = vadd_s8(v_pixel.val[2], v_shr);
           // Uses val[0] after it was updated above, like the scalar loop
2115 |         v_mul = vmull_s8(v_pixel.val[0], v_color_pixel.val[0]);
2116 |         v_shr = vshrn_n_s16(v_mul, 5);
2117 |         v_pixel.val[2] = vadd_s8(v_pixel.val[2], v_shr);
2118 |         vst4_s8((jebp_byte *)&pixel[x], v_pixel);
2119 |     }
2120 | #endif
       // Scalar path: whole span without SIMD, otherwise the leftover pixels
2121 |     for (; x < width; x += 1) {
2122 |         pixel[x].r += ((jebp_byte)pixel[x].g * (jebp_byte)color_pixel->b) >> 5;
2123 |         pixel[x].b += ((jebp_byte)pixel[x].g * (jebp_byte)color_pixel->g) >> 5;
2124 |         pixel[x].b += ((jebp_byte)pixel[x].r * (jebp_byte)color_pixel->r) >> 5;
2125 |     }
2126 | }
2127 | 
2128 | JEBP__INLINE jebp_error_t jebp__apply_color_transform(
2129 |     jebp_image_t *image, jebp__subimage_t *color_image) {
2130 |     jebp_color_t *pixel = image->pixels;
2131 |     jebp_int color_width = color_image->width - 1;
2132 |     jebp_int block_size = 1 << color_image->block_bits;
2133 |     jebp_int end_size = image->width - (color_width << color_image->block_bits);
2134 |     for (jebp_int y = 0; y < image->height; y += 1) {
2135 |         jebp_color_t *color_row =
2136 |             &color_image
2137 |                  ->pixels[(y >> color_image->block_bits) * color_image->width];
2138 |         for (jebp_int x = 0; x < color_width; x += 1) {
2139 |             jebp__apply_color_row(pixel, block_size, &color_row[x]);
2140 |             pixel += block_size;
2141 |         }
2142 |         jebp__apply_color_row(pixel, end_size, &color_row[color_width]);
2143 |         pixel += end_size;
2144 |     }
2145 |     return JEBP_OK;
2146 | }
2147 | 
2148 | JEBP__INLINE jebp_error_t jebp__apply_green_transform(jebp_image_t *image) {
2149 |     jebp_int size = image->width * image->height;
2150 |     jebp_int i = 0;
2151 | #if defined(JEBP__SIMD_SSE2)
2152 |     for (; i + 4 <= size; i += 4) {
2153 |         __m128i *pixel = (__m128i *)&image->pixels[i];
2154 |         __m128i v_pixel = _mm_loadu_si128(pixel);
2155 |         __m128i v_green = _mm_srli_epi16(v_pixel, 8);
2156 |         v_green = _mm_shufflelo_epi16(v_green, _MM_SHUFFLE(2, 2, 0, 0));
2157 |         v_green = _mm_shufflehi_epi16(v_green, _MM_SHUFFLE(2, 2, 0, 0));
2158 |         v_pixel = _mm_add_epi8(v_pixel, v_green);
2159 |         _mm_storeu_si128(pixel, v_pixel);
2160 |     }
2161 | #elif defined(JEBP__SIMD_NEON)
2162 |     for (; i + 16 <= size; i += 16) {
2163 |         jebp_ubyte *pixel = (jebp_ubyte *)&image->pixels[i];
2164 |         uint8x16x4_t v_pixel = vld4q_u8(pixel);
2165 |         v_pixel.val[0] = vaddq_u8(v_pixel.val[0], v_pixel.val[1]);
2166 |         v_pixel.val[2] = vaddq_u8(v_pixel.val[2], v_pixel.val[1]);
2167 |         vst4q_u8(pixel, v_pixel);
2168 |     }
2169 | #endif
2170 |     for (; i < size; i += 1) {
2171 |         jebp_color_t *pixel = &image->pixels[i];
2172 |         pixel->r += pixel->g;
2173 |         pixel->b += pixel->g;
2174 |     }
2175 |     return JEBP_OK;
2176 | }
2177 | 
2178 | static jebp_error_t jebp__apply_transform(jebp__transform_t *transform,
2179 |                                           jebp_image_t *image) {
2180 |     switch (transform->type) {
2181 |     case JEBP__TRANSFORM_PREDICT:
2182 |         return jebp__apply_predict_transform(image, &transform->image);
2183 |     case JEBP__TRANSFORM_COLOR:
2184 |         return jebp__apply_color_transform(image, &transform->image);
2185 |     case JEBP__TRANSFORM_GREEN:
2186 |         return jebp__apply_green_transform(image);
2187 |     default:
2188 |         return JEBP_ERROR_NOSUP;
2189 |     }
2190 | }
2191 | 
2192 | /**
2193 |  * VP8L lossless codec
2194 |  */
// The bytes 'V' 'P' '8' 'L' (0x56 0x50 0x38 0x4c) packed with the first
// character in the least significant byte; presumably compared against the
// chunk FourCC -- confirm at the use site
2195 | #define JEBP__VP8L_TAG 0x4c385056
// Signature byte expected as the first byte of the chunk payload; checked by
// jebp__read_vp8l_header
2196 | #define JEBP__VP8L_MAGIC 0x2f
2197 | 
2198 | static jebp_error_t jebp__read_vp8l_header(jebp_image_t *image,
2199 |                                            jebp__reader_t *reader,
2200 |                                            jebp__bit_reader_t *bits,
2201 |                                            jebp__chunk_t *chunk) {
2202 |     jebp_error_t err = JEBP_OK; // reads magic, width, height, alpha, version
2203 |     if (chunk->size < 5) { // 1 magic byte + 32 bits of header fields
2204 |         return JEBP_ERROR_INVDATA_HEADER;
2205 |     }
2206 |     if (jebp__read_uint8(reader, &err) != JEBP__VP8L_MAGIC) {
2207 |         return jebp__error(&err, JEBP_ERROR_INVDATA_HEADER);
2208 |     }
2209 |     jepb__init_bit_reader(bits, reader, chunk->size - 1); // magic already read
2210 |     image->width = jebp__read_bits(bits, 14, &err) + 1; // field is width - 1
2211 |     image->height = jebp__read_bits(bits, 14, &err) + 1; // field is height - 1
2212 |     jebp__read_bits(bits, 1, &err); // alpha does not impact decoding
2213 |     if (jebp__read_bits(bits, 3, &err) != 0) {
2214 |         // the 3-bit version field must be 0
2215 |         return jebp__error(&err, JEBP_ERROR_NOSUP);
2216 |     }
2217 |     return err; // whatever the reads above left in err
2218 | }
2219 | 
2220 | static jebp_error_t jebp__read_vp8l_size(jebp_image_t *image,
2221 |                                          jebp__reader_t *reader,
2222 |                                          jebp__chunk_t *chunk) {
2223 |     jebp__bit_reader_t header_bits; // only needed while parsing the header
2224 |     return jebp__read_vp8l_header(image, reader, &header_bits, chunk);
2225 | }
2226 | 
2227 | static jebp_error_t jebp__read_vp8l_nohead(jebp_image_t *image,
2228 |                                            jebp__bit_reader_t *bits) {
2229 |     // Decodes the VP8L data that follows the header: the transform list, the
2230 |     // color cache, an optional huffman image and the pixels; then transforms.
2231 |     jebp_error_t err = JEBP_OK; // first failure seen; returned at the end
2232 |     jebp__transform_t transforms[JEBP__NB_TRANSFORMS]; // sized by loop bound
2233 |     jebp_int nb_transforms = 0; // entries [0, nb_transforms) need freeing
2234 |     for (; nb_transforms <= JEBP__NB_TRANSFORMS; nb_transforms += 1) {
2235 |         if (!jebp__read_bits(bits, 1, &err)) {
2236 |             break; // no more transforms to read
2237 |         }
2238 |         if (err != JEBP_OK || nb_transforms == JEBP__NB_TRANSFORMS) {
2239 |             // read failure, or a transform announced when every slot is used
2240 |             jebp__error(&err, JEBP_ERROR_INVDATA);
2241 |             goto free_transforms;
2242 |         }
2243 |         if ((err = jebp__read_transform(&transforms[nb_transforms], bits,
2244 |                                         image)) != JEBP_OK) {
2245 |             goto free_transforms;
2246 |         }
2247 |     }
2248 |     if (err != JEBP_OK) {
2249 |         goto free_transforms;
2250 |     }
2251 | 
2252 |     jebp__colcache_t colcache;
2253 |     if ((err = jebp__read_colcache(&colcache, bits)) != JEBP_OK) {
2254 |         goto free_transforms;
2255 |     }
2256 |     jebp__subimage_t *huffman_image = &(jebp__subimage_t){0};
2257 |     if (!jebp__read_bits(bits, 1, &err)) {
2258 |         huffman_image = NULL; // there is no huffman image
2259 |     }
2260 |     if (err != JEBP_OK) {
2261 |         jebp__free_colcache(&colcache);
2262 |         goto free_transforms;
2263 |     }
2264 |     if (huffman_image != NULL) {
2265 |         if ((err = jebp__read_subimage(huffman_image, bits, image)) !=
2266 |             JEBP_OK) {
2267 |             jebp__free_colcache(&colcache);
2268 |             goto free_transforms;
2269 |         }
2270 |     }
2271 |     err = jebp__read_vp8l_image(image, bits, &colcache, huffman_image);
2272 |     jebp__free_colcache(&colcache);
2273 |     jebp_free_image((jebp_image_t *)huffman_image); // no-op when NULL
2274 |     // Apply (only while nothing failed) and free transforms, last read first.
2275 | free_transforms:
2276 |     for (nb_transforms -= 1; nb_transforms >= 0; nb_transforms -= 1) {
2277 |         if (err == JEBP_OK) {
2278 |             err = jebp__apply_transform(&transforms[nb_transforms], image);
2279 |         }
2280 |         jebp__free_transform(&transforms[nb_transforms]);
2281 |     }
2282 |     return err;
2283 | }
2284 | 
2285 | static jebp_error_t jebp__read_vp8l(jebp_image_t *image, jebp__reader_t *reader,
2286 |                                     jebp__chunk_t *chunk) {
2287 |     // Decodes a whole VP8L chunk: the header is parsed first, then the same
2288 |     // bit reader is handed on to the decoder for the rest of the chunk. The
2289 |     // first error encountered is returned; JEBP_OK means both stages passed.
2290 |     jebp__bit_reader_t bits;
2291 |     jebp_error_t status = jebp__read_vp8l_header(image, reader, &bits, chunk);
2292 |     // The remaining data is only decoded when the header was accepted.
2293 |     if (status == JEBP_OK) {
2294 |         status = jebp__read_vp8l_nohead(image, &bits);
2295 |     }
2296 |     return status;
2297 | }
2298 | #endif // JEBP_NO_VP8L
2299 | 
2300 | /**
2301 |  * Public API
2302 |  */
2303 | static const char *const jebp__error_strings[JEBP_NB_ERRORS];
2304 | 
2305 | const char *jebp_error_string(jebp_error_t err) {
2306 |     // Codes outside [0, JEBP_NB_ERRORS) all share the JEBP_ERROR_UNKNOWN text.
2307 |     const jebp_error_t code =
2308 |         (err >= 0 && err < JEBP_NB_ERRORS) ? err : JEBP_ERROR_UNKNOWN;
2309 |     return jebp__error_strings[code];
2310 | }
2311 | 
2312 | void jebp_free_image(jebp_image_t *image) {
2313 |     if (image != NULL) { // a NULL image is accepted and ignored
2314 |         JEBP_FREE(image->pixels); // release the pixel buffer
2315 |         JEBP__CLEAR(image, sizeof(jebp_image_t)); // presumably zeroes *image
2316 |     }
2317 | }
2318 | 
2319 | static jebp_error_t jebp__read_size(jebp_image_t *image,
2320 |                                     jebp__reader_t *reader) {
2321 |     // Resets *image with JEBP__CLEAR, reads the RIFF header and a chunk, then
2322 |     // hands that chunk to the size reader of the codec matching its tag.
2323 |     jebp__riff_reader_t riff;
2324 |     jebp__chunk_t chunk;
2325 |     JEBP__CLEAR(image, sizeof(jebp_image_t));
2326 |     jebp_error_t status = jebp__read_riff_header(&riff, reader);
2327 |     if (status != JEBP_OK) {
2328 |         return status;
2329 |     }
2330 |     status = jebp__read_riff_chunk(&riff, &chunk);
2331 |     if (status != JEBP_OK) {
2332 |         return status;
2333 |     }
2334 | #ifndef JEBP_NO_VP8L
2335 |     if (chunk.tag == JEBP__VP8L_TAG) {
2336 |         return jebp__read_vp8l_size(image, reader, &chunk);
2337 |     }
2338 | #endif // JEBP_NO_VP8L
2339 |     return JEBP_ERROR_NOSUP_CODEC; // no enabled codec handles this tag
2340 | }
2341 | 
2342 | jebp_error_t jebp_decode_size(jebp_image_t *image, size_t size,
2343 |                               const void *data) {
2344 |     jebp__reader_t memory; // reader over the caller's `size`-byte buffer
2345 |     if (!image || !data) {
2346 |         return JEBP_ERROR_INVAL; // both pointers are required
2347 |     }
2348 |     jebp__init_memory(&memory, size, data);
2349 |     return jebp__read_size(image, &memory); // header only, no pixel decode
2350 | }
2351 | 
2352 | static jebp_error_t jebp__read(jebp_image_t *image, jebp__reader_t *reader) {
2353 |     // Full decode: resets *image with JEBP__CLEAR, reads the RIFF header and a
2354 |     // chunk, then lets the codec matching the chunk tag decode the image.
2355 |     jebp__riff_reader_t riff;
2356 |     jebp__chunk_t chunk;
2357 |     JEBP__CLEAR(image, sizeof(jebp_image_t));
2358 |     jebp_error_t status = jebp__read_riff_header(&riff, reader);
2359 |     if (status != JEBP_OK) {
2360 |         return status;
2361 |     }
2362 |     status = jebp__read_riff_chunk(&riff, &chunk);
2363 |     if (status != JEBP_OK) {
2364 |         return status;
2365 |     }
2366 | #ifndef JEBP_NO_VP8L
2367 |     if (chunk.tag == JEBP__VP8L_TAG) {
2368 |         return jebp__read_vp8l(image, reader, &chunk);
2369 |     }
2370 | #endif // JEBP_NO_VP8L
2371 |     return JEBP_ERROR_NOSUP_CODEC; // no enabled codec handles this tag
2372 | }
2373 | 
2374 | jebp_error_t jebp_decode(jebp_image_t *image, size_t size, const void *data) {
2375 |     jebp__reader_t memory; // reader over the caller's `size`-byte buffer
2376 |     if (!image || !data) {
2377 |         return JEBP_ERROR_INVAL; // both pointers are required
2378 |     }
2379 |     jebp__init_memory(&memory, size, data);
2380 |     return jebp__read(image, &memory); // full decode of the image
2381 | }
2382 | 
2383 | #ifndef JEBP_NO_STDIO
2384 | jebp_error_t jebp_read_size(jebp_image_t *image, const char *path) {
2385 |     // Reads just the size information of the file at `path` into *image.
2386 |     if (!image || !path) {
2387 |         return JEBP_ERROR_INVAL;
2388 |     }
2389 |     jebp__reader_t file;
2390 |     jebp_error_t status = jebp__open_file(&file, path);
2391 |     if (status == JEBP_OK) {
2392 |         status = jebp__read_size(image, &file);
2393 |         jebp__close_file(&file); // closed whether or not the read worked
2394 |     }
2395 |     return status;
2396 | }
2397 | 
2398 | jebp_error_t jebp_read(jebp_image_t *image, const char *path) {
2399 |     // Opens the file at `path` and runs the full image read into *image.
2400 |     if (!image || !path) {
2401 |         return JEBP_ERROR_INVAL;
2402 |     }
2403 |     jebp__reader_t file;
2404 |     jebp_error_t status = jebp__open_file(&file, path);
2405 |     if (status == JEBP_OK) {
2406 |         status = jebp__read(image, &file);
2407 |         jebp__close_file(&file); // closed whether or not the read worked
2408 |     }
2409 |     return status;
2410 | }
2411 | #endif // JEBP_NO_STDIO
2412 | 
2413 | /**
2414 |  * Lookup tables
2415 |  */
2416 | // These are moved to the end of the file since some of them are very large and
2417 | // putting them in the middle of the code would disrupt the flow of reading.
2418 | // Especially since in most situations the values in these tables are
2419 | // unimportant to the developer.
2420 | #ifndef JEBP_NO_VP8L
2421 | // The order that meta lengths are read
2422 | static const jebp_byte jebp__meta_length_order[JEBP__NB_META_SYMBOLS] = {
2423 |     17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
2424 | 
2425 | // {X, Y} offsets from the pixel when decoding short distance codes
2426 | static const jebp_byte jebp__vp8l_offsets[JEBP__NB_VP8L_OFFSETS][2] = {
2427 |     {0, 1},  {1, 0},  {1, 1},  {-1, 1}, {0, 2},  {2, 0},  {1, 2},  {-1, 2},
2428 |     {2, 1},  {-2, 1}, {2, 2},  {-2, 2}, {0, 3},  {3, 0},  {1, 3},  {-1, 3},
2429 |     {3, 1},  {-3, 1}, {2, 3},  {-2, 3}, {3, 2},  {-3, 2}, {0, 4},  {4, 0},
2430 |     {1, 4},  {-1, 4}, {4, 1},  {-4, 1}, {3, 3},  {-3, 3}, {2, 4},  {-2, 4},
2431 |     {4, 2},  {-4, 2}, {0, 5},  {3, 4},  {-3, 4}, {4, 3},  {-4, 3}, {5, 0},
2432 |     {1, 5},  {-1, 5}, {5, 1},  {-5, 1}, {2, 5},  {-2, 5}, {5, 2},  {-5, 2},
2433 |     {4, 4},  {-4, 4}, {3, 5},  {-3, 5}, {5, 3},  {-5, 3}, {0, 6},  {6, 0},
2434 |     {1, 6},  {-1, 6}, {6, 1},  {-6, 1}, {2, 6},  {-2, 6}, {6, 2},  {-6, 2},
2435 |     {4, 5},  {-4, 5}, {5, 4},  {-5, 4}, {3, 6},  {-3, 6}, {6, 3},  {-6, 3},
2436 |     {0, 7},  {7, 0},  {1, 7},  {-1, 7}, {5, 5},  {-5, 5}, {7, 1},  {-7, 1},
2437 |     {4, 6},  {-4, 6}, {6, 4},  {-6, 4}, {2, 7},  {-2, 7}, {7, 2},  {-7, 2},
2438 |     {3, 7},  {-3, 7}, {7, 3},  {-7, 3}, {5, 6},  {-5, 6}, {6, 5},  {-6, 5},
2439 |     {8, 0},  {4, 7},  {-4, 7}, {7, 4},  {-7, 4}, {8, 1},  {8, 2},  {6, 6},
2440 |     {-6, 6}, {8, 3},  {5, 7},  {-5, 7}, {7, 5},  {-7, 5}, {8, 4},  {6, 7},
2441 |     {-6, 7}, {7, 6},  {-7, 6}, {8, 5},  {7, 7},  {-7, 7}, {8, 6},  {8, 7}};
2442 | #endif // JEBP_NO_VP8L
2443 | 
2444 | // Error strings for jebp_error_string, indexed by the jebp_error_t value
2445 | static const char *const jebp__error_strings[JEBP_NB_ERRORS] = {
2446 |     "Ok",
2447 |     "Invalid value or argument",
2448 |     "Invalid data or corrupted file",
2449 |     "Invalid WebP header or corrupted file",
2450 |     "End of file",
2451 |     "Feature not supported",
2452 |     "Codec not supported",
2453 |     "Color-indexing or palettes are not supported",
2454 |     "Not enough memory",
2455 |     "I/O error",
2456 |     "Unknown error"};
2457 | #endif // JEBP_IMPLEMENTATION
2458 | 


--------------------------------------------------------------------------------
/jebpview.c:
--------------------------------------------------------------------------------
 1 | #define JEBP_IMPLEMENTATION
 2 | #include "jebp.h"
 3 | #include <SDL.h>
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | 
 7 | #define VIEW_TITLE_SIZE 1024
 8 | #define VIEW_BLOCK_SIZE 8
 9 | 
10 | static char view_title[VIEW_TITLE_SIZE];
11 | 
12 | // Decodes the WebP file named in argv[1] and shows it over a gray checkerboard
13 | // in an SDL window until SDL_QUIT arrives; returns EXIT_FAILURE on any error.
14 | int main(int argc, char **argv) {
15 |     if (argc != 2) {
16 |         fprintf(stderr, "Usage: jebpview <WebP file>\n");
17 |         return EXIT_FAILURE;
18 |     }
19 | 
20 |     jebp_image_t image;
21 |     jebp_error_t err = jebp_read(&image, argv[1]);
22 |     if (err != JEBP_OK) {
23 |         fprintf(stderr, "Failed to read image: %s\n", jebp_error_string(err));
24 |         return EXIT_FAILURE;
25 |     }
26 |     if (SDL_Init(SDL_INIT_VIDEO) != 0) {
27 |         fprintf(stderr, "Failed to initialize SDL: %s\n", SDL_GetError());
28 |         jebp_free_image(&image);
29 |         return EXIT_FAILURE;
30 |     }
31 | 
32 |     snprintf(view_title, VIEW_TITLE_SIZE, "jebpview - %s", argv[1]);
33 |     SDL_Window *window = SDL_CreateWindow(
34 |         view_title, SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, image.width,
35 |         image.height, SDL_WINDOW_ALLOW_HIGHDPI | SDL_WINDOW_HIDDEN);
36 |     if (window == NULL) {
37 |         fprintf(stderr, "Failed to create window: %s\n", SDL_GetError());
38 |         SDL_Quit();
39 |         jebp_free_image(&image);
40 |         return EXIT_FAILURE;
41 |     }
42 | 
43 |     SDL_Surface *window_surface = SDL_GetWindowSurface(window);
44 |     if (window_surface == NULL) {
45 |         // SDL_GetWindowSurface returns NULL on failure; stop here rather than
46 |         // dereference it for the pixel format below.
47 |         fprintf(stderr, "Failed to get window surface: %s\n", SDL_GetError());
48 |         SDL_DestroyWindow(window);
49 |         SDL_Quit();
50 |         jebp_free_image(&image);
51 |         return EXIT_FAILURE;
52 |     }
53 |     // Checkerboard: fill with the light gray, then paint the two dark squares
54 |     // of every 2x2 group of VIEW_BLOCK_SIZE blocks.
55 |     Uint32 light = SDL_MapRGB(window_surface->format, 0xaa, 0xaa, 0xaa);
56 |     Uint32 dark = SDL_MapRGB(window_surface->format, 0x55, 0x55, 0x55);
57 |     SDL_FillRect(window_surface, NULL, light);
58 |     for (jebp_int y = 0; y < image.height; y += VIEW_BLOCK_SIZE * 2) {
59 |         for (jebp_int x = 0; x < image.width; x += VIEW_BLOCK_SIZE * 2) {
60 |             SDL_Rect rect = {x + VIEW_BLOCK_SIZE, y, VIEW_BLOCK_SIZE,
61 |                              VIEW_BLOCK_SIZE};
62 |             SDL_FillRect(window_surface, &rect, dark);
63 |             rect.x -= VIEW_BLOCK_SIZE;
64 |             rect.y += VIEW_BLOCK_SIZE;
65 |             SDL_FillRect(window_surface, &rect, dark);
66 |         }
67 |     }
68 | 
69 |     // Wrap the decoded pixels as a surface: 32 bits per pixel, rows packed.
70 |     SDL_Surface *image_surface = SDL_CreateRGBSurfaceWithFormatFrom(
71 |         image.pixels, image.width, image.height, 32, image.width * 4,
72 |         SDL_PIXELFORMAT_RGBA32);
73 |     if (image_surface == NULL) {
74 |         fprintf(stderr, "Failed to create surface: %s\n", SDL_GetError());
75 |         SDL_DestroyWindow(window);
76 |         SDL_Quit();
77 |         jebp_free_image(&image);
78 |         return EXIT_FAILURE;
79 |     }
80 |     SDL_BlitSurface(image_surface, NULL, window_surface, NULL);
81 |     SDL_UpdateWindowSurface(window);
82 |     SDL_ShowWindow(window);
83 | 
84 |     // Block until the quit event; SDL_WaitEvent returning 0 is an error.
85 |     int ok;
86 |     SDL_Event event;
87 |     do {
88 |         ok = SDL_WaitEvent(&event);
89 |     } while (ok && event.type != SDL_QUIT);
90 |     if (!ok) {
91 |         fprintf(stderr, "Failed to get event: %s\n", SDL_GetError());
92 |     }
93 |     SDL_FreeSurface(image_surface);
94 |     SDL_DestroyWindow(window);
95 |     SDL_Quit();
96 |     jebp_free_image(&image);
97 |     return ok ? EXIT_SUCCESS : EXIT_FAILURE;
98 | }
99 | 


--------------------------------------------------------------------------------