├── .clang-format
├── .clangd
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── cmd
    ├── clean.sh
    ├── dbuild.sh
    ├── dconfigure.sh
    ├── drun.sh
    ├── format.sh
    ├── gen_asmd.sh
    ├── gen_asmr.sh
    ├── out.png
    ├── perf_record.sh
    ├── perf_report.sh
    ├── rbuild.sh
    ├── rconfigure.sh
    ├── rrun.sh
    ├── tbuild.sh
    ├── tconfigure.sh
    └── trun.sh
├── src
    ├── camera.h
    ├── comptime.h
    ├── entry.cpp
    ├── globals.h
    ├── materials.h
    ├── math.h
    ├── rand.h
    ├── render.h
    ├── sphere.h
    ├── types.h
    └── utils.h
├── tests
    ├── CMakeLists.txt
    └── entry.cpp
└── thirdparty
    └── stb_image_write
        └── stb_image_write.h


/.clang-format:
--------------------------------------------------------------------------------
1 | BasedOnStyle: LLVM
2 | IndentWidth: 2
3 | TabWidth: 2
4 | ColumnLimit: 100
5 | NamespaceIndentation: All
6 | BreakBeforeBraces: Attach
7 | PointerAlignment: Left
8 | 


--------------------------------------------------------------------------------
/.clangd:
--------------------------------------------------------------------------------
1 | CompileFlags:
2 |   Add: [-std=c++20] 
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | out/
2 | asm_output.s
3 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.22)
 2 | 
 3 | project(crack-tracer LANGUAGES CXX)
 4 | 
 5 | find_package(SDL2 REQUIRED)
 6 | 
 7 | # Default to an optimized build. Multi-config generators ignore
 8 | # CMAKE_BUILD_TYPE, so only set it for single-config ones.
 9 | get_property(is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
10 | if(NOT is_multi_config AND NOT CMAKE_BUILD_TYPE)
11 |   set(CMAKE_BUILD_TYPE Release)
12 | endif()
13 | 
14 | # The sources use AVX2/FMA intrinsics, which -march=native enables on a capable
15 | # host. When this is OFF, pass the ISA flags yourself (e.g. through
16 | # CMAKE_CXX_FLAGS="-mavx2 -mfma").
17 | option(CRACK_TRACER_NATIVE_ARCH "Compile with -march=native" ON)
18 | 
19 | set(CMAKE_CXX_STANDARD 20)
20 | set(CMAKE_CXX_STANDARD_REQUIRED True)
21 | 
22 | add_executable(${PROJECT_NAME} src/entry.cpp)
23 | 
24 | # Nothing links against an executable, so every requirement is PRIVATE.
25 | target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_20)
26 | 
27 | target_compile_options(
28 |   ${PROJECT_NAME}
29 |   PRIVATE
30 |     "$<$<CONFIG:Debug>:-g;-Wall;-Wextra>"
31 |     "$<$<CONFIG:Release>:-Ofast;-g;-fno-signed-zeros;-flto;-Wall;-Wextra>"
32 |     -Wno-missing-field-initializers
33 |     "$<$<BOOL:${CRACK_TRACER_NATIVE_ARCH}>:-march=native>"
34 | )
35 | 
36 | # -flto has to be given at link time as well as at compile time.
37 | target_link_options(${PROJECT_NAME} PRIVATE "$<$<CONFIG:Release>:-flto>")
38 | 
39 | target_link_libraries(${PROJECT_NAME} PRIVATE SDL2::SDL2 SDL2::SDL2main)
40 | 
41 | target_include_directories(${PROJECT_NAME} SYSTEM
42 |                            PRIVATE thirdparty/stb_image_write)
43 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Alex Lee
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # crack-tracer
 2 | [YouTube video about this!](https://youtu.be/ulmjqD6Y4do)\
 3 | \
 4 | Ray tracer using no GPU acceleration to see how far we can push the limits.\
 5 | \
 6 | Basically me throwing tons of SIMD and cores at the problem
 7 | 
 8 | ![2400_samples](https://github.com/user-attachments/assets/bd71bab7-e520-455d-b5fd-5971e31e7c43)
 9 | 
10 | ![bright_stars_1600](https://github.com/user-attachments/assets/5fe74016-5f4b-4b0a-a72f-6bd36b222cf0)
11 | 
12 | 


--------------------------------------------------------------------------------
/cmd/clean.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | find "$(dirname "$0")/../out/" -mindepth 1 -delete;  # empty out/ (path relative to this script, not the cwd) but keep the directory
3 | 


--------------------------------------------------------------------------------
/cmd/dbuild.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | cd "$(dirname "$0")/../out/debug" && make  # only build if the debug tree exists
3 | 


--------------------------------------------------------------------------------
/cmd/dconfigure.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -S "$(dirname "$0")/.." -B "$(dirname "$0")/../out/debug"  # paths relative to this script
3 | 


--------------------------------------------------------------------------------
/cmd/drun.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | cd "$(dirname "$0")/../out/debug" && ./crack-tracer  # only run if the debug tree exists
3 | 


--------------------------------------------------------------------------------
/cmd/format.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # got this script from https://github.com/eklitzke/clang-format-all
 3 | #
 4 | # clang-format-all: a tool to run clang-format on an entire project
 5 | # Copyright (C) 2016 Evan Klitzke <evan@eklitzke.org>
 6 | #
 7 | # This program is free software: you can redistribute it and/or modify
 8 | # it under the terms of the GNU General Public License as published by
 9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 | # GNU General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU General Public License
18 | # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 | 
20 | # Variable that will hold the name of the clang-format command
21 | 
22 | DIR="../src"
23 | FMT=""
24 | 
25 | # Some distros just call it clang-format. Others (e.g. Ubuntu) are insistent
26 | # that the version number be part of the command. We prefer clang-format if
27 | # that's present, otherwise we work backwards from highest version to lowest
28 | # version.
29 | for clangfmt in clang-format{,-{20..7},-{6,5,4,3}.{9,8,7,6,5,4,3,2,1,0}}; do  # also try major-only names such as clang-format-14
30 |     if command -v "$clangfmt" &>/dev/null; then
31 |         FMT="$clangfmt"
32 |         break
33 |     fi
34 | done
35 | 
36 | # Check if we found a working clang-format
37 | if [ -z "$FMT" ]; then
38 |     echo "failed to find clang-format"
39 |     exit 1
40 | fi
41 | 
42 | # Check all of the arguments first to make sure they're all directories
43 | for dir in $DIR; do
44 |     if [ ! -d "${dir}" ]; then
45 |         echo "${dir} is not a directory" >&2
46 |         exit 1  # this script defines no `usage` function, so fail explicitly instead of calling it
47 |     fi
48 | done
49 | 
50 | # Find a dominating file, starting from a given directory and going up.
51 | find-dominating-file() {
52 |     if [ -r "$1"/"$2" ]; then
53 |         return 0
54 |     fi
55 |     if [ "$1" = "/" ]; then
56 |         return 1
57 |     fi
58 |     find-dominating-file "$(realpath "$1"/..)" "$2"
59 |     return $?
60 | }
61 | 
62 | # Run clang-format -i on all of the things
63 | for dir in $DIR; do
64 |     pushd "${dir}" &>/dev/null
65 |     if ! find-dominating-file . .clang-format; then
66 |         echo "Failed to find dominating .clang-format starting at $PWD"
67 |         continue
68 |     fi
69 |     find . \
70 |          \( -name '*.c' \
71 |          -o -name '*.cc' \
72 |          -o -name '*.cpp' \
73 |          -o -name '*.h' \
74 |          -o -name '*.hh' \
75 |          -o -name '*.hpp' \) \
76 |          -exec "${FMT}" -i '{}' \;
77 |     popd &>/dev/null
78 | done
79 | 


--------------------------------------------------------------------------------
/cmd/gen_asmd.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | objdump -d -S -Mintel "$(dirname "$0")/../out/debug/crack-tracer" > "$(dirname "$0")/../asm_output.s"  # paths relative to this script
3 | 


--------------------------------------------------------------------------------
/cmd/gen_asmr.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | objdump -d -S -Mintel "$(dirname "$0")/../out/release/crack-tracer" > "$(dirname "$0")/../asm_output.s"  # paths relative to this script
3 | 


--------------------------------------------------------------------------------
/cmd/out.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imalexlee/crack-tracer/b8a93a1f510cd3775c416c370959110acf3a8e2e/cmd/out.png


--------------------------------------------------------------------------------
/cmd/perf_record.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | sudo perf record --call-graph dwarf -o perf.data ../out/release/crack-tracer  # perf options must precede the program, or they are passed to it
3 | 
4 | 


--------------------------------------------------------------------------------
/cmd/perf_report.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | sudo perf report -Mintel -i perf.data  # -i names the input file; a bare positional argument is treated as a symbol filter
3 | 


--------------------------------------------------------------------------------
/cmd/rbuild.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | cd "$(dirname "$0")/../out/release" && make  # only build if the release tree exists
3 | 


--------------------------------------------------------------------------------
/cmd/rconfigure.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=1 -S "$(dirname "$0")/.." -B "$(dirname "$0")/../out/release"  # paths relative to this script
3 | 


--------------------------------------------------------------------------------
/cmd/rrun.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | cd "$(dirname "$0")/../out/release" && ./crack-tracer  # only run if the release tree exists
3 | 


--------------------------------------------------------------------------------
/cmd/tbuild.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | cd "$(dirname "$0")/../out/test" && make  # only build if the test tree exists
3 | 


--------------------------------------------------------------------------------
/cmd/tconfigure.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | cmake -DCMAKE_BUILD_TYPE=Debug -S "$(dirname "$0")/../tests/" -B "$(dirname "$0")/../out/test/"  # paths relative to this script
3 | 


--------------------------------------------------------------------------------
/cmd/trun.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | cd "$(dirname "$0")/../out/test" && ./crack-tracer-tests  # only run if the test tree exists
3 | 


--------------------------------------------------------------------------------
/src/camera.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "types.h"
 3 | #include <SDL_events.h>
 4 | #include <SDL_keycode.h>
 5 | #include <cstdint>
 6 | 
 7 | // Camera position that is moved along the x and z axes by W/A/S/D key events.
 8 | class Camera {
 9 | public:
10 |   Vec3 origin = {-1.2, 1, 5};
11 | 
12 |   // Updates the velocity from a key press or release; other event types are ignored.
13 |   // W/S drive the z axis (W is negative z), A/D drive the x axis (A is negative x).
14 |   // Releasing any of the four keys zeroes that key's axis.
15 |   void register_key_event(SDL_Event e) {
16 |     const uint32_t key = e.key.keysym.sym;
17 |     const bool pressed = e.type == SDL_KEYDOWN;
18 |     if (!pressed && e.type != SDL_KEYUP) {
19 |       return;
20 |     }
21 | 
22 |     switch (key) {
23 |     case SDLK_w:
24 |       velocity.z = pressed ? -speed : 0;
25 |       break;
26 |     case SDLK_s:
27 |       velocity.z = pressed ? speed : 0;
28 |       break;
29 |     case SDLK_a:
30 |       velocity.x = pressed ? -speed : 0;
31 |       break;
32 |     case SDLK_d:
33 |       velocity.x = pressed ? speed : 0;
34 |       break;
35 |     default:
36 |       break;
37 |     }
38 |   }
39 | 
40 |   // Advances the position by one step of the current velocity.
41 |   void update() {
42 |     origin.x += velocity.x;
43 |     origin.y += velocity.y;
44 |     origin.z += velocity.z;
45 |   }
46 | 
47 | private:
48 |   Vec3 velocity = {0, 0, 0};
49 |   const float speed = 0.01f;
50 | };
51 | 


--------------------------------------------------------------------------------
/src/comptime.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "globals.h"
 3 | #include "types.h"
 4 | 
 5 | namespace comptime {
 6 |   // gets us the first pixels row of sample directions during compile time.
 7 |   //
 8 |   // subsequent render iterations will simply scale this by
 9 |   // row and column index to find where to take samples
10 |   consteval Vec3_256 init_ray_directions() {
11 |     // viewport corner: half a viewport left of and above cam_origin, focal_len along -z
12 |     Vec3 top_left{
13 |         .x = global::cam_origin[0] - global::viewport_width / 2,
14 |         .y = global::cam_origin[1] + global::viewport_height / 2,
15 |         .z = global::cam_origin[2] - global::focal_len,
16 |     };
17 |     // step one sample spacing in from the corner on both axes
18 |     top_left.x += global::sample_du;
19 |     top_left.y += global::sample_dv;
20 |     // eight x coordinates, each one sample_du further than the previous
21 |     alignas(32) float x_arr[8];
22 |     x_arr[0] = top_left.x;
23 |     for (int i = 1; i < 8; i++) {
24 |       x_arr[i] = x_arr[i - 1] + global::sample_du;
25 |     }
26 |     // x differs per lane; y and z hold the same value in all eight lanes
27 |     Vec3_256 init_dirs = {
28 |         .x = {x_arr[0], x_arr[1], x_arr[2], x_arr[3], x_arr[4], x_arr[5], x_arr[6], x_arr[7]},
29 |         .y = {top_left.y, top_left.y, top_left.y, top_left.y, top_left.y, top_left.y, top_left.y,
30 |               top_left.y},
31 |         .z = {top_left.z, top_left.z, top_left.z, top_left.z, top_left.z, top_left.z, top_left.z,
32 |               top_left.z},
33 | 
34 |     };
35 | 
36 |     return init_dirs;
37 |   }
38 |   consteval __m256i init_rseed_arr() { // eight compile-time seeds: 0, then an LCG chain from it
39 |     uint32_t rseed_arr[8];
40 |     rseed_arr[0] = 0;
41 |     for (size_t i = 1; i < 8; i++) {
42 |       rseed_arr[i] = (rseed_arr[i - 1] * 11035152453u + 12345u) & RAND_MAX; // NOTE(review): the common LCG multiplier is 1103515245 - possible typo, confirm
43 |     }
44 |     __m256 rseed_vec = {
45 |         (float)rseed_arr[0], (float)rseed_arr[1], (float)rseed_arr[2], (float)rseed_arr[3],
46 |         (float)rseed_arr[4], (float)rseed_arr[5], (float)rseed_arr[6], (float)rseed_arr[7],
47 |     };
48 |     // NOTE(review): presumably this vector cast reinterprets the float bits rather than converting values - confirm
49 |     return (__m256i)rseed_vec;
50 |   }
51 | 
52 | }; // namespace comptime
53 | 


--------------------------------------------------------------------------------
/src/entry.cpp:
--------------------------------------------------------------------------------
 1 | #include "globals.h"
 2 | #include "render.h"
 3 | 
 4 | int main() { // runs the renderer selected at compile time by global::active_render_mode
 5 |   if constexpr (global::active_render_mode == RenderMode::real_time) {
 6 |     render_realtime();
 7 |   } else if constexpr (global::active_render_mode == RenderMode::png) {
 8 |     render_png();
 9 |   }
10 | }
11 | 


--------------------------------------------------------------------------------
/src/globals.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "immintrin.h"
 3 | #include "types.h"
 4 | #include <cfloat>
 5 | #include <cstdint>
 6 | 
 7 | enum RenderMode { // which renderer main() runs; chosen through global::active_render_mode
 8 |   png,
 9 |   real_time,
10 | };
11 | 
12 | constexpr Color_256 night = { // one colour repeated in all 8 lanes; presumably x/y/z are r/g/b (a dark blue) - confirm
13 |     .x = {0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02},
14 |     .y = {0.08, 0.08, 0.08, 0.08, 0.08, 0.08, 0.08, 0.08},
15 |     .z = {0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35},
16 | };
17 | 
18 | namespace global {
19 |   // each group calculates 8 samples.
20 |   constexpr uint16_t sample_group_num = 500;
21 |   constexpr uint8_t cmpeq = 0;  // predicate immediates for _mm256_cmp_ps: equal
22 |   constexpr uint8_t cmplt = 1;  // less-than
23 |   constexpr uint8_t cmple = 2;  // less-than-or-equal
24 |   constexpr uint8_t cmpneq = 4; // not-equal
25 |   constexpr uint8_t cmpnlt = 5; // not-less-than
26 |   constexpr uint8_t cmpnle = 6; // not-less-than-or-equal
27 |   constexpr uint8_t shuf_all_first = 0;    // 0b00000000
28 |   constexpr uint8_t shuf_all_second = 85;  // 0b01010101
29 |   constexpr uint8_t shuf_all_third = 170;  // 0b10101010; presumably shuffle immediates picking one element for every slot - usage not visible here
30 |   constexpr uint8_t ray_depth = 20;
31 |   constexpr float float_max = FLT_MAX;
32 |   constexpr uint16_t img_width = 2560;
33 |   constexpr uint16_t img_height = 1440;
34 |   // constexpr uint16_t img_width = 1920;
35 |   // constexpr uint16_t img_height = 1080;
36 |   constexpr float viewport_height = 2.f;
37 |   constexpr float viewport_width = viewport_height * (float(img_width) / img_height); // keeps the image aspect ratio
38 |   constexpr float pix_du = viewport_width / img_width;    // viewport distance per pixel column
39 |   constexpr float pix_dv = -viewport_height / img_height; // per pixel row; negative
40 |   // 8 evenly spread out ray directions. (space-around)
41 |   constexpr float sample_du = pix_du / 9;
42 |   constexpr float sample_dv = pix_dv / (sample_group_num + 1);
43 |   constexpr float focal_len = 1.0;
44 |   constexpr float color_multiplier = 255.f / (sample_group_num * 8); // 255 divided by the total sample count (groups * 8)
45 |   constexpr uint8_t thread_count = 60; // NOTE(review): hard-coded; presumably tuned for the author's machine
46 |   constexpr RenderMode active_render_mode = RenderMode::png;
47 |   // index of refraction
48 |   constexpr float ir = 1.5;
49 |   // ir in all 8 lanes, followed by its reciprocal as a scalar and in all 8 lanes
50 |   constexpr __m256 ir_vec = {ir, ir, ir, ir, ir, ir, ir, ir};
51 |   constexpr float rcp_ir = 1.f / ir;
52 |   constexpr __m256 rcp_ir_vec = {rcp_ir, rcp_ir, rcp_ir, rcp_ir, rcp_ir, rcp_ir, rcp_ir, rcp_ir};
53 |   alignas(32) constexpr float cam_origin[4] = {0.f, 0.f, 0.0f, 0.f};
54 |   const __m256 all_set = _mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(), cmpeq); // all-ones mask: 0 == 0 is true in every lane
55 |   constexpr __m256 zeros = {0, 0, 0, 0, 0, 0, 0, 0};
56 |   constexpr __m256 white = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f}; // also used as the constant 1.0 in arithmetic
57 |   constexpr float t_min = 0.0013f; // small epsilon; materials.h uses it as the near-zero threshold
58 |   constexpr __m256 t_min_vec = {t_min, t_min, t_min, t_min, t_min, t_min, t_min, t_min};
59 |   constexpr Color_256 background_color = {.x = white, .y = white, .z = white};
60 | } // namespace global
61 | 


--------------------------------------------------------------------------------
/src/materials.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include "globals.h"
  3 | #include "math.h"
  4 | #include "rand.h"
  5 | #include "types.h"
  6 | #include <cmath>
  7 | #include <cstdlib>
  8 | #include <immintrin.h>
  9 | 
 10 | constexpr Color silver = {.x = 0.5f, .y = 0.5f, .z = 0.5f};
 11 | constexpr Color grey = {.x = 0.5f, .y = 0.5f, .z = 0.5f}; // same values as silver
 12 | constexpr Color white = {.x = 1.f, .y = 1.f, .z = 1.f};   // scalar Color; distinct from the __m256 global::white
 13 | constexpr Color red = {.x = 0.90f, .y = 0.20f, .z = 0.20f};
 14 | constexpr Color gold = {.x = 0.90f, .y = 0.75f, .z = 0.54f};
 15 | constexpr Color copper = {.x = 0.59f, .y = 0.34f, .z = 0.29f};
 16 | constexpr Color green = {.x = 0.f, .y = 1.f, .z = 0.f};
 17 | constexpr Color moon = {.x = 100.f, .y = 100.f, .z = 100.f}; // far above 1.0; presumably acts as a light source - confirm
 18 | 
 19 | constexpr Material silver_metallic = {.atten = silver, .type = MatType::metallic};
 20 | constexpr Material red_metallic = {.atten = red, .type = MatType::metallic};
 21 | constexpr Material gold_metallic = {.atten = gold, .type = MatType::metallic};
 22 | constexpr Material copper_metallic = {.atten = copper, .type = MatType::metallic};
 23 | constexpr Material green_metallic = {.atten = green, .type = MatType::metallic};
 24 | 
 25 | constexpr Material silver_lambertian = {.atten = silver, .type = MatType::lambertian};
 26 | constexpr Material red_lambertian = {.atten = red, .type = MatType::lambertian};
 27 | constexpr Material gold_lambertian = {.atten = gold, .type = MatType::lambertian};
 28 | constexpr Material star_lambertian = {.atten = moon, .type = MatType::lambertian}; // uses the over-bright moon colour
 29 | constexpr Material grey_lambertian = {.atten = grey, .type = MatType::lambertian};
 30 | 
 31 | constexpr Material glass = {.atten = white, .type = MatType::dielectric};
 32 | 
 33 | alignas(32) static const int metallic_types[8] = { // one MatType per lane; loaded as a __m256i in scatter()
 34 |     MatType::metallic, MatType::metallic, MatType::metallic, MatType::metallic,
 35 |     MatType::metallic, MatType::metallic, MatType::metallic, MatType::metallic,
 36 | };
 37 | // same layout, for MatType::lambertian
 38 | alignas(32) static const int lambertian_types[8] = {
 39 |     MatType::lambertian, MatType::lambertian, MatType::lambertian, MatType::lambertian,
 40 |     MatType::lambertian, MatType::lambertian, MatType::lambertian, MatType::lambertian,
 41 | };
 42 | // same layout, for MatType::dielectric
 43 | alignas(32) static const int dielectric_types[8] = {
 44 |     MatType::dielectric, MatType::dielectric, MatType::dielectric, MatType::dielectric,
 45 |     MatType::dielectric, MatType::dielectric, MatType::dielectric, MatType::dielectric,
 46 | };
 47 | 
 48 | static LCGRand lcg_rand; // NOTE(review): a single generator per translation unit; if render threads share it, confirm LCGRand tolerates concurrent use
 49 | inline static void scatter_metallic(RayCluster* rays, const HitRecords* hit_rec) {
 50 |   // Mirror the incoming direction about the surface normal and normalize it.
 51 |   Vec3_256 bounce = reflect(&rays->dir, &hit_rec->norm);
 52 |   normalize(&bounce);
 53 |   // Keep a lane only where the bounce has a positive dot with the normal; other lanes become zero.
 54 |   const __m256 facing = _mm256_cmp_ps(dot(&bounce, &hit_rec->norm), global::zeros, global::cmpnle);
 55 |   rays->dir = bounce & facing;
 56 | };
 57 | 
 58 | [[nodiscard]] inline static __m256 near_zero(const Vec3_256* vec) {
 59 |   // mask is set for a lane only when |x|, |y| and |z| are all below t_min
 60 |   const __m256 limit = global::t_min_vec;
 61 |   __m256 small = _mm256_cmp_ps(abs_256(vec->x), limit, global::cmplt);
 62 |   small = _mm256_and_ps(small, _mm256_cmp_ps(abs_256(vec->y), limit, global::cmplt));
 63 |   return _mm256_and_ps(small, _mm256_cmp_ps(abs_256(vec->z), limit, global::cmplt));
 64 | };
 65 | 
 66 | inline static void scatter_lambertian(RayCluster* rays, const HitRecords* hit_rec) {
 67 |   Vec3_256 rand_vec = lcg_rand.random_unit_vec();
 68 |   Vec3_256 scatter_dir = rand_vec + hit_rec->norm; // new direction = surface normal + random unit vector
 69 |   // the near-zero fallback to the normal (next line) is disabled, so scatter_dir is used as-is
 70 |   //  rays->dir = blend_vec256(&scatter_dir, &hit_rec->norm, near_zero(&scatter_dir));
 71 |   rays->dir = scatter_dir;
 72 | }
 73 | 
 74 | [[nodiscard]] inline static __m256 reflectance(__m256 cos, __m256 ref_idx) {
 75 |   // Schlick's approximation: r0 + (1 - r0) * (1 - cos)^5,
 76 |   // where r0 = ((1 - ref_idx) / (1 + ref_idx))^2
 77 |   const __m256 one = global::white;
 78 |   // the division is a multiply by the approximate reciprocal
 79 |   __m256 r0 = (one - ref_idx) * _mm256_rcp_ps(one + ref_idx);
 80 |   r0 *= r0;
 81 | 
 82 |   // (1 - cos)^5 by repeated multiplication
 83 |   const __m256 base = one - cos;
 84 |   __m256 pow5 = base * base;
 85 |   pow5 *= base;
 86 |   pow5 *= base;
 87 |   pow5 *= base;
 88 | 
 89 |   return _mm256_fmadd_ps(one - r0, pow5, r0);
 90 | }
 91 | 
 92 | inline static void scatter_dielectric(RayCluster* rays, const HitRecords* hit_rec) {
 93 |   // Per lane, refract or reflect rays->dir. ri is 1/ir where front_face is set, ir elsewhere.
 94 |   __m256 ri = _mm256_blendv_ps(global::ir_vec, global::rcp_ir_vec, hit_rec->front_face);
 95 |   Vec3_256 unit_dir = rays->dir;
 96 |   normalize(&unit_dir);
 97 | 
 98 |   Vec3_256 inverse_unit_dir = -unit_dir;
 99 |   // cosine between the reversed ray and the normal, capped at 1
100 |   __m256 cos_theta = dot(&inverse_unit_dir, &hit_rec->norm);
101 |   cos_theta = _mm256_min_ps(cos_theta, global::white);
102 | 
103 |   __m256 sin_theta = _mm256_sqrt_ps(global::white - cos_theta * cos_theta);
104 |   // a lane can refract only where ri * sin_theta <= 1
105 |   __m256 can_refract = ri * sin_theta;
106 |   can_refract = _mm256_cmp_ps(can_refract, global::white, global::cmple);
107 |   // refract where that holds and the reflectance is <= a value drawn by rand_in_range_256(0, 1)
108 |   __m256 ref = reflectance(cos_theta, ri);
109 |   __m256 rand_vec = lcg_rand.rand_in_range_256(0.f, 1.f);
110 |   __m256 low_reflectance_loc = _mm256_cmp_ps(ref, rand_vec, global::cmple);
111 |   __m256 refraction_loc = _mm256_and_ps(can_refract, low_reflectance_loc);
112 |   __m256 reflection_loc = _mm256_xor_ps(refraction_loc, global::all_set); // complement of refraction_loc
113 |   // each branch below is skipped when none of its lanes are selected
114 |   if (!_mm256_testz_ps(refraction_loc, refraction_loc)) {
115 |     Vec3_256 refract_dir = refract(&unit_dir, &hit_rec->norm, ri);
116 |     rays->dir = blend_vec256(&rays->dir, &refract_dir, refraction_loc);
117 |   }
118 |   if (!_mm256_testz_ps(reflection_loc, reflection_loc)) {
119 |     Vec3_256 reflect_dir = reflect(&unit_dir, &hit_rec->norm);
120 |     // NOTE(review): reflection is applied to front-face lanes only; other reflect lanes keep rays->dir - confirm intended
121 |     reflection_loc = _mm256_and_ps(reflection_loc, hit_rec->front_face);
122 |     rays->dir = blend_vec256(&rays->dir, &reflect_dir, reflection_loc);
123 |   }
124 | }
125 | 
126 | inline static void scatter(RayCluster* rays, const HitRecords* hit_rec) { // per-lane dispatch on material type
127 |   __m256i metallic_type = _mm256_load_si256((__m256i*)metallic_types);
128 |   __m256i lambertian_type = _mm256_load_si256((__m256i*)lambertian_types);
129 |   __m256i dielectric_type = _mm256_load_si256((__m256i*)dielectric_types);
130 |   // one mask per material: all bits set in the lanes whose hit material has that type
131 |   __m256i metallic_loc = _mm256_cmpeq_epi32(hit_rec->mat.type, metallic_type);
132 |   __m256i lambertian_loc = _mm256_cmpeq_epi32(hit_rec->mat.type, lambertian_type);
133 |   __m256i dielectric_loc = _mm256_cmpeq_epi32(hit_rec->mat.type, dielectric_type);
134 |   // each scatter routine runs on a copy of all 8 lanes; only its own lanes are blended back
135 |   if (!_mm256_testz_si256(metallic_loc, metallic_loc)) {
136 |     RayCluster metallic_rays = {
137 |         .dir = rays->dir,
138 |         .orig = hit_rec->orig,
139 |     };
140 |     scatter_metallic(&metallic_rays, hit_rec);
141 | 
142 |     rays->dir = blend_vec256(&rays->dir, &metallic_rays.dir, (__m256)metallic_loc);
143 |     rays->orig = blend_vec256(&rays->orig, &metallic_rays.orig, (__m256)metallic_loc);
144 |   }
145 |   if (!_mm256_testz_si256(lambertian_loc, lambertian_loc)) {
146 |     RayCluster lambertian_rays = {
147 |         .dir = rays->dir,
148 |         .orig = hit_rec->orig,
149 |     };
150 |     scatter_lambertian(&lambertian_rays, hit_rec);
151 | 
152 |     rays->dir = blend_vec256(&rays->dir, &lambertian_rays.dir, (__m256)lambertian_loc);
153 |     rays->orig = blend_vec256(&rays->orig, &lambertian_rays.orig, (__m256)lambertian_loc);
154 |   }
155 |   if (!_mm256_testz_si256(dielectric_loc, dielectric_loc)) {
156 |     RayCluster dielectric_rays = {
157 |         .dir = rays->dir,
158 |         .orig = hit_rec->orig,
159 |     };
160 |     scatter_dielectric(&dielectric_rays, hit_rec);
161 | 
162 |     rays->dir = blend_vec256(&rays->dir, &dielectric_rays.dir, (__m256)dielectric_loc);
163 |     rays->orig = blend_vec256(&rays->orig, &dielectric_rays.orig, (__m256)dielectric_loc);
164 |   }
165 | }
166 | 


--------------------------------------------------------------------------------
/src/math.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include "globals.h"
  3 | #include "types.h"
  4 | #include <cstdint>
  5 | #include <cstdio>
  6 | #include <immintrin.h>
  7 | 
  8 | // OPERATORS
  9 | 
 10 | inline static Vec3_256 operator+(const Vec3_256& a, const Vec3_256& b) {
 11 |   // component-wise sum, lane by lane
 12 |   const __m256 sum_x = a.x + b.x;
 13 |   const __m256 sum_y = a.y + b.y;
 14 |   const __m256 sum_z = a.z + b.z;
 15 |   return Vec3_256{.x = sum_x, .y = sum_y, .z = sum_z};
 16 | }
 17 | 
 18 | inline static Vec3_256 operator+(const Vec3_256& a, const __m256& b) {
 19 |   // the same 8-lane value b is added to every component
 20 |   const __m256 sum_x = a.x + b;
 21 |   const __m256 sum_y = a.y + b;
 22 |   const __m256 sum_z = a.z + b;
 23 |   return Vec3_256{.x = sum_x, .y = sum_y, .z = sum_z};
 24 | }
 25 | 
 26 | inline static Vec3_256& operator+=(Vec3_256& a, const Vec3_256& b) {
 27 |   a.x += b.x;
 28 |   a.y += b.y;
 29 |   a.z += b.z;
 30 |   return a; // component-wise add done in place; a is returned for chaining
 31 | }
 32 | 
 33 | inline static Vec3_256 operator-(const Vec3_256& a, const Vec3_256& b) {
 34 |   // component-wise difference a - b, lane by lane
 35 |   const __m256 diff_x = a.x - b.x;
 36 |   const __m256 diff_y = a.y - b.y;
 37 |   const __m256 diff_z = a.z - b.z;
 38 |   return Vec3_256{.x = diff_x, .y = diff_y, .z = diff_z};
 39 | }
 40 | 
 41 | inline static Vec3_256& operator-=(Vec3_256& a, const Vec3_256& b) {
 42 |   a.x -= b.x;
 43 |   a.y -= b.y;
 44 |   a.z -= b.z;
 45 |   return a; // component-wise subtract done in place; a is returned for chaining
 46 | }
 47 | 
 48 | // negation
 49 | inline static Vec3_256 operator-(const Vec3_256& a) {
 50 |   // flip the sign by scaling every component with a vector of -1 (built as 0 - 1)
 51 |   const __m256 minus_one = _mm256_setzero_ps() - global::white;
 52 |   Vec3_256 negated = a;
 53 |   negated.x *= minus_one;
 54 |   negated.y *= minus_one;
 55 |   negated.z *= minus_one;
 56 |   return negated;
 57 | }
 58 | 
 59 | inline static Vec3_256 operator*(const Vec3_256& a, const Vec3_256& b) {
 60 |   // component-wise (Hadamard) product, lane by lane
 61 |   const __m256 prod_x = a.x * b.x;
 62 |   const __m256 prod_y = a.y * b.y;
 63 |   const __m256 prod_z = a.z * b.z;
 64 |   return Vec3_256{.x = prod_x, .y = prod_y, .z = prod_z};
 65 | }
 66 | 
 67 | inline static Vec3_256 operator*(const Vec3_256& a, const __m256& b) {
 68 |   // every component is scaled by the same per-lane factor b
 69 |   const __m256 prod_x = a.x * b;
 70 |   const __m256 prod_y = a.y * b;
 71 |   const __m256 prod_z = a.z * b;
 72 |   return Vec3_256{.x = prod_x, .y = prod_y, .z = prod_z};
 73 | }
 74 | 
 75 | inline static Vec3_256& operator*=(Vec3_256& a, const Vec3_256& b) {
 76 |   a.x *= b.x;
 77 |   a.y *= b.y;
 78 |   a.z *= b.z;
 79 |   return a; // component-wise multiply done in place; a is returned for chaining
 80 | }
 81 | 
 82 | inline static Vec3_256& operator*=(Vec3_256& a, const __m256& b) {
 83 |   a.x *= b;
 84 |   a.y *= b;
 85 |   a.z *= b;
 86 |   return a; // all three components scaled in place by the per-lane factor b
 87 | }
 88 | 
 89 | inline static Vec3_256 operator/(const Vec3_256& a, const Vec3_256& b) {
 90 |   // Approximate division: multiply by the reciprocal estimate from
 91 |   // _mm256_rcp_ps instead of issuing a true divide, so the quotient
 92 |   // carries that estimate's reduced precision.
 93 |   const __m256 inv_x = _mm256_rcp_ps(b.x);
 94 |   const __m256 inv_y = _mm256_rcp_ps(b.y);
 95 |   const __m256 inv_z = _mm256_rcp_ps(b.z);
 96 | 
 97 |   Vec3_256 quotient;
 98 |   quotient.x = a.x * inv_x;
 99 |   quotient.y = a.y * inv_y;
100 |   quotient.z = a.z * inv_z;
101 |   return quotient;
102 | }
103 | 
104 | inline static Vec3_256& operator/=(Vec3_256& a, const __m256& b) {
105 |   // one reciprocal estimate of b, shared by all three components
106 |   const __m256 inv_b = _mm256_rcp_ps(b);
107 | 
108 |   a.x *= inv_b;
109 |   a.y *= inv_b;
110 |   a.z *= inv_b;
111 |   return a;
112 | }
113 | 
114 | inline static Vec3_256 operator&(const Vec3_256& a, const __m256& b) {
115 |   // bitwise AND of every component with the same mask b
116 |   const __m256 kept_x = _mm256_and_ps(a.x, b);
117 |   const __m256 kept_y = _mm256_and_ps(a.y, b);
118 |   const __m256 kept_z = _mm256_and_ps(a.z, b);
119 |   return Vec3_256{.x = kept_x, .y = kept_y, .z = kept_z};
120 | }
121 | 
122 | inline static Vec3_256& operator&=(Vec3_256& a, const __m256& b) {
123 |   a.x = _mm256_and_ps(a.x, b);
124 |   a.y = _mm256_and_ps(a.y, b);
125 |   a.z = _mm256_and_ps(a.z, b);
126 |   return a; // every component bitwise-ANDed with mask b in place
127 | }
128 | 
129 | [[nodiscard]] inline static __m256 dot(const Vec3_256* a, const Vec3_256* b) {
130 |   const __m256 xx = _mm256_mul_ps(a->x, b->x);
131 |   const __m256 xx_yy = _mm256_fmadd_ps(a->y, b->y, xx);
132 |   return _mm256_fmadd_ps(a->z, b->z, xx_yy); // ax*bx + ay*by + az*bz, one result per lane
133 | }
134 | 
135 | // mirror ray_dir about axis, per lane:
136 | // r = v - 2*dot(v,n)*n
137 | [[nodiscard]] inline static Vec3_256 reflect(const Vec3_256* ray_dir, const Vec3_256* axis) {
138 |   constexpr __m256 two = {2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f, 2.f};
139 |   Vec3_256 offset = *axis * dot(ray_dir, axis);
140 |   offset *= two;
141 |   return *ray_dir - offset;
142 | }
143 | 
144 | [[nodiscard]] inline static __m256 abs_256(__m256 vec) {
145 |   __m256i sign_mask = _mm256_srli_epi32((__m256i)global::all_set, 1); // 0x7FFFFFFF per lane: every bit except the sign bit
146 |   return _mm256_and_ps(vec, (__m256)sign_mask); // clearing the sign bit yields |vec|
147 | }
148 | 
149 | [[nodiscard]] inline static Vec3_256 refract(const Vec3_256* ray_dir, const Vec3_256* norm,
150 |                                              __m256 ratio) {
151 |   // cos_theta = min(dot(-ray_dir, norm), 1)
152 |   Vec3_256 inverted_ray_dir = -*ray_dir;
153 |   __m256 cos_theta = _mm256_min_ps(dot(&inverted_ray_dir, norm), global::white);
154 |   // perpendicular part of the refracted ray: ratio * (ray_dir + cos_theta * norm)
155 |   Vec3_256 r_out_perp = {
156 |       .x = _mm256_fmadd_ps(cos_theta, norm->x, ray_dir->x),
157 |       .y = _mm256_fmadd_ps(cos_theta, norm->y, ray_dir->y),
158 |       .z = _mm256_fmadd_ps(cos_theta, norm->z, ray_dir->z),
159 |   };
160 |   r_out_perp *= ratio;
161 |   // parallel part: -sqrt(|1 - dot(r_out_perp, r_out_perp)|) * norm
162 |   __m256 r_out_parallel_scale = global::white - dot(&r_out_perp, &r_out_perp);
163 | 
164 |   r_out_parallel_scale = abs_256(r_out_parallel_scale);
165 | 
166 |   // square root then negate: sqrt(s) is computed as s * rsqrt(s) using the approximate reciprocal square root
167 |   __m256 parallel_scale_rsqrt = _mm256_rsqrt_ps(r_out_parallel_scale);
168 |   r_out_parallel_scale *= -parallel_scale_rsqrt;
169 |   // result = r_out_perp + r_out_parallel_scale * norm
170 |   return Vec3_256{
171 |       .x = _mm256_fmadd_ps(r_out_parallel_scale, norm->x, r_out_perp.x),
172 |       .y = _mm256_fmadd_ps(r_out_parallel_scale, norm->y, r_out_perp.y),
173 |       .z = _mm256_fmadd_ps(r_out_parallel_scale, norm->z, r_out_perp.z),
174 |   };
175 | }
176 | // Rescales *vec in place to approximately unit length.
177 | inline static void normalize(Vec3_256* vec) {
178 |   // _mm256_rsqrt_ps is a fast estimate of 1/sqrt(x), so the result is not exact
179 |   const __m256 inv_len = _mm256_rsqrt_ps(dot(vec, vec));
180 | 
181 |   *vec *= inv_len;
182 | }
183 | // Replicates each component of a scalar Vec3 into all eight lanes of a Vec3_256.
184 | inline static Vec3_256 broadcast_vec(const Vec3* vec) {
185 |   Vec3_256 wide;
186 |   wide.x = _mm256_set1_ps(vec->x);
187 |   wide.y = _mm256_set1_ps(vec->y);
188 |   wide.z = _mm256_set1_ps(vec->z);
189 |   return wide;
190 | }
191 | // Per-lane select: a lane takes b's value when the mask lane's top bit is set, else a's.
192 | inline static Vec3_256 blend_vec256(const Vec3_256* a, const Vec3_256* b, __m256 mask) {
193 |   Vec3_256 picked;
194 |   picked.x = _mm256_blendv_ps(a->x, b->x, mask);
195 |   picked.y = _mm256_blendv_ps(a->y, b->y, mask);
196 |   picked.z = _mm256_blendv_ps(a->z, b->z, mask);
197 |   return picked;
198 | }
199 | // Adds 2^23 to f_val, converts the sum to uint32_t and keeps its low 23 bits.
200 | inline static uint32_t f_to_i(float f_val) {
201 |   const float shifted = f_val + (1 << 23);
202 |   return static_cast<uint32_t>(shifted) & 0x007FFFFF;
203 | }
204 | 


--------------------------------------------------------------------------------
/src/rand.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "comptime.h"
 3 | #include "math.h"
 4 | // Linear congruential generator with a scalar stream and an eight-lane stream.
 5 | class LCGRand {
 6 | public:
 7 |   [[nodiscard]] inline Vec3_256 random_unit_vec() {
 8 |     Vec3_256 rand_vec = rand_vec_in_cube();
 9 |     normalize(&rand_vec); // rescale the cube sample to (approximately) unit length
10 |     return rand_vec;
11 |   };
12 |   // Scalar value in [min, max], drawn from the scalar stream.
13 |   [[nodiscard]] inline float rand_in_range(float min, float max) {
14 |     float scale = lcg_rand() * rcp_rand_max;
15 |     float f = min + scale * (max - min);
16 |     return f;
17 |   }
18 |   // Eight values in [min, max], one per lane, drawn from the vector stream.
19 |   [[nodiscard]] inline __m256 rand_in_range_256(float min, float max) {
20 |     __m256i scale_i32 = lcg_rand_256();
21 |     __m256 scale = _mm256_cvtepi32_ps(scale_i32);
22 |     __m256 rcp_rand_max_vec = _mm256_broadcast_ss(&rcp_rand_max);
23 |     scale *= rcp_rand_max_vec;
24 | 
25 |     __m256 min_vec = _mm256_broadcast_ss(&min);
26 |     __m256 max_vec = _mm256_broadcast_ss(&max);
27 |     __m256 range = max_vec - min_vec;
28 | 
29 |     return _mm256_fmadd_ps(scale, range, min_vec);
30 |   }
31 | 
32 | private:
33 |   static inline thread_local __m256i rseed_vec = comptime::init_rseed_arr();
34 |   static inline thread_local uint32_t rseed = 0; // scalar state, one copy per thread
35 |   const __m256i r_a = _mm256_set1_epi32(1103515245); // multiplier, same as scalar lcg_rand()
36 |   const __m256i r_b = _mm256_set1_epi32(12345u);
37 |   const __m256i rand_max_vec = _mm256_set1_epi32(RAND_MAX);
38 |   static constexpr float rcp_rand_max = 1.f / RAND_MAX;
39 |   // Eight random points whose coordinates each lie in [-1, 1].
40 |   [[nodiscard]] inline Vec3_256 rand_vec_in_cube() {
41 |     float min = -1.0;
42 |     float max = 1.0;
43 | 
44 |     Vec3_256 rand_vec = {
45 |         .x = rand_in_range_256(min, max),
46 |         .y = rand_in_range_256(min, max),
47 |         .z = rand_in_range_256(min, max),
48 |     };
49 | 
50 |     return rand_vec;
51 |   }
52 |   // Advances all eight lane states by one step (state * r_a + r_b, masked with RAND_MAX).
53 |   [[nodiscard]] inline __m256i lcg_rand_256() {
54 |     rseed_vec = _mm256_mullo_epi32(rseed_vec, r_a);
55 |     rseed_vec = _mm256_add_epi32(rseed_vec, r_b);
56 |     rseed_vec = _mm256_and_si256(rseed_vec, rand_max_vec);
57 |     return rseed_vec;
58 |   };
59 | 
60 |   // scalar versions of rand generation
61 |   [[nodiscard]] inline int lcg_rand() { return rseed = (rseed * 1103515245u + 12345u) & RAND_MAX; }
62 | };
63 | 


--------------------------------------------------------------------------------
/src/render.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include "camera.h"
  3 | #include "comptime.h"
  4 | #include "globals.h"
  5 | #include "materials.h"
  6 | #include "math.h"
  7 | #include "sphere.h"
  8 | #include "types.h"
  9 | #include <SDL2/SDL.h>
 10 | #include <array>
 11 | #include <chrono>
 12 | #include <cstdint>
 13 | #include <cstdio>
 14 | #include <future>
 15 | #include <immintrin.h>
 16 | #define STB_IMAGE_WRITE_IMPLEMENTATION
 17 | #include "utils.h"
 18 | #include <stb_image_write.h>
 19 | // Sky tint: the same RGB triple (0.5, 0.7, 1.0) repeated in all eight lanes.
 20 | constexpr Color_256 sky = {
 21 |     .x = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f},
 22 |     .y = {0.7f, 0.7f, 0.7f, 0.7f, 0.7f, 0.7f, 0.7f, 0.7f},
 23 |     .z = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f},
 24 | };
 25 | // Multiplies *curr_colors by *new_colors in the lanes selected by update_mask; the other
 26 | // lanes are multiplied by global::white (1.0), which leaves them as they are.
 27 | inline static void update_colors(Color_256* curr_colors, const Color_256* new_colors,
 28 |                                  __m256 update_mask) {
 29 |   // global::white survives only in the lanes that are NOT being updated
 30 |   const __m256 untouched = _mm256_xor_ps(update_mask, global::all_set);
 31 |   const __m256 keep_factor = _mm256_and_ps(global::white, untouched);
 32 | 
 33 |   const Color_256 factor = (*new_colors & update_mask) + keep_factor;
 34 |   *curr_colors *= factor;
 35 | }
 36 | // Traces eight rays for up to global::ray_depth bounces and returns their accumulated colors.
 37 | inline static Color_256 ray_cluster_colors(RayCluster* rays) {
 38 |   // will be used to add a sky tint to rays that at some point bounce off into space.
 39 |   // if a ray never bounces away (within amount of bounces set by depth), its lane in
 40 |   // no_hit_mask stays clear and the sky tint will not affect its final color
 41 |   __m256 no_hit_mask = global::zeros;
 42 | 
 43 |   HitRecords hit_rec;
 44 |   hit_rec.front_face = global::zeros;
 45 | 
 46 |   Color_256 colors{
 47 |       .x = global::white,
 48 |       .y = global::white,
 49 |       .z = global::white,
 50 |   };
 51 | 
 52 |   for (int i = 0; i < global::ray_depth; i++) {
 53 | 
 54 |     find_sphere_hits(&hit_rec, rays, INFINITY);
 55 | 
 56 |     // a lane counts as a hit when its t passes the cmpnle test against zero;
 57 |     // every lane that misses at any bounce is remembered in no_hit_mask
 58 |     __m256 new_hit_mask = _mm256_cmp_ps(hit_rec.t, global::zeros, global::cmpnle);
 59 |     __m256 new_no_hit_mask = _mm256_xor_ps(new_hit_mask, global::all_set);
 60 | 
 61 |     no_hit_mask = _mm256_or_ps(no_hit_mask, new_no_hit_mask);
 62 |     if (_mm256_testz_ps(new_hit_mask, new_hit_mask)) {
 63 |       break; // no lane hit anything, so further bounces cannot change the result
 64 |     }
 65 | 
 66 |     scatter(rays, &hit_rec);
 67 | 
 68 |     update_colors(&colors, &hit_rec.mat.atten, new_hit_mask);
 69 |   }
 70 |   // tint every lane that ever missed, also when the loop ends by running out of depth
 71 |   update_colors(&colors, &global::background_color, no_hit_mask);
 72 |   return colors;
 73 | };
 74 | // Converts 32 float colors (96 floats) into 8-bit channels and stores them at chunk write_pos.
 75 | // writes a color buffer of 32 Color values to an image buffer
 76 | // uses non temporal writes to avoid filling data cache
 77 | inline static void write_out_color_buf(const Color* color_buf, CharColor* img_buf,
 78 |                                        uint32_t write_pos) {
 79 |   // load twelve vectors of eight floats, scaling each by global::color_multiplier
 80 |   __m256 cm = _mm256_broadcast_ss(&global::color_multiplier);
 81 |   __m256 colors_1_f32 = _mm256_load_ps((float*)color_buf) * cm;
 82 |   __m256 colors_2_f32 = _mm256_load_ps((float*)(color_buf) + 8) * cm;
 83 |   __m256 colors_3_f32 = _mm256_load_ps((float*)(color_buf) + 16) * cm;
 84 |   __m256 colors_4_f32 = _mm256_load_ps((float*)(color_buf) + 24) * cm;
 85 |   __m256 colors_5_f32 = _mm256_load_ps((float*)(color_buf) + 32) * cm;
 86 |   __m256 colors_6_f32 = _mm256_load_ps((float*)(color_buf) + 40) * cm;
 87 |   __m256 colors_7_f32 = _mm256_load_ps((float*)(color_buf) + 48) * cm;
 88 |   __m256 colors_8_f32 = _mm256_load_ps((float*)(color_buf) + 56) * cm;
 89 |   __m256 colors_9_f32 = _mm256_load_ps((float*)(color_buf) + 64) * cm;
 90 |   __m256 colors_10_f32 = _mm256_load_ps((float*)(color_buf) + 72) * cm;
 91 |   __m256 colors_11_f32 = _mm256_load_ps((float*)(color_buf) + 80) * cm;
 92 |   __m256 colors_12_f32 = _mm256_load_ps((float*)(color_buf) + 88) * cm;
 93 |   // float -> int32 conversion
 94 |   __m256i colors_1_i32 = _mm256_cvtps_epi32(colors_1_f32);
 95 |   __m256i colors_2_i32 = _mm256_cvtps_epi32(colors_2_f32);
 96 |   __m256i colors_3_i32 = _mm256_cvtps_epi32(colors_3_f32);
 97 |   __m256i colors_4_i32 = _mm256_cvtps_epi32(colors_4_f32);
 98 |   __m256i colors_5_i32 = _mm256_cvtps_epi32(colors_5_f32);
 99 |   __m256i colors_6_i32 = _mm256_cvtps_epi32(colors_6_f32);
100 |   __m256i colors_7_i32 = _mm256_cvtps_epi32(colors_7_f32);
101 |   __m256i colors_8_i32 = _mm256_cvtps_epi32(colors_8_f32);
102 |   __m256i colors_9_i32 = _mm256_cvtps_epi32(colors_9_f32);
103 |   __m256i colors_10_i32 = _mm256_cvtps_epi32(colors_10_f32);
104 |   __m256i colors_11_i32 = _mm256_cvtps_epi32(colors_11_f32);
105 |   __m256i colors_12_i32 = _mm256_cvtps_epi32(colors_12_f32);
106 |   // the pack instructions work per 128-bit half, so halves are regrouped to keep element order
107 |   const uint8_t BOTH_LOW_XMMWORD = 32;
108 |   const uint8_t BOTH_HIGH_XMMWORD = 49;
109 |   __m256i temp_permute_1 = _mm256_permute2x128_si256(colors_1_i32, colors_2_i32, BOTH_LOW_XMMWORD);
110 |   __m256i temp_permute_2 = _mm256_permute2x128_si256(colors_1_i32, colors_2_i32, BOTH_HIGH_XMMWORD);
111 |   __m256i temp_permute_3 = _mm256_permute2x128_si256(colors_3_i32, colors_4_i32, BOTH_LOW_XMMWORD);
112 |   __m256i temp_permute_4 = _mm256_permute2x128_si256(colors_3_i32, colors_4_i32, BOTH_HIGH_XMMWORD);
113 |   __m256i temp_permute_5 = _mm256_permute2x128_si256(colors_5_i32, colors_6_i32, BOTH_LOW_XMMWORD);
114 |   __m256i temp_permute_6 = _mm256_permute2x128_si256(colors_5_i32, colors_6_i32, BOTH_HIGH_XMMWORD);
115 |   __m256i temp_permute_7 = _mm256_permute2x128_si256(colors_7_i32, colors_8_i32, BOTH_LOW_XMMWORD);
116 |   __m256i temp_permute_8 = _mm256_permute2x128_si256(colors_7_i32, colors_8_i32, BOTH_HIGH_XMMWORD);
117 |   __m256i temp_permute_9 = _mm256_permute2x128_si256(colors_9_i32, colors_10_i32, BOTH_LOW_XMMWORD);
118 |   __m256i temp_permute_10 =
119 |       _mm256_permute2x128_si256(colors_9_i32, colors_10_i32, BOTH_HIGH_XMMWORD);
120 |   __m256i temp_permute_11 =
121 |       _mm256_permute2x128_si256(colors_11_i32, colors_12_i32, BOTH_LOW_XMMWORD);
122 |   __m256i temp_permute_12 =
123 |       _mm256_permute2x128_si256(colors_11_i32, colors_12_i32, BOTH_HIGH_XMMWORD);
124 |   // int32 -> int16 with signed saturation
125 |   __m256i colors_1_i16 = _mm256_packs_epi32(temp_permute_1, temp_permute_2);
126 |   __m256i colors_2_i16 = _mm256_packs_epi32(temp_permute_3, temp_permute_4);
127 |   __m256i colors_3_i16 = _mm256_packs_epi32(temp_permute_5, temp_permute_6);
128 |   __m256i colors_4_i16 = _mm256_packs_epi32(temp_permute_7, temp_permute_8);
129 |   __m256i colors_5_i16 = _mm256_packs_epi32(temp_permute_9, temp_permute_10);
130 |   __m256i colors_6_i16 = _mm256_packs_epi32(temp_permute_11, temp_permute_12);
131 |   // regroup the halves again before the next narrowing step
132 |   temp_permute_1 = _mm256_permute2x128_si256(colors_1_i16, colors_2_i16, BOTH_LOW_XMMWORD);
133 |   temp_permute_2 = _mm256_permute2x128_si256(colors_1_i16, colors_2_i16, BOTH_HIGH_XMMWORD);
134 |   temp_permute_3 = _mm256_permute2x128_si256(colors_3_i16, colors_4_i16, BOTH_LOW_XMMWORD);
135 |   temp_permute_4 = _mm256_permute2x128_si256(colors_3_i16, colors_4_i16, BOTH_HIGH_XMMWORD);
136 |   temp_permute_5 = _mm256_permute2x128_si256(colors_5_i16, colors_6_i16, BOTH_LOW_XMMWORD);
137 |   temp_permute_6 = _mm256_permute2x128_si256(colors_5_i16, colors_6_i16, BOTH_HIGH_XMMWORD);
138 |   // int16 -> uint8 with unsigned saturation
139 |   __m256i colors_1_u8 = _mm256_packus_epi16(temp_permute_1, temp_permute_2);
140 |   __m256i colors_2_u8 = _mm256_packus_epi16(temp_permute_3, temp_permute_4);
141 |   __m256i colors_3_u8 = _mm256_packus_epi16(temp_permute_5, temp_permute_6);
142 | 
143 |   // SDL offsets our img pointer to a location that might not be aligned to 32 bytes.
144 |   // Therefore we can't just stream from the registers to memory... :(
145 |   write_pos *= 3; // one 32-pixel chunk is 96 bytes, i.e. three 32-byte vectors
146 |   if constexpr (global::active_render_mode == RenderMode::real_time) {
147 |     alignas(32) CharColor char_buf[32];
148 |     _mm256_store_si256(((__m256i*)char_buf), colors_1_u8);
149 |     _mm256_store_si256(((__m256i*)char_buf) + 1, colors_2_u8);
150 |     _mm256_store_si256(((__m256i*)char_buf) + 2, colors_3_u8);
151 | 
152 |     int* img_ints = (int*)(((__m256i*)img_buf) + write_pos);
153 |     int* color_ints = (int*)(char_buf);
154 | 
155 |     for (int i = 0; i < 24; i += 4) {
156 |       _mm_stream_si32((img_ints + i), *(color_ints + i));
157 |       _mm_stream_si32((img_ints + i + 1), *(color_ints + i + 1));
158 |       _mm_stream_si32((img_ints + i + 2), *(color_ints + i + 2));
159 |       _mm_stream_si32((img_ints + i + 3), *(color_ints + i + 3));
160 |     }
161 |   } else {
162 |     _mm256_stream_si256(((__m256i*)img_buf) + write_pos, colors_1_u8);
163 |     _mm256_stream_si256(((__m256i*)img_buf) + write_pos + 1, colors_2_u8);
164 |     _mm256_stream_si256(((__m256i*)img_buf) + write_pos + 2, colors_3_u8);
165 |   }
166 | }
167 | // Renders rows row0, row0 + thread_count, ... (row0 = pix_offset / img_width) into img_buf.
168 | inline static void render(CharColor* img_buf, const Vec3 cam_origin, uint32_t pix_offset) {
169 |   // comptime generated
170 |   constexpr Vec3_256 base_dirs = comptime::init_ray_directions();
171 |   RayCluster base_rays = {
172 |       .dir = base_dirs,
173 |       .orig = broadcast_vec(&cam_origin),
174 |   };
175 | 
176 |   Color_256 sample_color;
177 |   alignas(32) Color color_buf[32];
178 |   // pixels are flushed 32 at a time; NOTE(review): this presumes img_width % 32 == 0
179 |   constexpr uint32_t write_chunk_size = global::img_width / 32;
180 |   uint32_t row = pix_offset / global::img_width;
181 |   uint32_t write_pos = row * write_chunk_size;
182 |   uint16_t color_buf_idx = 0;
183 |   uint16_t sample_group;
184 | 
185 |   static_assert(global::sample_group_num > 0,
186 |                 "there must be at least one group of ray samples to calculate");
187 | 
188 |   for (; row < global::img_height; row += global::thread_count) {
189 |     for (uint32_t col = 0; col < global::img_width; col++) {
190 |       sample_color.x = _mm256_setzero_ps();
191 |       sample_color.y = _mm256_setzero_ps();
192 |       sample_color.z = _mm256_setzero_ps();
193 | 
194 |       for (sample_group = 0; sample_group < global::sample_group_num; sample_group++) {
195 |         RayCluster samples = base_rays;
196 | 
197 |         float x_scale = global::pix_du * col;
198 |         __m256 x_scale_vec = _mm256_broadcast_ss(&x_scale);
199 |         samples.dir.x = samples.dir.x + x_scale_vec;
200 | 
201 |         float y_scale = (global::pix_dv * row) + (sample_group * global::sample_dv);
202 |         __m256 y_scale_vec = _mm256_broadcast_ss(&y_scale);
203 |         samples.dir.y += y_scale_vec;
204 | 
205 |         sample_color += ray_cluster_colors(&samples);
206 |       }
207 | 
208 |       // Sum all eight lanes of each channel into one float. _mm256_hadd_ps only adds
209 |       // neighbours inside each 128-bit half, so the upper half is folded onto the lower
210 |       // half first and the remaining four lanes are reduced with two 128-bit hadds.
211 |       auto lane_sum = [](__m256 v) {
212 |         __m128 sum = _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1));
213 |         sum = _mm_hadd_ps(sum, sum);
214 |         return _mm_hadd_ps(sum, sum);
215 |       };
216 |       const __m128 sum_x = lane_sum(sample_color.x);
217 |       const __m128 sum_y = lane_sum(sample_color.y);
218 |       const __m128 sum_z = lane_sum(sample_color.z);
219 | 
220 |       // element 0 of each sum holds the channel total for this pixel
221 |       _mm_store_ss(&color_buf[color_buf_idx].x, sum_x);
222 |       _mm_store_ss(&color_buf[color_buf_idx].y, sum_y);
223 |       _mm_store_ss(&color_buf[color_buf_idx].z, sum_z);
224 | 
225 |       color_buf_idx++;
226 | 
227 |       if (color_buf_idx != 32) {
228 |         continue;
229 |       }
230 | 
231 |       write_out_color_buf(color_buf, img_buf, write_pos);
232 |       write_pos++;
233 | 
234 |       color_buf_idx = 0;
235 |     }
236 |     write_pos += ((global::thread_count - 1) * write_chunk_size);
237 |   }
238 | }
239 | 
240 | using namespace std::chrono;
241 | // Renders one frame on global::thread_count async tasks, prints the render time, writes out.png.
242 | inline static void render_png() {
243 |   static_assert(global::img_height % global::thread_count == 0,
244 |                 "Thread count must divide rows equally");
245 |   CharColor* img_data =
246 |       (CharColor*)aligned_alloc(32, global::img_width * global::img_height * sizeof(CharColor));
247 |   if (img_data == NULL) {
248 |     printf("failed to allocate the image buffer\n");
249 |     exit(EXIT_FAILURE);
250 |   }
251 |   init_spheres();
252 |   std::array<std::future<void>, global::thread_count> futures;
253 |   Camera cam;
254 | 
255 |   // steady_clock is monotonic, so wall-clock adjustments cannot skew the measured interval
256 |   auto start_time = steady_clock::now();
257 |   for (size_t idx = 0; idx < global::thread_count; idx++) {
258 |     futures[idx] =
259 |         std::async(std::launch::async, render, img_data, cam.origin, idx * global::img_width);
260 |   }
261 |   for (auto& task : futures) {
262 |     task.get();
263 |   }
264 |   float milli = duration_cast<microseconds>(steady_clock::now() - start_time).count() / 1000.f;
265 |   printf("render time (ms): %f\n", milli);
266 |   stbi_write_png("out.png", global::img_width, global::img_height, 3, img_data,
267 |                  global::img_width * sizeof(CharColor));
268 |   free(img_data); // the buffer is not needed once the PNG has been written
269 | }
270 | // Opens an SDL window and renders frames into a streaming texture until SDL_QUIT arrives.
271 | inline static void render_realtime() {
272 |   static_assert(global::img_height % global::thread_count == 0,
273 |                 "Thread count must divide rows equally");
274 |   // SDL_LockTexture hands out the pixel memory every frame, so nothing is allocated here
275 |   CharColor* img_data = NULL;
276 |   init_spheres();
277 |   std::array<std::future<void>, global::thread_count> futures{};
278 |   Camera cam;
279 | 
280 |   SDL_Window* win = NULL;
281 |   SDL_Renderer* renderer = NULL;
282 | 
283 |   int sdl_res = SDL_Init(SDL_INIT_VIDEO);
284 |   if (sdl_res < 0) {
285 |     printf("SDL initialization failed with status code: %d\n", sdl_res);
286 |     exit(EXIT_FAILURE);
287 |   }
288 | 
289 |   win = SDL_CreateWindow("Crack Tracer", 100, 100, global::img_width, global::img_height, 0);
290 |   renderer = SDL_CreateRenderer(win, -1, SDL_RENDERER_ACCELERATED);
291 |   SDL_Texture* buffer =
292 |       SDL_CreateTexture(renderer, SDL_PIXELFORMAT_RGB24, SDL_TEXTUREACCESS_STREAMING,
293 |                         global::img_width, global::img_height);
294 |   if (win == NULL || renderer == NULL || buffer == NULL) {
295 |     printf("SDL setup failed: %s\n", SDL_GetError());
296 |     SDL_Quit();
297 |     exit(EXIT_FAILURE);
298 |   }
299 |   int pitch = global::img_width * sizeof(CharColor);
300 | 
301 |   while (true) {
302 |     SDL_Event e;
303 |     if (SDL_PollEvent(&e)) {
304 |       if (e.type == SDL_QUIT) {
305 |         break;
306 |       }
307 |       cam.register_key_event(e);
308 |     }
309 | 
310 |     cam.update();
311 |     // without a locked texture there is no pixel memory to render into
312 |     if (SDL_LockTexture(buffer, NULL, (void**)(&img_data), &pitch) != 0) {
313 |       break;
314 |     }
315 |     for (size_t idx = 0; idx < global::thread_count; idx++) {
316 |       futures[idx] =
317 |           std::async(std::launch::async, render, img_data, cam.origin, idx * global::img_width);
318 |     }
319 |     for (size_t idx = 0; idx < global::thread_count; idx++) {
320 |       futures[idx].get();
321 |     }
322 |     SDL_UnlockTexture(buffer);
323 |     SDL_RenderCopy(renderer, buffer, NULL, NULL);
324 |     // flip the backbuffer
325 |     SDL_RenderPresent(renderer);
326 |   }
327 | 
328 |   SDL_DestroyTexture(buffer);
329 |   SDL_DestroyRenderer(renderer);
330 |   SDL_DestroyWindow(win);
331 |   SDL_Quit();
332 | }
333 | 


--------------------------------------------------------------------------------
/src/sphere.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include "materials.h"
  3 | #include "math.h"
  4 | #include "rand.h"
  5 | #include "types.h"
  6 | #include <cstdlib>
  7 | #include <immintrin.h>
  8 | #include <vector>
  9 | // All spheres in the scene; filled by init_spheres() and read by find_sphere_hits().
 10 | static std::vector<Sphere> spheres;
 11 | inline static void init_spheres() {
 12 |   // four hand-placed spheres plus one small sphere for each cell of a 22 x 22 grid
 13 |   spheres.reserve(488);
 14 |   spheres = {
 15 |       {{.center = {.x = -1.f, .y = 1.f, .z = -2.5f}, .mat = red_lambertian, .r = 1.f},
 16 |        {.center = {.x = 0.f, .y = 1.f, .z = 0.f}, .mat = glass, .r = 1.f},
 17 |        {.center = {.x = 1.f, .y = 1.f, .z = 2.5f}, .mat = copper_metallic, .r = 1.f},
 18 |        {.center = {.x = 0.f, .y = -1000.f, .z = 0.f}, .mat = silver_lambertian, .r = 1000.f}},
 19 |   };
 20 | 
 21 |   LCGRand rng;
 22 |   // draws the x, y and z channels, in that order, from [lowest, 1]
 23 |   auto random_color = [&rng](float lowest) {
 24 |     return Color{
 25 |         .x = rng.rand_in_range(lowest, 1),
 26 |         .y = rng.rand_in_range(lowest, 1),
 27 |         .z = rng.rand_in_range(lowest, 1),
 28 |     };
 29 |   };
 30 | 
 31 |   for (int cell_x = -11; cell_x < 11; cell_x++) {
 32 |     for (int cell_z = -11; cell_z < 11; cell_z++) {
 33 |       // the draw order (material pick, center x, center z, color) fixes the generated scene
 34 |       const float mat_pick = rng.rand_in_range(0, 1);
 35 |       const Vec3 center = {
 36 |           .x = cell_x + rng.rand_in_range(0, 1),
 37 |           .y = 0.2f,
 38 |           .z = cell_z + 0.9f * rng.rand_in_range(0, 1),
 39 |       };
 40 | 
 41 |       Material mat;
 42 |       if (mat_pick < 0.3) {
 43 |         mat = {.atten = random_color(0), .type = MatType::lambertian}; // diffuse
 44 |       } else if (mat_pick < 0.7) {
 45 |         mat = {.atten = random_color(0.5), .type = MatType::metallic}; // metal
 46 |       } else {
 47 |         mat = {.atten = white, .type = MatType::dielectric}; // glass
 48 |       }
 49 |       // every grid sphere shares the same radius
 50 |       spheres.push_back(Sphere{.center = center, .mat = mat, .r = 0.2});
 51 |     }
 52 |   }
 53 | }
 54 | // Intersects eight rays with one sphere. Returns, per lane, the accepted root t of the
 55 | // ray/sphere quadratic, or 0 in lanes without an accepted hit.
 56 | [[nodiscard]] inline static __m256 sphere_hit(const RayCluster* rays, const Sphere* sphere,
 57 |                                               float t_max) {
 58 | 
 59 |   // coefficients of the quadratic in t, written with the "half b" simplification
 60 |   const Vec3_256 center = broadcast_vec(&sphere->center);
 61 |   const Vec3_256 to_center = center - rays->orig;
 62 |   const float radius_sq = sphere->r * sphere->r;
 63 |   const __m256 radius_sq_vec = _mm256_set1_ps(radius_sq);
 64 | 
 65 |   const __m256 a = dot(&rays->dir, &rays->dir);
 66 |   __m256 half_b = dot(&rays->dir, &to_center);
 67 |   const __m256 c = dot(&to_center, &to_center) - radius_sq_vec;
 68 | 
 69 |   __m256 discrim = _mm256_fmsub_ps(half_b, half_b, a * c);
 70 | 
 71 |   // bail out early when no lane passes the discriminant test against zero
 72 |   __m256 accepted = _mm256_cmp_ps(discrim, global::zeros, global::cmpnlt);
 73 |   if (_mm256_testz_ps(accepted, accepted)) {
 74 |     return global::zeros;
 75 |   }
 76 | 
 77 |   // zero the discriminant and half_b in the lanes that failed the test
 78 |   discrim = _mm256_and_ps(discrim, accepted);
 79 |   half_b = _mm256_and_ps(half_b, accepted);
 80 | 
 81 |   const __m256 sqrt_d = _mm256_sqrt_ps(discrim);
 82 |   const __m256 inv_a = _mm256_rcp_ps(a);
 83 |   const __m256 t_max_vec = _mm256_set1_ps(t_max);
 84 | 
 85 |   // mask of lanes whose root passes cmpnlt against t_min and cmplt against t_max
 86 |   auto in_window = [&t_max_vec](__m256 t) {
 87 |     const __m256 under_max = _mm256_cmp_ps(t, t_max_vec, global::cmplt);
 88 |     const __m256 over_min = _mm256_cmp_ps(t, global::t_min_vec, global::cmpnlt);
 89 |     return _mm256_and_ps(over_min, under_max);
 90 |   };
 91 | 
 92 |   __m256 root = (half_b - sqrt_d) * inv_a;
 93 |   accepted = in_window(root);
 94 | 
 95 |   // Only clear materials can have another root that is worth finding, so the farther
 96 |   // root is tried only for dielectric spheres, and only when no lane took the near one.
 97 |   if (sphere->mat.type == dielectric && _mm256_testz_ps(accepted, accepted)) {
 98 |     root = (half_b + sqrt_d) * inv_a;
 99 |     accepted = in_window(root);
100 |   }
101 | 
102 |   // lanes outside the window report 0
103 |   return _mm256_and_ps(root, accepted);
104 | }
105 | // Records in hit_rec which lanes hit a front face and a normal oriented against the ray.
106 | inline static void set_face_normal(const RayCluster* rays, HitRecords* hit_rec,
107 |                                    const Vec3_256* outward_norm) {
108 |   const __m256 facing = dot(&rays->dir, outward_norm);
109 |   hit_rec->front_face = _mm256_cmp_ps(facing, _mm256_setzero_ps(), global::cmplt);
110 |   hit_rec->norm = -*outward_norm; // flipped first, restored in front-facing lanes below
111 |   hit_rec->norm = blend_vec256(&hit_rec->norm, outward_norm, hit_rec->front_face);
112 | }
113 | // Fills hit_rec from the closest-sphere data: t values, material, hit point and normal.
114 | inline static void create_hit_record(HitRecords* hit_rec, const RayCluster* rays,
115 |                                      SphereCluster* sphere_cluster, __m256 t_vals) {
116 |   hit_rec->t = t_vals;
117 |   hit_rec->mat = sphere_cluster->mat;
118 |   // hit point = orig + t * dir, one fused multiply-add per component
119 |   hit_rec->orig = Vec3_256{
120 |       .x = _mm256_fmadd_ps(rays->dir.x, t_vals, rays->orig.x),
121 |       .y = _mm256_fmadd_ps(rays->dir.y, t_vals, rays->orig.y),
122 |       .z = _mm256_fmadd_ps(rays->dir.z, t_vals, rays->orig.z),
123 |   };
124 |   // (hit point - center) divided by the radius gives the normalized outward normal
125 |   Vec3_256 outward = hit_rec->orig - sphere_cluster->center;
126 |   outward /= sphere_cluster->r;
127 |   set_face_normal(rays, hit_rec, &outward);
128 | }
129 | // Overwrites the lanes of *curr_cluster selected by update_mask with curr_sphere's
130 | // values; all other lanes keep what they already hold.
131 | inline static void update_sphere_cluster(SphereCluster* curr_cluster, Sphere curr_sphere,
132 |                                          __m256 update_mask) {
133 |   if (_mm256_testz_ps(update_mask, update_mask)) {
134 |     return; // no lane selected
135 |   }
136 | 
137 |   // complement of the update mask: the lanes whose current values must survive
138 |   const __m256 keep_mask = _mm256_xor_ps(update_mask, global::all_set);
139 |   const __m256i update_mask_i = _mm256_castps_si256(update_mask);
140 |   const __m256i keep_mask_i = _mm256_castps_si256(keep_mask);
141 | 
142 |   // the incoming sphere, broadcast to every lane and then cleared outside update_mask
143 |   SphereCluster incoming = {
144 |       .center = broadcast_vec(&curr_sphere.center),
145 |       .mat =
146 |           {
147 |               .atten = broadcast_vec(&curr_sphere.mat.atten),
148 |               .type = _mm256_set1_epi32(curr_sphere.mat.type),
149 |           },
150 |       .r = _mm256_set1_ps(curr_sphere.r),
151 |   };
152 |   incoming.center &= update_mask;
153 |   incoming.mat.atten &= update_mask;
154 |   incoming.mat.type = _mm256_and_si256(incoming.mat.type, update_mask_i);
155 |   incoming.r = _mm256_and_ps(incoming.r, update_mask);
156 | 
157 |   // the current cluster, cleared inside update_mask
158 |   SphereCluster kept = {
159 |       .center = curr_cluster->center & keep_mask,
160 |       .mat =
161 |           {
162 |               .atten = curr_cluster->mat.atten & keep_mask,
163 |               .type = _mm256_and_si256(curr_cluster->mat.type, keep_mask_i),
164 |           },
165 |       .r = _mm256_and_ps(curr_cluster->r, keep_mask),
166 |   };
167 | 
168 |   // the two parts never overlap in a lane, so adding them merges them lane by lane
169 |   curr_cluster->center = incoming.center + kept.center;
170 |   curr_cluster->mat.atten = incoming.mat.atten + kept.mat.atten;
171 |   curr_cluster->mat.type = incoming.mat.type + kept.mat.type;
172 |   curr_cluster->r = incoming.r + kept.r;
173 | };
174 | // Finds, for each ray lane, the closest sphere hit below t_max and writes it to *hit_rec.
175 | inline static void find_sphere_hits(HitRecords* hit_rec, const RayCluster* rays, float t_max) {
176 |   // per-lane data of the closest sphere found so far; starts zeroed
177 |   SphereCluster closest_spheres = {
178 |       .center =
179 |           {
180 |               .x = global::zeros,
181 |               .y = global::zeros,
182 |               .z = global::zeros,
183 |           },
184 |       .r = global::zeros,
185 |   };
186 | 
187 |   __m256 max = _mm256_broadcast_ss(&global::float_max);
188 | 
189 |   // find first sphere as a baseline
190 |   __m256 lowest_t_vals = sphere_hit(rays, &spheres[0], t_max);
191 |   __m256 hit_loc = _mm256_cmp_ps(lowest_t_vals, global::zeros, global::cmpneq);
192 | 
193 |   update_sphere_cluster(&closest_spheres, spheres[0], hit_loc);
194 | 
195 |   for (size_t i = 1; i < spheres.size(); i++) {
196 |     __m256 new_t_vals = sphere_hit(rays, &spheres[i], t_max);
197 | 
198 |     // don't update on instances of no hits (hit locations all zeros)
199 |     hit_loc = _mm256_cmp_ps(new_t_vals, global::zeros, global::cmpneq);
200 |     if (_mm256_testz_ps(hit_loc, hit_loc)) {
201 |       continue;
202 |     }
203 | 
204 |     // replace all 0's with float maximum to not replace actual values with
205 |     // 0's during the minimum comparisons. Again, 0's represent no hits
206 |     __m256 no_hit_loc = _mm256_xor_ps(hit_loc, global::all_set);
207 |     __m256 max_mask = _mm256_and_ps(no_hit_loc, max);
208 |     new_t_vals = _mm256_or_ps(new_t_vals, max_mask);
209 | 
210 |     // replace 0's with max for current lowest too
211 |     __m256 curr_no_hit_loc = _mm256_cmp_ps(lowest_t_vals, global::zeros, global::cmpeq);
212 |     max_mask = _mm256_and_ps(curr_no_hit_loc, max);
213 |     __m256 lowest_t_masked = _mm256_or_ps(lowest_t_vals, max_mask);
214 | 
215 |     // update sphere references based on where new
216 |     // t values are closer than the current lowest
217 |     __m256 update_locs = _mm256_cmp_ps(new_t_vals, lowest_t_masked, global::cmplt);
218 |     update_sphere_cluster(&closest_spheres, spheres[i], update_locs);
219 | 
220 |     // update current lowest t values based on new t's, however, mask out
221 |     // where we put float max values so that the t values still represent
222 |     // no hits as 0.0
223 |     lowest_t_vals = _mm256_min_ps(lowest_t_masked, new_t_vals);
224 |     __m256 actual_vals_loc = _mm256_cmp_ps(lowest_t_vals, max, global::cmpneq);
225 |     lowest_t_vals = _mm256_and_ps(lowest_t_vals, actual_vals_loc);
226 |   }
227 | 
228 |   create_hit_record(hit_rec, rays, &closest_spheres, lowest_t_vals);
229 | }
230 | 


--------------------------------------------------------------------------------
/src/types.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <cstdint>
 3 | #include <immintrin.h>
 4 | // Three scalar floats; also used for colors through the Color alias.
 5 | struct Vec3 {
 6 |   float x;
 7 |   float y;
 8 |   float z;
 9 | };
10 | // Four scalar floats.
11 | struct Vec4 {
12 |   float x;
13 |   float y;
14 |   float z;
15 |   float w;
16 | };
17 | // Structure-of-arrays vector: each component is a __m256 holding eight float lanes.
18 | struct Vec3_256 {
19 |   __m256 x;
20 |   __m256 y;
21 |   __m256 z;
22 | };
23 | // One pixel made of three 8-bit channels (r, g, b).
24 | struct CharColor {
25 |   uint8_t r;
26 |   uint8_t g;
27 |   uint8_t b;
28 | };
29 | // Material kinds. Unscoped, so enumerators are also usable without the MatType:: prefix.
30 | enum MatType {
31 |   metallic,
32 |   lambertian,
33 |   dielectric,
34 | };
35 | // Eight-lane material: attenuation color plus the material type held in a __m256i.
36 | using Color_256 = Vec3_256;
37 | struct Material_256 {
38 |   Color_256 atten;
39 |   __m256i type;
40 | };
41 | // Scalar material: attenuation color and type, aligned to 16 bytes.
42 | using Color = Vec3;
43 | struct alignas(16) Material {
44 |   Color atten;
45 |   MatType type;
46 | };
47 | // One scene sphere (center, material, radius r), aligned to 32 bytes.
48 | struct alignas(32) Sphere {
49 |   Vec3 center;
50 |   Material mat;
51 |   float r;
52 | };
53 | // Sphere data in eight-lane form: center, material and radius per lane.
54 | struct SphereCluster {
55 |   Vec3_256 center;
56 |   Material_256 mat;
57 |   __m256 r;
58 | };
59 | // Eight rays in lane form: per-lane direction and origin.
60 | struct RayCluster {
61 |   Vec3_256 dir;
62 |   Vec3_256 orig;
63 | };
64 | // Per-lane result of an intersection query, as filled in by create_hit_record().
65 | struct HitRecords {
66 |   Vec3_256 orig; // hit point: ray origin + t * ray direction
67 |   Vec3_256 norm; // surface normal chosen by set_face_normal()
68 |   Material_256 mat;
69 |   __m256 front_face; // comparison mask written by set_face_normal()
70 |   __m256 t;
71 | };
72 | 


--------------------------------------------------------------------------------
/src/utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <cstdio>
 3 | #include <immintrin.h>
 4 | // BREAKPOINT emits the x86 `int $3` instruction unless NDEBUG is defined; else it is a lone `;`.
 5 | #ifndef NDEBUG
 6 | #define BREAKPOINT asm("int $3");
 7 | #else
 8 | #define BREAKPOINT ;
 9 | #endif
10 | inline void print_vec_256(const __m256 vec) {
11 |   float lanes[8]; // copy the register to memory, then print the eight lanes on one line
12 |   _mm256_storeu_ps(lanes, vec);
13 |   printf("%.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n", lanes[0], lanes[1], lanes[2], lanes[3],
14 |          lanes[4], lanes[5], lanes[6], lanes[7]);
15 | }
16 | inline void print_vec_128(const __m128 vec) {
17 |   float lanes[4]; // copy the register to memory, then print the four lanes on one line
18 |   _mm_storeu_ps(lanes, vec);
19 |   printf("%.3f %.3f %.3f %.3f\n", lanes[0], lanes[1], lanes[2], lanes[3]);
20 | }


--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.22)
 2 | 
 3 | project(crack-tracer-tests)
 4 | find_package(SDL2 REQUIRED)
 5 | 
 6 | set(CMAKE_CXX_STANDARD 20)
 7 | set(CMAKE_CXX_STANDARD_REQUIRED True)
 8 | 
 9 | add_executable(${PROJECT_NAME} entry.cpp)
10 | # The sources use AVX2 and FMA intrinsics, so those ISA flags are needed in every configuration.
11 | target_compile_options(${PROJECT_NAME}
12 |                        PRIVATE -mavx2 -mfma "$<$<CONFIG:DEBUG>:-g;-march=native>")
13 | 
14 | target_link_libraries(${PROJECT_NAME} PRIVATE SDL2::SDL2 SDL2::SDL2main)
15 | 
16 | target_include_directories(${PROJECT_NAME} SYSTEM
17 |                            PRIVATE ../thirdparty/stb_image_write)
18 | 


--------------------------------------------------------------------------------
/tests/entry.cpp:
--------------------------------------------------------------------------------
  1 | #include "../src/math.h"
  2 | #include "../src/render.h"
  3 | #include "../src/sphere.h"
  4 | #include "../src/utils.h"
  5 | 
  6 | #include <cmath>
  7 | #include <cstdint>
  8 | #include <cstdio>
  9 | #include <cstdlib>
 10 | #include <immintrin.h>
 11 | // Fixture: eight ray directions, one per lane (x and y vary, z is -1); orig is not initialized here.
 12 | const RayCluster rays{.dir = {
 13 |                           .x = _mm256_set_ps(-.6f, -.2f, .2f, .6f, -.6f, -.2f, .2f, .6f),
 14 |                           .y = _mm256_set_ps(.4f, .4f, .4f, .4f, -.4f, -.4f, -.4f, -.4f),
 15 |                           .z = _mm256_set_ps(-1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f, -1.f),
 16 |                       }};
 17 | // Fixture: a sphere with r = 0.5 centered at (0, 0, -1), using the silver_metallic material.
 18 | const Sphere sphere{
 19 |     .center = {.x = 0.f, .y = 0.f, .z = -1.f},
 20 |     .mat = silver_metallic,
 21 |     .r = .5f,
 22 | };
 23 | // Prints what sphere_hit() returns for the fixture rays and sphere when called with 1000.f.
 24 | void test_sphere_hit() {
 25 |   printf("TESTING SPHERE_HIT\n");
 26 |   __m256 t_vals = sphere_hit(&rays, &sphere, 1000.f);
 27 |   print_vec_256(t_vals);
 28 |   printf("\n");
 29 | }
 30 | // Applies update_sphere_cluster() twice with different lane masks, printing the cluster each time.
 31 | void test_update_sphere_cluster() {
 32 |   printf("TESTING UPDATE_SPHERE_CLUSTER\n");
 33 |   SphereCluster sphere_cluster{
 34 |       .center = {.x = _mm256_setzero_ps(), .y = _mm256_setzero_ps(), .z = _mm256_setzero_ps()},
 35 |       .r = _mm256_setzero_ps(),
 36 |   };
 37 |   const Sphere rando_sphere1{
 38 |       .center = {.x = 69.f, .y = 420.f, .z = 21.f},
 39 |       .r = .8f,
 40 |   };
 41 | 
 42 |   const Sphere rando_sphere2{
 43 |       .center = {.x = 100.f, .y = -2.3f, .z = 800.f},
 44 |       .r = 0.39f,
 45 |   };
 46 | 
 47 |   // Lane masks are built from integer lanes and cast as a whole vector: reading a uint32_t
 48 |   // through a float* (the previous approach) is undefined behavior under strict aliasing.
 49 |   const int set = -1; // all 32 bits set
 50 |   __m256 mask = _mm256_castsi256_ps(_mm256_set_epi32(set, 0, set, 0, set, 0, set, 0));
 51 |   update_sphere_cluster(&sphere_cluster, rando_sphere1, mask);
 52 | 
 53 |   printf("iter 1\n");
 54 |   print_vec_256(sphere_cluster.center.x);
 55 |   print_vec_256(sphere_cluster.center.y);
 56 |   print_vec_256(sphere_cluster.center.z);
 57 |   print_vec_256(sphere_cluster.r);
 58 | 
 59 |   mask = _mm256_castsi256_ps(_mm256_set_epi32(0, 0, 0, set, set, set, set, 0));
 60 |   update_sphere_cluster(&sphere_cluster, rando_sphere2, mask);
 61 | 
 62 |   printf("iter 2\n");
 63 |   print_vec_256(sphere_cluster.center.x);
 64 |   print_vec_256(sphere_cluster.center.y);
 65 |   print_vec_256(sphere_cluster.center.z);
 66 |   print_vec_256(sphere_cluster.r);
 67 | 
 68 |   printf("\n");
 69 | }
 70 | // Calls reflect() on a copy of the fixture directions with axis (0, 0, 1) and prints the copy.
 71 | void test_reflect() {
 72 |   printf("TESTING REFLECT\n");
 73 |   Vec3_256 axis = {
 74 |       .x = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
 75 |       .y = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f},
 76 |       .z = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f},
 77 |   };
 78 | 
 79 |   Vec3_256 ray_dirs = rays.dir; // work on a copy so the shared fixture stays untouched
 80 | 
 81 |   reflect(&ray_dirs, &axis); // presumably writes its result into ray_dirs — confirm in math.h
 82 | 
 83 |   print_vec_256(ray_dirs.x);
 84 |   print_vec_256(ray_dirs.y);
 85 |   print_vec_256(ray_dirs.z);
 86 |   printf("\n");
 87 | }
 88 | // Prints dot() of the fixture directions with themselves.
 89 | void test_dot() {
 90 |   printf("TESTING DOT PRODUCT\n");
 91 | 
 92 |   __m256 dp = dot(&rays.dir, &rays.dir);
 93 | 
 94 |   print_vec_256(dp);
 95 |   printf("\n");
 96 | }
 97 | // Calls normalize() on a copy of the fixture directions and prints the copy.
 98 | void test_normalize() {
 99 |   printf("TESTING NORMALIZE\n");
100 | 
101 |   Vec3_256 ray_dirs = rays.dir; // work on a copy so the shared fixture stays untouched
102 | 
103 |   normalize(&ray_dirs); // presumably normalizes in place — confirm in math.h
104 | 
105 |   print_vec_256(ray_dirs.x);
106 |   print_vec_256(ray_dirs.y);
107 |   print_vec_256(ray_dirs.z);
108 |   printf("\n");
109 | }
110 | // Refracts one direction/normal pair, broadcast to every lane, and prints the refracted vector.
111 | void test_refract() {
112 |   printf("TESTING REFRACT\n");
113 |   Vec3 ray_dir = {0.0014, 0.2775, -10}, norm = {0.003, 0.6133, 0.790};
114 |   Vec3_256 ray_dir_vec = broadcast_vec(&ray_dir), norm_vec = broadcast_vec(&norm);
115 |   Vec3_256 refracted = refract(&ray_dir_vec, &norm_vec, global::rcp_ir_vec);
116 |   print_vec_256(refracted.x); // the result used to be computed and then discarded
117 |   print_vec_256(refracted.y);
118 |   print_vec_256(refracted.z);
119 |   printf("\n");
120 | }
121 | int main() { // manual test driver: uncomment the cases to run; only test_refract() is enabled
122 |   // test_sphere_hit();
123 |   // test_update_sphere_cluster();
124 |   // test_reflect();
125 |   // test_dot();
126 |   // test_normalize();
127 |   test_refract();
128 |   return 0;
129 | }
130 | 


--------------------------------------------------------------------------------
/thirdparty/stb_image_write/stb_image_write.h:
--------------------------------------------------------------------------------
   1 | /* stb_image_write - v1.16 - public domain - http://nothings.org/stb
   2 |    writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
   3 |                                      no warranty implied; use at your own risk
   4 | 
   5 |    Before #including,
   6 | 
   7 |        #define STB_IMAGE_WRITE_IMPLEMENTATION
   8 | 
   9 |    in the file that you want to have the implementation.
  10 | 
  11 |    Will probably not work correctly with strict-aliasing optimizations.
  12 | 
  13 | ABOUT:
  14 | 
  15 |    This header file is a library for writing images to C stdio or a callback.
  16 | 
  17 |    The PNG output is not optimal; it is 20-50% larger than the file
  18 |    written by a decent optimizing implementation; though providing a custom
  19 |    zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that.
  20 |    This library is designed for source code compactness and simplicity,
  21 |    not optimal image file size or run-time performance.
  22 | 
  23 | BUILDING:
  24 | 
  25 |    You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
  26 |    You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
  27 |    malloc,realloc,free.
  28 |    You can #define STBIW_MEMMOVE() to replace memmove()
  29 |    You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function
  30 |    for PNG compression (instead of the builtin one), it must have the following signature:
  31 |    unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality);
  32 |    The returned data will be freed with STBIW_FREE() (free() by default),
  33 |    so it must be heap allocated with STBIW_MALLOC() (malloc() by default),
  34 | 
  35 | UNICODE:
  36 | 
  37 |    If compiling for Windows and you wish to use Unicode filenames, compile
  38 |    with
  39 |        #define STBIW_WINDOWS_UTF8
  40 |    and pass utf8-encoded filenames. Call stbiw_convert_wchar_to_utf8 to convert
  41 |    Windows wchar_t filenames to utf8.
  42 | 
  43 | USAGE:
  44 | 
  45 |    There are five functions, one for each image file format:
  46 | 
  47 |     int stbi_write_png(char const *filename, int w, int h, int comp, const void *data,
  48 |                        int stride_in_bytes);
  49 |     int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
  50 |     int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
  51 |     int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
  52 |     int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
  53 |      void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
  54 | 
  55 |    There are also five equivalent functions that use an arbitrary write function. You are
  56 |    expected to open/close your file-equivalent before and after calling these:
  57 | 
  58 |      int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const
  59 | void  *data, int stride_in_bytes); int stbi_write_bmp_to_func(stbi_write_func *func, void *context,
  60 | int w, int h, int comp, const void  *data); int stbi_write_tga_to_func(stbi_write_func *func, void
  61 | *context, int w, int h, int comp, const void  *data); int stbi_write_hdr_to_func(stbi_write_func
  62 | *func, void *context, int w, int h, int comp, const float *data); int
  63 | stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void
  64 | *data, int quality);
  65 | 
  66 |    where the callback is:
  67 |       void stbi_write_func(void *context, void *data, int size);
  68 | 
  69 |    You can configure it with these global variables:
  70 |       int stbi_write_tga_with_rle;             // defaults to true; set to 0 to disable RLE
  71 |       int stbi_write_png_compression_level;    // defaults to 8; set to higher for more compression
  72 |       int stbi_write_force_png_filter;         // defaults to -1; set to 0..5 to force a filter mode
  73 | 
  74 | 
  75 |    You can define STBI_WRITE_NO_STDIO to disable the file variant of these
  76 |    functions, so the library will not use stdio.h at all. However, this will
  77 |    also disable HDR writing, because it requires stdio for formatted output.
  78 | 
  79 |    Each function returns 0 on failure and non-0 on success.
  80 | 
  81 |    The functions create an image file defined by the parameters. The image
  82 |    is a rectangle of pixels stored from left-to-right, top-to-bottom.
  83 |    Each pixel contains 'comp' channels of data stored interleaved with 8-bits
  84 |    per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is
  85 |    monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall.
  86 |    The *data pointer points to the first byte of the top-left-most pixel.
  87 |    For PNG, "stride_in_bytes" is the distance in bytes from the first byte of
  88 |    a row of pixels to the first byte of the next row of pixels.
  89 | 
  90 |    PNG creates output files with the same number of components as the input.
  91 |    The BMP format expands Y to RGB in the file format and does not
  92 |    output alpha.
  93 | 
  94 |    PNG supports writing rectangles of data even when the bytes storing rows of
  95 |    data are not consecutive in memory (e.g. sub-rectangles of a larger image),
  96 |    by supplying the stride between the beginning of adjacent rows. The other
  97 |    formats do not. (Thus you cannot write a native-format BMP through the BMP
  98 |    writer, both because it is in BGR order and because it may have padding
  99 |    at the end of the line.)
 100 | 
 101 |    PNG allows you to set the deflate compression level by setting the global
 102 |    variable 'stbi_write_png_compression_level' (it defaults to 8).
 103 | 
 104 |    HDR expects linear float data. Since the format is always 32-bit rgb(e)
 105 |    data, alpha (if provided) is discarded, and for monochrome data it is
 106 |    replicated across all three channels.
 107 | 
 108 |    TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
 109 |    data, set the global variable 'stbi_write_tga_with_rle' to 0.
 110 | 
 111 |    JPEG does ignore alpha channels in input data; quality is between 1 and 100.
 112 |    Higher quality looks better but results in a bigger image.
 113 |    JPEG baseline (no JPEG progressive).
 114 | 
 115 | CREDITS:
 116 | 
 117 | 
 118 |    Sean Barrett           -    PNG/BMP/TGA
 119 |    Baldur Karlsson        -    HDR
 120 |    Jean-Sebastien Guay    -    TGA monochrome
 121 |    Tim Kelsey             -    misc enhancements
 122 |    Alan Hickman           -    TGA RLE
 123 |    Emmanuel Julien        -    initial file IO callback implementation
 124 |    Jon Olick              -    original jo_jpeg.cpp code
 125 |    Daniel Gibson          -    integrate JPEG, allow external zlib
 126 |    Aarni Koskela          -    allow choosing PNG filter
 127 | 
 128 |    bugfixes:
 129 |       github:Chribba
 130 |       Guillaume Chereau
 131 |       github:jry2
 132 |       github:romigrou
 133 |       Sergio Gonzalez
 134 |       Jonas Karlsson
 135 |       Filip Wasil
 136 |       Thatcher Ulrich
 137 |       github:poppolopoppo
 138 |       Patrick Boettcher
 139 |       github:xeekworx
 140 |       Cap Petschulat
 141 |       Simon Rodriguez
 142 |       Ivan Tikhonov
 143 |       github:ignotion
 144 |       Adam Schackart
 145 |       Andrew Kensler
 146 | 
 147 | LICENSE
 148 | 
 149 |   See end of file for license information.
 150 | 
 151 | */
 152 | 
 153 | #ifndef INCLUDE_STB_IMAGE_WRITE_H
 154 | #define INCLUDE_STB_IMAGE_WRITE_H
 155 | 
 156 | #include <stdlib.h>
 157 | 
 158 | // if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
 159 | #ifndef STBIWDEF
 160 | #ifdef STB_IMAGE_WRITE_STATIC
 161 | #define STBIWDEF static
 162 | #else
 163 | #ifdef __cplusplus
 164 | #define STBIWDEF extern "C"
 165 | #else
 166 | #define STBIWDEF extern
 167 | #endif
 168 | #endif
 169 | #endif
 170 | 
 171 | #ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations
 172 | STBIWDEF int stbi_write_tga_with_rle;
 173 | STBIWDEF int stbi_write_png_compression_level;
 174 | STBIWDEF int stbi_write_force_png_filter;
 175 | #endif
 176 | 
 177 | #ifndef STBI_WRITE_NO_STDIO
 178 | STBIWDEF int stbi_write_png(char const* filename, int w, int h, int comp, const void* data,
 179 |                             int stride_in_bytes);
 180 | STBIWDEF int stbi_write_bmp(char const* filename, int w, int h, int comp, const void* data);
 181 | STBIWDEF int stbi_write_tga(char const* filename, int w, int h, int comp, const void* data);
 182 | STBIWDEF int stbi_write_hdr(char const* filename, int w, int h, int comp, const float* data);
 183 | STBIWDEF int stbi_write_jpg(char const* filename, int x, int y, int comp, const void* data,
 184 |                             int quality);
 185 | 
 186 | #ifdef STBIW_WINDOWS_UTF8
 187 | STBIWDEF int stbiw_convert_wchar_to_utf8(char* buffer, size_t bufferlen, const wchar_t* input);
 188 | #endif
 189 | #endif
 190 | 
 191 | typedef void stbi_write_func(void* context, void* data, int size);
 192 | 
 193 | STBIWDEF int stbi_write_png_to_func(stbi_write_func* func, void* context, int w, int h, int comp,
 194 |                                     const void* data, int stride_in_bytes);
 195 | STBIWDEF int stbi_write_bmp_to_func(stbi_write_func* func, void* context, int w, int h, int comp,
 196 |                                     const void* data);
 197 | STBIWDEF int stbi_write_tga_to_func(stbi_write_func* func, void* context, int w, int h, int comp,
 198 |                                     const void* data);
 199 | STBIWDEF int stbi_write_hdr_to_func(stbi_write_func* func, void* context, int w, int h, int comp,
 200 |                                     const float* data);
 201 | STBIWDEF int stbi_write_jpg_to_func(stbi_write_func* func, void* context, int x, int y, int comp,
 202 |                                     const void* data, int quality);
 203 | 
 204 | STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
 205 | 
 206 | #endif // INCLUDE_STB_IMAGE_WRITE_H
 207 | 
 208 | #ifdef STB_IMAGE_WRITE_IMPLEMENTATION
 209 | 
 210 | #ifdef _WIN32
 211 | #ifndef _CRT_SECURE_NO_WARNINGS
 212 | #define _CRT_SECURE_NO_WARNINGS
 213 | #endif
 214 | #ifndef _CRT_NONSTDC_NO_DEPRECATE
 215 | #define _CRT_NONSTDC_NO_DEPRECATE
 216 | #endif
 217 | #endif
 218 | 
 219 | #ifndef STBI_WRITE_NO_STDIO
 220 | #include <stdio.h>
 221 | #endif // STBI_WRITE_NO_STDIO
 222 | 
 223 | #include <math.h>
 224 | #include <stdarg.h>
 225 | #include <stdlib.h>
 226 | #include <string.h>
 227 | 
 228 | #if defined(STBIW_MALLOC) && defined(STBIW_FREE) &&                                                \
 229 |     (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
 230 | // ok
 231 | #elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) &&                 \
 232 |     !defined(STBIW_REALLOC_SIZED)
 233 | // ok
 234 | #else
 235 | #error                                                                                             \
 236 |     "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
 237 | #endif
 238 | 
 239 | #ifndef STBIW_MALLOC
 240 | #define STBIW_MALLOC(sz) malloc(sz)
 241 | #define STBIW_REALLOC(p, newsz) realloc(p, newsz)
 242 | #define STBIW_FREE(p) free(p)
 243 | #endif
 244 | 
 245 | #ifndef STBIW_REALLOC_SIZED
 246 | #define STBIW_REALLOC_SIZED(p, oldsz, newsz) STBIW_REALLOC(p, newsz)
 247 | #endif
 248 | 
 249 | #ifndef STBIW_MEMMOVE
 250 | #define STBIW_MEMMOVE(a, b, sz) memmove(a, b, sz)
 251 | #endif
 252 | 
 253 | #ifndef STBIW_ASSERT
 254 | #include <assert.h>
 255 | #define STBIW_ASSERT(x) assert(x)
 256 | #endif
 257 | 
 258 | #define STBIW_UCHAR(x) (unsigned char)((x) & 0xff)
 259 | // Storage for the three public tunables: PNG level 8, TGA RLE on, no forced PNG filter (-1).
 260 | #ifdef STB_IMAGE_WRITE_STATIC
 261 | static int stbi_write_png_compression_level = 8;
 262 | static int stbi_write_tga_with_rle = 1;
 263 | static int stbi_write_force_png_filter = -1;
 264 | #else
 265 | int stbi_write_png_compression_level = 8;
 266 | int stbi_write_tga_with_rle = 1;
 267 | int stbi_write_force_png_filter = -1;
 268 | #endif
 269 | // File-local flag; when non-zero the BMP/TGA writers below emit rows in the opposite order.
 270 | static int stbi__flip_vertically_on_write = 0;
 271 | 
 272 | STBIWDEF void stbi_flip_vertically_on_write(int flag) { stbi__flip_vertically_on_write = flag; }
 273 | // Output sink shared by the writers: a callback plus a 64-byte staging buffer.
 274 | typedef struct {
 275 |   stbi_write_func* func;    // receives every chunk of output
 276 |   void* context;            // opaque pointer handed back to func on each call
 277 |   unsigned char buffer[64]; // staging area filled by stbiw__write1 / stbiw__write3
 278 |   int buf_used;             // number of bytes currently staged in buffer
 279 | } stbi__write_context;
 280 | // Initialize a callback-based context. Only func and context are assigned here; the public
 281 | // entry points in this file zero the rest of the struct with `= {0}` before calling this.
 282 | static void stbi__start_write_callbacks(stbi__write_context* s, stbi_write_func* c, void* context) {
 283 |   s->func = c;
 284 |   s->context = context;
 285 | }
 286 | 
 287 | #ifndef STBI_WRITE_NO_STDIO
 288 | // stbi_write_func adapter for stdio: context is the FILE* to write to; fwrite's result is ignored.
 289 | static void stbi__stdio_write(void* context, void* data, int size) {
 290 |   fwrite(data, 1, size, (FILE*)context);
 291 | }
 292 | // Win32 + STBIW_WINDOWS_UTF8 only: declares the two conversion APIs by hand and wraps one of them.
 293 | #if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
 294 | #ifdef __cplusplus
 295 | #define STBIW_EXTERN extern "C"
 296 | #else
 297 | #define STBIW_EXTERN extern
 298 | #endif
 299 | STBIW_EXTERN
 300 | __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags,
 301 |                                                         const char* str, int cbmb, wchar_t* widestr,
 302 |                                                         int cchwide);
 303 | STBIW_EXTERN
 304 | __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags,
 305 |                                                         const wchar_t* widestr, int cchwide,
 306 |                                                         char* str, int cbmb, const char* defchar,
 307 |                                                         int* used_default);
 308 | // Converts input to UTF-8 (code page 65001) into buffer; returns WideCharToMultiByte's result.
 309 | STBIWDEF int stbiw_convert_wchar_to_utf8(char* buffer, size_t bufferlen, const wchar_t* input) {
 310 |   return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int)bufferlen, NULL, NULL);
 311 | }
 312 | #endif
 313 | // Opens filename with the given mode and returns the FILE*; 0 if a conversion or *_s open fails.
 314 | static FILE* stbiw__fopen(char const* filename, char const* mode) {
 315 |   FILE* f;
 316 | #if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
 317 |   wchar_t wMode[64];
 318 |   wchar_t wFilename[1024];
 319 |   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename,
 320 |                                sizeof(wFilename) / sizeof(*wFilename)))
 321 |     return 0; // filename could not be converted into the fixed-size wide buffer
 322 | 
 323 |   if (0 ==
 324 |       MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode) / sizeof(*wMode)))
 325 |     return 0; // mode could not be converted into the fixed-size wide buffer
 326 | 
 327 | #if defined(_MSC_VER) && _MSC_VER >= 1400
 328 |   if (0 != _wfopen_s(&f, wFilename, wMode))
 329 |     f = 0;
 330 | #else
 331 |   f = _wfopen(wFilename, wMode);
 332 | #endif
 333 | 
 334 | #elif defined(_MSC_VER) && _MSC_VER >= 1400
 335 |   if (0 != fopen_s(&f, filename, mode))
 336 |     f = 0;
 337 | #else
 338 |   f = fopen(filename, mode); // every other toolchain: plain fopen
 339 | #endif
 340 |   return f;
 341 | }
 342 | // Opens filename in "wb" mode and binds it to s as a stdio sink; returns non-zero iff it opened.
 343 | static int stbi__start_write_file(stbi__write_context* s, const char* filename) {
 344 |   FILE* f = stbiw__fopen(filename, "wb");
 345 |   stbi__start_write_callbacks(s, stbi__stdio_write, (void*)f); // done even when f is NULL
 346 |   return f != NULL;
 347 | }
 348 | // Closes the FILE* held in s->context (stored there by stbi__start_write_file).
 349 | static void stbi__end_write_file(stbi__write_context* s) { fclose((FILE*)s->context); }
 350 | 
 351 | #endif // !STBI_WRITE_NO_STDIO
 352 | // 32-bit unsigned type; the array typedef gets size -1 (a compile error) if it is not 4 bytes.
 353 | typedef unsigned int stbiw_uint32;
 354 | typedef int stb_image_write_test[sizeof(stbiw_uint32) == 4 ? 1 : -1];
 355 | // For each '1'/'2'/'4' in fmt, writes the next int argument as that many bytes, low byte first.
 356 | static void stbiw__writefv(stbi__write_context* s, const char* fmt, va_list v) {
 357 |   while (*fmt) {
 358 |     switch (*fmt++) {
 359 |     case ' ': // spaces write nothing and consume no argument
 360 |       break;
 361 |     case '1': {
 362 |       unsigned char x = STBIW_UCHAR(va_arg(v, int));
 363 |       s->func(s->context, &x, 1);
 364 |       break;
 365 |     }
 366 |     case '2': {
 367 |       int x = va_arg(v, int);
 368 |       unsigned char b[2];
 369 |       b[0] = STBIW_UCHAR(x);
 370 |       b[1] = STBIW_UCHAR(x >> 8);
 371 |       s->func(s->context, b, 2);
 372 |       break;
 373 |     }
 374 |     case '4': {
 375 |       stbiw_uint32 x = va_arg(v, int);
 376 |       unsigned char b[4];
 377 |       b[0] = STBIW_UCHAR(x);
 378 |       b[1] = STBIW_UCHAR(x >> 8);
 379 |       b[2] = STBIW_UCHAR(x >> 16);
 380 |       b[3] = STBIW_UCHAR(x >> 24);
 381 |       s->func(s->context, b, 4);
 382 |       break;
 383 |     }
 384 |     default: // any other format character is a programming error
 385 |       STBIW_ASSERT(0);
 386 |       return;
 387 |     }
 388 |   }
 389 | }
 390 | // Variadic front end for stbiw__writefv: forwards fmt and the trailing arguments unchanged.
 391 | static void stbiw__writef(stbi__write_context* s, const char* fmt, ...) {
 392 |   va_list v;
 393 |   va_start(v, fmt);
 394 |   stbiw__writefv(s, fmt, v);
 395 |   va_end(v);
 396 | }
 397 | // Sends any staged bytes in s->buffer to the callback and resets the fill count to zero.
 398 | static void stbiw__write_flush(stbi__write_context* s) {
 399 |   if (s->buf_used) {
 400 |     s->func(s->context, &s->buffer, s->buf_used);
 401 |     s->buf_used = 0;
 402 |   }
 403 | }
 404 | // Writes one byte straight to the callback, bypassing the staging buffer.
 405 | static void stbiw__putc(stbi__write_context* s, unsigned char c) { s->func(s->context, &c, 1); }
 406 | // Stages one byte in s->buffer, flushing first if it would not fit.
 407 | static void stbiw__write1(stbi__write_context* s, unsigned char a) {
 408 |   if ((size_t)s->buf_used + 1 > sizeof(s->buffer))
 409 |     stbiw__write_flush(s);
 410 |   s->buffer[s->buf_used++] = a;
 411 | }
 412 | // Stages three bytes (a, b, c in that order) in s->buffer, flushing first if they would not fit.
 413 | static void stbiw__write3(stbi__write_context* s, unsigned char a, unsigned char b,
 414 |                           unsigned char c) {
 415 |   int n;
 416 |   if ((size_t)s->buf_used + 3 > sizeof(s->buffer))
 417 |     stbiw__write_flush(s);
 418 |   n = s->buf_used;
 419 |   s->buf_used = n + 3;
 420 |   s->buffer[n + 0] = a;
 421 |   s->buffer[n + 1] = b;
 422 |   s->buffer[n + 2] = c;
 423 | }
 424 | // Stages one pixel read from d (comp channels), honoring the channel-order and alpha options.
 425 | static void stbiw__write_pixel(stbi__write_context* s, int rgb_dir, int comp, int write_alpha,
 426 |                                int expand_mono, unsigned char* d) {
 427 |   unsigned char bg[3] = {255, 0, 255}, px[3];
 428 |   int k;
 429 | 
 430 |   if (write_alpha < 0) // negative: the last channel is written before the color bytes
 431 |     stbiw__write1(s, d[comp - 1]);
 432 | 
 433 |   switch (comp) {
 434 |   case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case
 435 |   case 1:
 436 |     if (expand_mono)
 437 |       stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
 438 |     else
 439 |       stbiw__write1(s, d[0]); // monochrome TGA
 440 |     break;
 441 |   case 4:
 442 |     if (!write_alpha) {
 443 |       // composite against pink background
 444 |       for (k = 0; k < 3; ++k)
 445 |         px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
 446 |       stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
 447 |       break;
 448 |     }
 449 |     /* FALLTHROUGH */
 450 |   case 3: // rgb_dir = 1 keeps d's channel order, rgb_dir = -1 reverses it
 451 |     stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
 452 |     break;
 453 |   }
 454 |   if (write_alpha > 0) // positive: the last channel is written after the color bytes
 455 |     stbiw__write1(s, d[comp - 1]);
 456 | }
 457 | // Writes all x*y pixels row by row (direction from vdir), then scanline_pad zero bytes per row.
 458 | static void stbiw__write_pixels(stbi__write_context* s, int rgb_dir, int vdir, int x, int y,
 459 |                                 int comp, void* data, int write_alpha, int scanline_pad,
 460 |                                 int expand_mono) {
 461 |   stbiw_uint32 zero = 0; // source of the padding bytes, so at most 4 can be written per row
 462 |   int i, j, j_end;
 463 | 
 464 |   if (y <= 0)
 465 |     return;
 466 | 
 467 |   if (stbi__flip_vertically_on_write)
 468 |     vdir *= -1;
 469 | 
 470 |   if (vdir < 0) { // walk rows from the last one down to the first
 471 |     j_end = -1;
 472 |     j = y - 1;
 473 |   } else {
 474 |     j_end = y;
 475 |     j = 0;
 476 |   }
 477 | 
 478 |   for (; j != j_end; j += vdir) {
 479 |     for (i = 0; i < x; ++i) {
 480 |       unsigned char* d = (unsigned char*)data + (j * x + i) * comp;
 481 |       stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
 482 |     }
 483 |     stbiw__write_flush(s); // push staged pixels out before the padding goes to the callback
 484 |     s->func(s->context, &zero, scanline_pad);
 485 |   }
 486 | }
 487 | // Writes a header (fmt + varargs) followed by the pixels; returns 0 if x or y is negative, else 1.
 488 | static int stbiw__outfile(stbi__write_context* s, int rgb_dir, int vdir, int x, int y, int comp,
 489 |                           int expand_mono, void* data, int alpha, int pad, const char* fmt, ...) {
 490 |   if (y < 0 || x < 0) {
 491 |     return 0;
 492 |   } else {
 493 |     va_list v;
 494 |     va_start(v, fmt);
 495 |     stbiw__writefv(s, fmt, v);
 496 |     va_end(v);
 497 |     stbiw__write_pixels(s, rgb_dir, vdir, x, y, comp, data, alpha, pad, expand_mono);
 498 |     return 1;
 499 |   }
 500 | }
 501 | // Writes a BMP: 24-bit rows padded to 4 bytes when comp != 4, otherwise 32-bit with a V4 header.
 502 | static int stbi_write_bmp_core(stbi__write_context* s, int x, int y, int comp, const void* data) {
 503 |   if (comp != 4) {
 504 |     // write RGB bitmap
 505 |     int pad = (-x * 3) & 3; // bytes needed to round each 3*x-byte row up to a multiple of 4
 506 |     return stbiw__outfile(s, -1, -1, x, y, comp, 1, (void*)data, 0, pad,
 507 |                           "11 4 22 4"
 508 |                           "4 44 22 444444",
 509 |                           'B', 'M', 14 + 40 + (x * 3 + pad) * y, 0, 0, 14 + 40, // file header
 510 |                           40, x, y, 1, 24, 0, 0, 0, 0, 0, 0);                   // bitmap header
 511 |   } else {
 512 |     // RGBA bitmaps need a v4 header
 513 |     // use BI_BITFIELDS mode with 32bpp and alpha mask
 514 |     // (straight BI_RGB with alpha mask doesn't work in most readers)
 515 |     return stbiw__outfile(s, -1, -1, x, y, comp, 1, (void*)data, 1, 0,
 516 |                           "11 4 22 4"
 517 |                           "4 44 22 444444 4444 4 444 444 444 444",
 518 |                           'B', 'M', 14 + 108 + x * y * 4, 0, 0, 14 + 108, // file header
 519 |                           108, x, y, 1, 32, 3, 0, 0, 0, 0, 0, 0xff0000, 0xff00, 0xff, 0xff000000u,
 520 |                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // bitmap V4 header
 521 |   }
 522 | }
 523 | // Public entry point: writes a BMP through func/context; returns stbi_write_bmp_core's result.
 524 | STBIWDEF int stbi_write_bmp_to_func(stbi_write_func* func, void* context, int x, int y, int comp,
 525 |                                     const void* data) {
 526 |   stbi__write_context s = {0};
 527 |   stbi__start_write_callbacks(&s, func, context);
 528 |   return stbi_write_bmp_core(&s, x, y, comp, data);
 529 | }
 530 | // Public entry point: writes a BMP to filename; returns 0 if the file cannot be opened.
 531 | #ifndef STBI_WRITE_NO_STDIO
 532 | STBIWDEF int stbi_write_bmp(char const* filename, int x, int y, int comp, const void* data) {
 533 |   stbi__write_context s = {0};
 534 |   if (stbi__start_write_file(&s, filename)) {
 535 |     int r = stbi_write_bmp_core(&s, x, y, comp, data);
 536 |     stbi__end_write_file(&s); // the file is closed whether or not the core call succeeded
 537 |     return r;
 538 |   } else
 539 |     return 0;
 540 | }
 541 | #endif //! STBI_WRITE_NO_STDIO
 542 | // Writes a TGA, run-length encoded unless stbi_write_tga_with_rle is 0; 0 for negative sizes.
 543 | static int stbi_write_tga_core(stbi__write_context* s, int x, int y, int comp, void* data) {
 544 |   int has_alpha = (comp == 2 || comp == 4);
 545 |   int colorbytes = has_alpha ? comp - 1 : comp;
 546 |   int format =
 547 |       colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
 548 | 
 549 |   if (y < 0 || x < 0)
 550 |     return 0;
 551 | 
 552 |   if (!stbi_write_tga_with_rle) {
 553 |     return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void*)data, has_alpha, 0, "111 221 2222 11", 0,
 554 |                           0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8,
 555 |                           has_alpha * 8);
 556 |   } else {
 557 |     int i, j, k;
 558 |     int jend, jdir;
 559 | 
 560 |     stbiw__writef(s, "111 221 2222 11", 0, 0, format + 8, 0, 0, 0, 0, 0, x, y,
 561 |                   (colorbytes + has_alpha) * 8, has_alpha * 8); // RLE header uses format + 8
 562 | 
 563 |     if (stbi__flip_vertically_on_write) {
 564 |       j = 0;
 565 |       jend = y;
 566 |       jdir = 1;
 567 |     } else { // default: rows are emitted from the last one to the first
 568 |       j = y - 1;
 569 |       jend = -1;
 570 |       jdir = -1;
 571 |     }
 572 |     for (; j != jend; j += jdir) {
 573 |       unsigned char* row = (unsigned char*)data + j * x * comp;
 574 |       int len;
 575 | 
 576 |       for (i = 0; i < x; i += len) { // each iteration emits one packet covering len pixels
 577 |         unsigned char* begin = row + i * comp;
 578 |         int diff = 1; // non-zero: literal packet; zero: run of identical pixels
 579 |         len = 1;
 580 | 
 581 |         if (i < x - 1) {
 582 |           ++len;
 583 |           diff = memcmp(begin, row + (i + 1) * comp, comp);
 584 |           if (diff) {
 585 |             const unsigned char* prev = begin;
 586 |             for (k = i + 2; k < x && len < 128; ++k) {
 587 |               if (memcmp(prev, row + k * comp, comp)) {
 588 |                 prev += comp;
 589 |                 ++len;
 590 |               } else {
 591 |                 --len;
 592 |                 break;
 593 |               }
 594 |             }
 595 |           } else {
 596 |             for (k = i + 2; k < x && len < 128; ++k) { // extend the run, at most 128 pixels
 597 |               if (!memcmp(begin, row + k * comp, comp)) {
 598 |                 ++len;
 599 |               } else {
 600 |                 break;
 601 |               }
 602 |             }
 603 |           }
 604 |         }
 605 | 
 606 |         if (diff) {
 607 |           unsigned char header = STBIW_UCHAR(len - 1); // literal packet: count - 1, then len pixels
 608 |           stbiw__write1(s, header);
 609 |           for (k = 0; k < len; ++k) {
 610 |             stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
 611 |           }
 612 |         } else {
 613 |           unsigned char header = STBIW_UCHAR(len - 129); // run packet: 0x80 | (len - 1), one pixel
 614 |           stbiw__write1(s, header);
 615 |           stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
 616 |         }
 617 |       }
 618 |     }
 619 |     stbiw__write_flush(s);
 620 |   }
 621 |   return 1;
 622 | }
 623 | // Public entry point: writes a TGA through func/context; returns stbi_write_tga_core's result.
 624 | STBIWDEF int stbi_write_tga_to_func(stbi_write_func* func, void* context, int x, int y, int comp,
 625 |                                     const void* data) {
 626 |   stbi__write_context s = {0};
 627 |   stbi__start_write_callbacks(&s, func, context);
 628 |   return stbi_write_tga_core(&s, x, y, comp, (void*)data);
 629 | }
 630 | // Public entry point: writes a TGA to filename; returns 0 if the file cannot be opened.
 631 | #ifndef STBI_WRITE_NO_STDIO
 632 | STBIWDEF int stbi_write_tga(char const* filename, int x, int y, int comp, const void* data) {
 633 |   stbi__write_context s = {0};
 634 |   if (stbi__start_write_file(&s, filename)) {
 635 |     int r = stbi_write_tga_core(&s, x, y, comp, (void*)data);
 636 |     stbi__end_write_file(&s); // the file is closed whether or not the core call succeeded
 637 |     return r;
 638 |   } else
 639 |     return 0;
 640 | }
 641 | #endif
 642 | 
 643 | // *************************************************************************************************
 644 | // Radiance RGBE HDR writer
 645 | // by Baldur Karlsson
 646 | 
 647 | #define stbiw__max(a, b) ((a) > (b) ? (a) : (b))
 648 | 
 649 | #ifndef STBI_WRITE_NO_STDIO
 650 | // Packs three linear floats into 4 bytes: scaled components plus a shared exponent biased by 128.
 651 | static void stbiw__linear_to_rgbe(unsigned char* rgbe, float* linear) {
 652 |   int exponent;
 653 |   float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
 654 | 
 655 |   if (maxcomp < 1e-32f) { // too small to represent: all four output bytes are zero
 656 |     rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
 657 |   } else {
 658 |     float normalize = (float)frexp(maxcomp, &exponent) * 256.0f / maxcomp;
 659 | 
 660 |     rgbe[0] = (unsigned char)(linear[0] * normalize);
 661 |     rgbe[1] = (unsigned char)(linear[1] * normalize);
 662 |     rgbe[2] = (unsigned char)(linear[2] * normalize);
 663 |     rgbe[3] = (unsigned char)(exponent + 128);
 664 |   }
 665 | }
 666 | 
 667 | static void stbiw__write_run_data(stbi__write_context* s, int length, unsigned char databyte) {
         // Emits a two-byte run record through s->func: a count byte holding (length + 128),
         // then the byte value that is repeated.
 668 |   unsigned char run_marker = STBIW_UCHAR(length + 128);
 669 |   STBIW_ASSERT(length + 128 <= 255);
 670 |   s->func(s->context, &run_marker, 1);
 671 |   s->func(s->context, &databyte, 1);
 672 | }
 673 | 
 674 | static void stbiw__write_dump_data(stbi__write_context* s, int length, unsigned char* data) {
         // Emits a literal record through s->func: a count byte holding `length`, then `length`
         // bytes copied verbatim from `data`.
 675 |   unsigned char count_byte = STBIW_UCHAR(length);
 676 |   STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
 677 |   s->func(s->context, &count_byte, 1);
 678 |   s->func(s->context, data, length);
 679 | }
 680 | 
 681 | static void stbiw__write_hdr_scanline(stbi__write_context* s, int width, int ncomp,
 682 |                                       unsigned char* scratch, float* scanline) {
         // Writes one scanline of float pixels as RGBE bytes through s->func.
         //   width    - number of pixels in the scanline
         //   ncomp    - floats per pixel; 3 or 4 take R,G,B from the first three floats, any other
         //              value replicates the first float into all three channels
         //   scratch  - caller-provided buffer used only on the RLE path, written as four
         //              consecutive planes of `width` bytes (indices up to width * 4 - 1)
         //   scanline - source floats, `ncomp` per pixel
 683 |   unsigned char scanlineheader[4] = {2, 2, 0, 0};
 684 |   unsigned char rgbe[4];
 685 |   float linear[3];
 686 |   int x;
 687 | 
         // bytes 2 and 3 of the header carry the width, high byte first
 688 |   scanlineheader[2] = (width & 0xff00) >> 8;
 689 |   scanlineheader[3] = (width & 0x00ff);
 690 | 
 691 |   /* skip RLE for images too small or large */
 692 |   if (width < 8 || width >= 32768) {
           // flat path: each pixel is written as 4 interleaved RGBE bytes, no scanline header
 693 |     for (x = 0; x < width; x++) {
 694 |       switch (ncomp) {
 695 |       case 4: /* fallthrough */
 696 |       case 3:
 697 |         linear[2] = scanline[x * ncomp + 2];
 698 |         linear[1] = scanline[x * ncomp + 1];
 699 |         linear[0] = scanline[x * ncomp + 0];
 700 |         break;
 701 |       default:
 702 |         linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0];
 703 |         break;
 704 |       }
 705 |       stbiw__linear_to_rgbe(rgbe, linear);
 706 |       s->func(s->context, rgbe, 4);
 707 |     }
 708 |   } else {
 709 |     int c, r;
 710 |     /* encode into scratch buffer */
 711 |     for (x = 0; x < width; x++) {
 712 |       switch (ncomp) {
 713 |       case 4: /* fallthrough */
 714 |       case 3:
 715 |         linear[2] = scanline[x * ncomp + 2];
 716 |         linear[1] = scanline[x * ncomp + 1];
 717 |         linear[0] = scanline[x * ncomp + 0];
 718 |         break;
 719 |       default:
 720 |         linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0];
 721 |         break;
 722 |       }
 723 |       stbiw__linear_to_rgbe(rgbe, linear);
             // de-interleave: plane k of scratch holds byte k of every pixel
 724 |       scratch[x + width * 0] = rgbe[0];
 725 |       scratch[x + width * 1] = rgbe[1];
 726 |       scratch[x + width * 2] = rgbe[2];
 727 |       scratch[x + width * 3] = rgbe[3];
 728 |     }
 729 | 
 730 |     s->func(s->context, scanlineheader, 4);
 731 | 
 732 |     /* RLE each component separately */
 733 |     for (c = 0; c < 4; c++) {
 734 |       unsigned char* comp = &scratch[width * c];
 735 | 
 736 |       x = 0;
 737 |       while (x < width) {
 738 |         // find first run
 739 |         r = x;
 740 |         while (r + 2 < width) {
 741 |           if (comp[r] == comp[r + 1] && comp[r] == comp[r + 2])
 742 |             break;
 743 |           ++r;
 744 |         }
               // no run of three equal bytes was found: treat the rest of the plane as literals
 745 |         if (r + 2 >= width)
 746 |           r = width;
 747 |         // dump up to first run
 748 |         while (x < r) {
 749 |           int len = r - x;
 750 |           if (len > 128)
 751 |             len = 128; // literal records are capped at 128 bytes each
 752 |           stbiw__write_dump_data(s, len, &comp[x]);
 753 |           x += len;
 754 |         }
 755 |         // if there's a run, output it
 756 |         if (r + 2 <
 757 |             width) { // same test as what we break out of in search loop, so only true if we break'd
 758 |           // find next byte after run
 759 |           while (r < width && comp[r] == comp[x])
 760 |             ++r;
 761 |           // output run up to r
 762 |           while (x < r) {
 763 |             int len = r - x;
 764 |             if (len > 127)
 765 |               len = 127; // run records are capped at 127 so that len + 128 fits in one byte
 766 |             stbiw__write_run_data(s, len, comp[x]);
 767 |             x += len;
 768 |           }
 769 |         }
 770 |       }
 771 |     }
 772 |   }
 773 | }
 774 | 
 775 | static int stbi_write_hdr_core(stbi__write_context* s, int x, int y, int comp, float* data) {
         // Writes a complete Radiance HDR image (text header followed by `y` scanlines) through
         // s->func.
         //   x, y - image width and height in pixels; both must be positive
         //   comp - floats per pixel in `data`
         //   data - x * y * comp floats, one row after another
         // Returns 1 on success, or 0 when the arguments are rejected or the scratch buffer cannot
         // be allocated. In both failure cases nothing has been written yet.
 776 |   if (y <= 0 || x <= 0 || data == NULL)
 777 |     return 0;
 778 |   else {
 779 |     // Each component is stored separately. Allocate scratch space for full output scanline.
           // The size is computed as size_t so a very wide image cannot overflow int arithmetic.
 780 |     unsigned char* scratch = (unsigned char*)STBIW_MALLOC((size_t)x * 4);
 781 |     int i, len;
 782 |     char buffer[128];
 783 |     char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
           // stbiw__write_hdr_scanline stores into scratch, so a failed allocation must stop here
           if (scratch == NULL)
             return 0;
 784 |     s->func(s->context, header, sizeof(header) - 1);
 785 | 
 786 | #ifdef __STDC_LIB_EXT1__
 787 |     len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n",
 788 |                     y, x);
 789 | #else
 790 |     len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
 791 | #endif
 792 |     s->func(s->context, buffer, len);
 793 | 
           // rows are emitted bottom-up when stbi__flip_vertically_on_write is set
 794 |     for (i = 0; i < y; i++)
 795 |       stbiw__write_hdr_scanline(s, x, comp, scratch,
 796 |                                 data + comp * x * (stbi__flip_vertically_on_write ? y - 1 - i : i));
 797 |     STBIW_FREE(scratch);
 798 |     return 1;
 799 |   }
 800 | }
 801 | 
 802 | STBIWDEF int stbi_write_hdr_to_func(stbi_write_func* func, void* context, int x, int y, int comp,
 803 |                                     const float* data) {
         // Callback-based HDR entry point: binds `func`/`context` to a zeroed write context and
         // hands the image to stbi_write_hdr_core, whose status is returned unchanged.
 804 |   stbi__write_context writer = {0};
 805 |   float* pixels = (float*)data; // the core routine takes a non-const pointer
 806 |   stbi__start_write_callbacks(&writer, func, context);
         return stbi_write_hdr_core(&writer, x, y, comp, pixels);
 807 | }
 808 | 
 809 | STBIWDEF int stbi_write_hdr(char const* filename, int x, int y, int comp, const float* data) {
         // File-based HDR entry point. Returns 0 when stbi__start_write_file reports failure,
         // otherwise the status produced by stbi_write_hdr_core.
 810 |   stbi__write_context writer = {0};
 811 |   int status;
 812 |   if (!stbi__start_write_file(&writer, filename))
 813 |     return 0;
 814 |   status = stbi_write_hdr_core(&writer, x, y, comp, (float*)data);
 815 |   stbi__end_write_file(&writer);
 816 |   return status;
 817 | }
 818 | #endif // STBI_WRITE_NO_STDIO
 819 | 
 820 | //////////////////////////////////////////////////////////////////////////////
 821 | //
 822 | // PNG writer
 823 | //
 824 | 
 825 | #ifndef STBIW_ZLIB_COMPRESS
 826 | // stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
       // The element pointer handed to users is preceded by two ints: [0] is the capacity
       // (stbiw__sbm) and [1] is the element count (stbiw__sbn). A NULL pointer is an empty buffer.
 827 | #define stbiw__sbraw(a) ((int*)(void*)(a) - 2)
 828 | #define stbiw__sbm(a) stbiw__sbraw(a)[0]
 829 | #define stbiw__sbn(a) stbiw__sbraw(a)[1]
 830 | 
       // `n` is parenthesized here so that a compound argument expands with the intended precedence.
 831 | #define stbiw__sbneedgrow(a, n) ((a) == 0 || stbiw__sbn(a) + (n) >= stbiw__sbm(a))
 832 | #define stbiw__sbmaybegrow(a, n) (stbiw__sbneedgrow(a, (n)) ? stbiw__sbgrow(a, n) : 0)
 833 | #define stbiw__sbgrow(a, n) stbiw__sbgrowf((void**)&(a), (n), sizeof(*(a)))
 834 | 
       // stbiw__sbpush may reallocate, so `a` must be an lvalue and is evaluated more than once.
 835 | #define stbiw__sbpush(a, v) (stbiw__sbmaybegrow(a, 1), (a)[stbiw__sbn(a)++] = (v))
 836 | #define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0)
 837 | #define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)), 0 : 0)
 838 | 
 839 | static void* stbiw__sbgrowf(void** arr, int increment, int itemsize) {
         // Grows (or first allocates) the stretchy buffer that *arr points into.
         // New capacity is `increment + 1` for a fresh buffer, else twice the old capacity plus
         // `increment`. On success *arr is updated to the new element pointer; on allocation
         // failure *arr is left untouched. Returns *arr either way.
 840 |   int is_new = (*arr == 0);
 841 |   int cap = is_new ? increment + 1 : 2 * stbiw__sbm(*arr) + increment;
 842 |   void* block = STBIW_REALLOC_SIZED(is_new ? 0 : stbiw__sbraw(*arr),
 843 |                                     is_new ? 0 : (stbiw__sbm(*arr) * itemsize + sizeof(int) * 2),
 844 |                                     itemsize * cap + sizeof(int) * 2);
 845 |   STBIW_ASSERT(block);
 846 |   if (block) {
 847 |     if (is_new)
 848 |       ((int*)block)[1] = 0; // a fresh buffer starts with an element count of zero
 849 |     *arr = (void*)((int*)block + 2);
 850 |     stbiw__sbm(*arr) = cap;
 851 |   }
 852 |   return *arr;
         }
 853 | 
 854 | static unsigned char* stbiw__zlib_flushf(unsigned char* data, unsigned int* bitbuffer,
 855 |                                          int* bitcount) {
         // Moves every complete byte out of the bit accumulator (low byte first) onto the end of
         // the stretchy buffer `data`. Returns the buffer pointer, which may have been reallocated.
 856 |   for (; *bitcount >= 8; *bitbuffer >>= 8, *bitcount -= 8) {
 857 |     stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
 858 |   }
 859 |   return data;
 860 | }
 863 | 
 864 | static int stbiw__zlib_bitrev(int code, int codebits) {
         // Returns the lowest `codebits` bits of `code` in reversed bit order.
 865 |   int reversed = 0;
 866 |   for (; codebits != 0; --codebits, code >>= 1)
 867 |     reversed = (reversed << 1) | (code & 1);
 868 |   return reversed;
 869 | }
 872 | 
 873 | static unsigned int stbiw__zlib_countm(unsigned char* a, unsigned char* b, int limit) {
         // Counts how many leading bytes of `a` and `b` are equal, examining at most
         // min(limit, 258) bytes.
 874 |   int cap = limit < 258 ? limit : 258;
 875 |   int matched = 0;
 876 |   while (matched < cap && a[matched] == b[matched])
 877 |     ++matched;
 878 |   return matched;
 879 | }
 880 | 
 881 | static unsigned int stbiw__zhash(unsigned char* data) {
         // Mixes the three bytes data[0..2] into a 32-bit hash through a fixed shift/xor/add chain.
 882 |   stbiw_uint32 h = data[0] + (data[1] << 8) + (data[2] << 16);
 883 |   h = h ^ (h << 3);
 884 |   h = h + (h >> 5);
 885 |   h = h ^ (h << 4);
 886 |   h = h + (h >> 17);
 887 |   h = h ^ (h << 25);
 888 |   h = h + (h >> 6);
 889 |   return h;
 890 | }
 891 | 
 892 | #define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))
       // The macros below are not self-contained: they read and write locals named `out`,
       // `bitbuf` and `bitcount` that must exist in the function that uses them.
       // stbiw__zlib_add ORs `code` into the accumulator at the current bit position, advances
       // the position by `codebits`, then flushes any complete bytes.
 893 | #define stbiw__zlib_add(code, codebits)                                                            \
 894 |   (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
       // stbiw__zlib_huffa emits code `b` of length `c` with its bits reversed.
 895 | #define stbiw__zlib_huffa(b, c) stbiw__zlib_add(stbiw__zlib_bitrev(b, c), c)
 896 | // default huffman tables
       // Symbol ranges handled: huff1 0..143 (8 bits), huff2 144..255 (9 bits),
       // huff3 256..279 (7 bits), huff4 280 and above (8 bits).
 897 | #define stbiw__zlib_huff1(n) stbiw__zlib_huffa(0x30 + (n), 8)
 898 | #define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + (n) - 144, 9)
 899 | #define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + (n) - 256, 7)
 900 | #define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + (n) - 280, 8)
       // stbiw__zlib_huff picks the right range for any symbol; stbiw__zlib_huffb is the
       // two-way variant used for byte values (0..255) only.
 901 | #define stbiw__zlib_huff(n)                                                                        \
 902 |   ((n) <= 143   ? stbiw__zlib_huff1(n)                                                             \
 903 |    : (n) <= 255 ? stbiw__zlib_huff2(n)                                                             \
 904 |    : (n) <= 279 ? stbiw__zlib_huff3(n)                                                             \
 905 |                 : stbiw__zlib_huff4(n))
 906 | #define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))
 907 | 
       // Number of hash buckets; used as a mask (stbiw__ZHASH - 1), so it must stay a power of two.
 908 | #define stbiw__ZHASH 16384
 909 | 
 910 | #endif // STBIW_ZLIB_COMPRESS
 911 | 
 912 | STBIWDEF unsigned char* stbi_zlib_compress(unsigned char* data, int data_len, int* out_len,
 913 |                                            int quality) {
         // Compresses data[0 .. data_len) into a zlib stream.
         //   data, data_len - input bytes
         //   out_len        - receives the length of the returned stream
         //   quality        - values below 5 are raised to 5; a hash bucket is cut in half once it
         //                    holds 2 * quality candidate positions
         // Returns a heap buffer holding the stream, or NULL if the hash table cannot be
         // allocated. When STBIW_ZLIB_COMPRESS is defined the call is forwarded to it instead.
 914 | #ifdef STBIW_ZLIB_COMPRESS
 915 |   // user provided a zlib compress implementation, use that
 916 |   return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
 917 | #else  // use builtin
         // lengthc/lengtheb: base match length and number of extra bits for each length code;
         // distc/disteb: the same for distance codes. The final entry of lengthc and distc only
         // terminates the search loops below.
 918 |   static unsigned short lengthc[] = {3,  4,  5,  6,   7,   8,   9,   10,  11,  13,
 919 |                                      15, 17, 19, 23,  27,  31,  35,  43,  51,  59,
 920 |                                      67, 83, 99, 115, 131, 163, 195, 227, 258, 259};
 921 |   static unsigned char lengtheb[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
 922 |                                      2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0};
 923 |   static unsigned short distc[] = {
 924 |       1,   2,   3,   4,   5,    7,    9,    13,   17,   25,   33,   49,    65,    97,    129,  193,
 925 |       257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 32768};
 926 |   static unsigned char disteb[] = {0, 0, 0, 0, 1, 1, 2, 2,  3,  3,  4,  4,  5,  5,  6,
 927 |                                    6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
 928 |   unsigned int bitbuf = 0;
 929 |   int i, j, bitcount = 0;
 930 |   unsigned char* out = NULL; // stretchy buffer that accumulates the output stream
         // one stretchy buffer of candidate input positions per hash bucket
 931 |   unsigned char*** hash_table =
 932 |       (unsigned char***)STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char**));
 933 |   if (hash_table == NULL)
 934 |     return NULL;
 935 |   if (quality < 5)
 936 |     quality = 5;
 937 | 
 938 |   stbiw__sbpush(out, 0x78); // DEFLATE 32K window
 939 |   stbiw__sbpush(out, 0x5e); // FLEVEL = 1
 940 |   stbiw__zlib_add(1, 1);    // BFINAL = 1
 941 |   stbiw__zlib_add(1, 2);    // BTYPE = 1 -- fixed huffman
 942 | 
 943 |   for (i = 0; i < stbiw__ZHASH; ++i)
 944 |     hash_table[i] = NULL;
 945 | 
 946 |   i = 0;
 947 |   while (i < data_len - 3) {
 948 |     // hash next 3 bytes of data to be compressed
 949 |     int h = stbiw__zhash(data + i) & (stbiw__ZHASH - 1), best = 3;
 950 |     unsigned char* bestloc = 0;
 951 |     unsigned char** hlist = hash_table[h];
 952 |     int n = stbiw__sbcount(hlist);
           // keep the longest match among the bucket's candidates; `>=` lets a later candidate
           // win a tie
 953 |     for (j = 0; j < n; ++j) {
 954 |       if (hlist[j] - data > i - 32768) { // if entry lies within window
 955 |         int d = stbiw__zlib_countm(hlist[j], data + i, data_len - i);
 956 |         if (d >= best) {
 957 |           best = d;
 958 |           bestloc = hlist[j];
 959 |         }
 960 |       }
 961 |     }
 962 |     // when hash table entry is too long, delete half the entries
 963 |     if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2 * quality) {
 964 |       STBIW_MEMMOVE(hash_table[h], hash_table[h] + quality, sizeof(hash_table[h][0]) * quality);
 965 |       stbiw__sbn(hash_table[h]) = quality;
 966 |     }
 967 |     stbiw__sbpush(hash_table[h], data + i);
 968 | 
 969 |     if (bestloc) {
 970 |       // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal
 971 |       h = stbiw__zhash(data + i + 1) & (stbiw__ZHASH - 1);
 972 |       hlist = hash_table[h];
 973 |       n = stbiw__sbcount(hlist);
 974 |       for (j = 0; j < n; ++j) {
 975 |         if (hlist[j] - data > i - 32767) {
 976 |           int e = stbiw__zlib_countm(hlist[j], data + i + 1, data_len - i - 1);
 977 |           if (e > best) { // if next match is better, bail on current match
 978 |             bestloc = NULL;
 979 |             break;
 980 |           }
 981 |         }
 982 |       }
 983 |     }
 984 | 
 985 |     if (bestloc) {
 986 |       int d = (int)(data + i - bestloc); // distance back
 987 |       STBIW_ASSERT(d <= 32767 && best <= 258);
             // find the length code whose range contains `best`, emit it, then its extra bits
 988 |       for (j = 0; best > lengthc[j + 1] - 1; ++j)
 989 |         ;
 990 |       stbiw__zlib_huff(j + 257);
 991 |       if (lengtheb[j])
 992 |         stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
             // same for the distance: a 5-bit code written bit-reversed, then its extra bits
 993 |       for (j = 0; d > distc[j + 1] - 1; ++j)
 994 |         ;
 995 |       stbiw__zlib_add(stbiw__zlib_bitrev(j, 5), 5);
 996 |       if (disteb[j])
 997 |         stbiw__zlib_add(d - distc[j], disteb[j]);
 998 |       i += best;
 999 |     } else {
1000 |       stbiw__zlib_huffb(data[i]);
1001 |       ++i;
1002 |     }
1003 |   }
1004 |   // write out final bytes
1005 |   for (; i < data_len; ++i)
1006 |     stbiw__zlib_huffb(data[i]);
1007 |   stbiw__zlib_huff(256); // end of block
1008 |   // pad with 0 bits to byte boundary
1009 |   while (bitcount)
1010 |     stbiw__zlib_add(0, 1);
1011 | 
1012 |   for (i = 0; i < stbiw__ZHASH; ++i)
1013 |     (void)stbiw__sbfree(hash_table[i]);
1014 |   STBIW_FREE(hash_table);
1015 | 
1016 |   // store uncompressed instead if compression was worse
         // (threshold: 2 header bytes + the data + 5 bytes of overhead per 32767-byte stored block)
1017 |   if (stbiw__sbn(out) > data_len + 2 + ((data_len + 32766) / 32767) * 5) {
1018 |     stbiw__sbn(out) = 2; // truncate to DEFLATE 32K window and FLEVEL = 1
1019 |     for (j = 0; j < data_len;) {
1020 |       int blocklen = data_len - j;
1021 |       if (blocklen > 32767)
1022 |         blocklen = 32767;
1023 |       stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression
1024 |       stbiw__sbpush(out, STBIW_UCHAR(blocklen));    // LEN
1025 |       stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8));
1026 |       stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN
1027 |       stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8));
             // the payload is copied in bulk and the element count adjusted by hand, bypassing
             // stbiw__sbpush (and therefore its growth check)
1028 |       memcpy(out + stbiw__sbn(out), data + j, blocklen);
1029 |       stbiw__sbn(out) += blocklen;
1030 |       j += blocklen;
1031 |     }
1032 |   }
1033 | 
1034 |   {
1035 |     // compute adler32 on input
           // The first chunk is data_len % 5552 bytes and every later chunk is 5552 bytes, with
           // both sums reduced modulo 65521 after each chunk.
1036 |     unsigned int s1 = 1, s2 = 0;
1037 |     int blocklen = (int)(data_len % 5552);
1038 |     j = 0;
1039 |     while (j < data_len) {
1040 |       for (i = 0; i < blocklen; ++i) {
1041 |         s1 += data[j + i];
1042 |         s2 += s1;
1043 |       }
1044 |       s1 %= 65521;
1045 |       s2 %= 65521;
1046 |       j += blocklen;
1047 |       blocklen = 5552;
1048 |     }
           // checksum is appended as four bytes: s2 high, s2 low, s1 high, s1 low
1049 |     stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
1050 |     stbiw__sbpush(out, STBIW_UCHAR(s2));
1051 |     stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
1052 |     stbiw__sbpush(out, STBIW_UCHAR(s1));
1053 |   }
1054 |   *out_len = stbiw__sbn(out);
1055 |   // make returned pointer freeable
         // (the bytes are slid down over the two-int stretchy-buffer header, so the pointer
         // returned is the start of the underlying allocation)
1056 |   STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
1057 |   return (unsigned char*)stbiw__sbraw(out);
1058 | #endif // STBIW_ZLIB_COMPRESS
1059 | }
1060 | 
1061 | static unsigned int stbiw__crc32(unsigned char* buffer, int len) {
         // Returns a 32-bit CRC over buffer[0 .. len). When STBIW_CRC32 is defined the call is
         // forwarded to it; otherwise a table-driven loop processes one input byte per step,
         // starting from all-ones and inverting the result at the end.
1062 | #ifdef STBIW_CRC32
1063 |   return STBIW_CRC32(buffer, len);
1064 | #else
         // 256-entry lookup table indexed by (input byte XOR low byte of the running CRC)
1065 |   static unsigned int crc_table[256] = {
1066 |       0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535,
1067 |       0x9E6495A3, 0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD,
1068 |       0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D,
1069 |       0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC,
1070 |       0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4,
1071 |       0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C,
1072 |       0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59, 0x26D930AC,
1073 |       0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
1074 |       0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB,
1075 |       0xB6662D3D, 0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F,
1076 |       0x9FBFE4A5, 0xE8B8D433, 0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB,
1077 |       0x086D3D2D, 0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
1078 |       0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA,
1079 |       0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65, 0x4DB26158, 0x3AB551CE,
1080 |       0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A,
1081 |       0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
1082 |       0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409,
1083 |       0xCE61E49F, 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81,
1084 |       0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739,
1085 |       0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8,
1086 |       0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1, 0xF00F9344, 0x8708A3D2, 0x1E01F268,
1087 |       0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0,
1088 |       0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8,
1089 |       0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
1090 |       0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF,
1091 |       0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703,
1092 |       0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7,
1093 |       0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D, 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A,
1094 |       0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE,
1095 |       0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242,
1096 |       0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777, 0x88085AE6,
1097 |       0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
1098 |       0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D,
1099 |       0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5,
1100 |       0x47B2CF7F, 0x30B5FFE9, 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605,
1101 |       0xCDD70693, 0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
1102 |       0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D};
1103 | 
1104 |   unsigned int crc = ~0u;
1105 |   int i;
1106 |   for (i = 0; i < len; ++i)
1107 |     crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
1108 |   return ~crc;
1109 | #endif
1110 | }
1111 | 
1112 | #define stbiw__wpng4(o, a, b, c, d)                                                                \
1113 |   ((o)[0] = STBIW_UCHAR(a), (o)[1] = STBIW_UCHAR(b), (o)[2] = STBIW_UCHAR(c),                      \
1114 |    (o)[3] = STBIW_UCHAR(d), (o) += 4)
       // stbiw__wpng4 stores four bytes at `o` and advances `o` by 4, so `o` must be a modifiable
       // pointer lvalue. stbiw__wp32 writes a 32-bit value most-significant byte first;
       // stbiw__wptag writes the first four characters of `s`.
       // stbiw__wp32 is a plain expression like its siblings (no semicolon baked into the macro),
       // so the caller supplies the terminating `;`.
1115 | #define stbiw__wp32(data, v) stbiw__wpng4(data, (v) >> 24, (v) >> 16, (v) >> 8, (v))
1116 | #define stbiw__wptag(data, s) stbiw__wpng4(data, s[0], s[1], s[2], s[3])
1117 | 
1118 | static void stbiw__wpcrc(unsigned char** data, int len) {
         // Appends the CRC of the last `len + 4` bytes written before *data (a 4-byte tag plus
         // `len` payload bytes) and advances *data past the four CRC bytes.
1119 |   unsigned char* chunk_start = *data - len - 4;
1120 |   unsigned int checksum = stbiw__crc32(chunk_start, len + 4);
1121 |   stbiw__wp32(*data, checksum);
         }
1122 | 
1123 | static unsigned char stbiw__paeth(int a, int b, int c) {
         // Returns whichever of a, b, c is closest to a + b - c, preferring a, then b, on ties.
1124 |   int estimate = a + b - c;
1125 |   int dist_a = abs(estimate - a), dist_b = abs(estimate - b), dist_c = abs(estimate - c);
1126 |   int pick = c;
1127 |   if (dist_a <= dist_b && dist_a <= dist_c)
1128 |     pick = a;
1129 |   else if (dist_b <= dist_c)
1130 |     pick = b;
         return STBIW_UCHAR(pick);
         }
1131 | 
1132 | // @OPTIMIZE: provide an option that always forces left-predict or paeth predict
1133 | static void stbiw__encode_png_line(unsigned char* pixels, int stride_bytes, int width, int height,
1134 |                                    int y, int n, int filter_type, signed char* line_buffer) {
         // Applies one PNG row filter to row `y` and stores the filtered bytes in line_buffer.
         //   pixels, stride_bytes - image base pointer and distance between rows in bytes
         //   width, height        - image size in pixels
         //   n                    - bytes per pixel
         //   filter_type          - 0..4; on row 0 it is remapped so that no variant reads a row
         //                          above (2 -> 0, 3 -> 5, 4 -> 6)
         // Rows are taken bottom-up when stbi__flip_vertically_on_write is set. For every filter
         // except 0 the first pixel (bytes 0 .. n-1) has no left neighbour and is handled by its
         // own loop.
1135 |   static int later_rows[] = {0, 1, 2, 3, 4};
1136 |   static int first_row[] = {0, 1, 0, 5, 6};
1137 |   const int flipped = stbi__flip_vertically_on_write;
1138 |   const int mode = (y != 0 ? later_rows : first_row)[filter_type];
1139 |   const int row_bytes = width * n;
1140 |   // cur[i - row_step] is the byte directly above cur[i] in output order
1141 |   const int row_step = flipped ? -stride_bytes : stride_bytes;
1142 |   unsigned char* cur = pixels + stride_bytes * (flipped ? height - 1 - y : y);
1143 |   int i;
1144 | 
1145 |   switch (mode) {
1146 |   case 0: // none: copy the row as-is
1147 |     memcpy(line_buffer, cur, row_bytes);
1148 |     break;
1149 |   case 1: // difference from the byte one pixel to the left
1150 |     for (i = 0; i < n; ++i)
1151 |       line_buffer[i] = cur[i];
1152 |     for (i = n; i < row_bytes; ++i)
1153 |       line_buffer[i] = cur[i] - cur[i - n];
1154 |     break;
1155 |   case 2: // difference from the byte above
1156 |     for (i = 0; i < n; ++i)
1157 |       line_buffer[i] = cur[i] - cur[i - row_step];
1158 |     for (i = n; i < row_bytes; ++i)
1159 |       line_buffer[i] = cur[i] - cur[i - row_step];
1160 |     break;
1161 |   case 3: // difference from the halved sum of left and above
1162 |     for (i = 0; i < n; ++i)
1163 |       line_buffer[i] = cur[i] - (cur[i - row_step] >> 1);
1164 |     for (i = n; i < row_bytes; ++i)
1165 |       line_buffer[i] = cur[i] - ((cur[i - n] + cur[i - row_step]) >> 1);
1166 |     break;
1167 |   case 4: // difference from the paeth choice of left, above, above-left
1168 |     for (i = 0; i < n; ++i)
1169 |       line_buffer[i] = (signed char)(cur[i] - stbiw__paeth(0, cur[i - row_step], 0));
1170 |     for (i = n; i < row_bytes; ++i)
1171 |       line_buffer[i] =
1172 |           cur[i] - stbiw__paeth(cur[i - n], cur[i - row_step], cur[i - row_step - n]);
1173 |     break;
1174 |   case 5: // first-row form of filter 3: only the left neighbour contributes
1175 |     for (i = 0; i < n; ++i)
1176 |       line_buffer[i] = cur[i];
1177 |     for (i = n; i < row_bytes; ++i)
1178 |       line_buffer[i] = cur[i] - (cur[i - n] >> 1);
1179 |     break;
1180 |   case 6: // first-row form of filter 4: only the left neighbour contributes
1181 |     for (i = 0; i < n; ++i)
1182 |       line_buffer[i] = cur[i];
1183 |     for (i = n; i < row_bytes; ++i)
1184 |       line_buffer[i] = cur[i] - stbiw__paeth(cur[i - n], 0, 0);
1185 |     break;
1186 |   }
1187 | }
1199 | 
1200 | STBIWDEF unsigned char* stbi_write_png_to_mem(const unsigned char* pixels, int stride_bytes, int x,
1201 |                                               int y, int n, int* out_len) {
         // Encodes an 8-bit image as a complete PNG file held in memory.
         //   pixels       - source image, `n` bytes per pixel
         //   stride_bytes - distance between rows in bytes; 0 means tightly packed (x * n)
         //   x, y         - image width and height in pixels
         //   n            - bytes per pixel, 1..4
         //   out_len      - receives the size of the returned buffer
         // Returns a buffer obtained from STBIW_MALLOC that the caller releases, or 0 when `n`
         // is outside 1..4 or any allocation fails. Each row is filtered with
         // stbi_write_force_png_filter when that is 0..4, otherwise with whichever of the five
         // filters gives the smallest sum of absolute filtered values.
1202 |   int force_filter = stbi_write_force_png_filter;
1203 |   int ctype[5] = {-1, 0, 4, 2, 6};
1204 |   unsigned char sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
1205 |   unsigned char *out, *o, *filt, *zlib;
1206 |   signed char* line_buffer;
1207 |   int j, zlen;
1208 | 
         // `n` indexes ctype[] when the header is written; other values have no colour type
         if (n < 1 || n > 4)
           return 0;
       
1209 |   if (stride_bytes == 0)
1210 |     stride_bytes = x * n;
1211 | 
1212 |   if (force_filter >= 5) {
1213 |     force_filter = -1;
1214 |   }
1215 | 
         // filt holds every filtered row, each prefixed by one filter-type byte
1216 |   filt = (unsigned char*)STBIW_MALLOC((x * n + 1) * y);
1217 |   if (!filt)
1218 |     return 0;
1219 |   line_buffer = (signed char*)STBIW_MALLOC(x * n);
1220 |   if (!line_buffer) {
1221 |     STBIW_FREE(filt);
1222 |     return 0;
1223 |   }
1224 |   for (j = 0; j < y; ++j) {
1225 |     int filter_type;
1226 |     if (force_filter > -1) {
1227 |       filter_type = force_filter;
1228 |       stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, force_filter,
1229 |                              line_buffer);
1230 |     } else { // Estimate the best filter by running through all of them:
1231 |       int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
1232 |       for (filter_type = 0; filter_type < 5; filter_type++) {
1233 |         stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, filter_type,
1234 |                                line_buffer);
1235 | 
1236 |         // Estimate the entropy of the line using this filter; the less, the better.
1237 |         est = 0;
1238 |         for (i = 0; i < x * n; ++i) {
1239 |           est += abs((signed char)line_buffer[i]);
1240 |         }
1241 |         if (est < best_filter_val) {
1242 |           best_filter_val = est;
1243 |           best_filter = filter_type;
1244 |         }
1245 |       }
             // line_buffer still holds the output of the last filter tried (4), so the row only
             // needs encoding again when a different filter won
1246 |       if (best_filter != 4) {
1247 |         stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, best_filter,
1248 |                                line_buffer);
1249 |       }
1250 |       filter_type = best_filter;
1251 |     }
1252 |     // when we get here, filter_type contains the filter type, and line_buffer contains the data
1253 |     filt[j * (x * n + 1)] = (unsigned char)filter_type;
1254 |     STBIW_MEMMOVE(filt + j * (x * n + 1) + 1, line_buffer, x * n);
1255 |   }
1256 |   STBIW_FREE(line_buffer);
1257 |   zlib = stbi_zlib_compress(filt, y * (x * n + 1), &zlen, stbi_write_png_compression_level);
1258 |   STBIW_FREE(filt);
1259 |   if (!zlib)
1260 |     return 0;
1261 | 
1262 |   // each tag requires 12 bytes of overhead
1263 |   out = (unsigned char*)STBIW_MALLOC(8 + 12 + 13 + 12 + zlen + 12);
1264 |   if (!out) {
1265 |     STBIW_FREE(zlib); // the compressed stream would otherwise leak on this path
1266 |     return 0;
1267 |   }
1268 |   *out_len = 8 + 12 + 13 + 12 + zlen + 12;
1269 | 
1270 |   o = out;
1271 |   STBIW_MEMMOVE(o, sig, 8);
1272 |   o += 8;
1273 |   stbiw__wp32(o, 13); // header length
1274 |   stbiw__wptag(o, "IHDR");
1275 |   stbiw__wp32(o, x);
1276 |   stbiw__wp32(o, y);
1277 |   *o++ = 8; // bit depth
1278 |   *o++ = STBIW_UCHAR(ctype[n]);
1279 |   *o++ = 0;
1280 |   *o++ = 0;
1281 |   *o++ = 0;
1282 |   stbiw__wpcrc(&o, 13);
1283 | 
1284 |   stbiw__wp32(o, zlen);
1285 |   stbiw__wptag(o, "IDAT");
1286 |   STBIW_MEMMOVE(o, zlib, zlen);
1287 |   o += zlen;
1288 |   STBIW_FREE(zlib);
1289 |   stbiw__wpcrc(&o, zlen);
1290 | 
1291 |   stbiw__wp32(o, 0);
1292 |   stbiw__wptag(o, "IEND");
1293 |   stbiw__wpcrc(&o, 0);
1294 | 
1295 |   STBIW_ASSERT(o == out + *out_len);
1296 | 
1297 |   return out;
         }
1298 | 
1299 | #ifndef STBI_WRITE_NO_STDIO
1300 | STBIWDEF int stbi_write_png(char const* filename, int x, int y, int comp, const void* data,
1301 |                             int stride_bytes) {
         // Encodes the image as PNG in memory and writes it to `filename`.
         // Returns 1 only when encoding, opening, writing every byte and closing all succeed;
         // returns 0 otherwise. A failed or short write is reported instead of being ignored.
1302 |   FILE* f;
1303 |   int len;
         int ok;
1304 |   unsigned char* png =
1305 |       stbi_write_png_to_mem((const unsigned char*)data, stride_bytes, x, y, comp, &len);
1306 |   if (png == NULL)
1307 |     return 0;
1308 | 
1309 |   f = stbiw__fopen(filename, "wb");
1310 |   if (!f) {
1311 |     STBIW_FREE(png);
1312 |     return 0;
1313 |   }
1314 |   ok = fwrite(png, 1, len, f) == (size_t)len;
1315 |   if (fclose(f) != 0) // buffered data may only fail when the stream is flushed on close
           ok = 0;
1316 |   STBIW_FREE(png);
1317 |   return ok;
1318 | }
1319 | #endif
1320 | 
1321 | STBIWDEF int stbi_write_png_to_func(stbi_write_func* func, void* context, int x, int y, int comp,
1322 |                                     const void* data, int stride_bytes) {
         // Encodes the image as PNG in memory, passes the whole buffer to `func` in a single
         // call, then releases it. Returns 0 if encoding fails, 1 otherwise.
1323 |   int png_size;
1324 |   unsigned char* encoded =
1325 |       stbi_write_png_to_mem((const unsigned char*)data, stride_bytes, x, y, comp, &png_size);
1326 |   if (!encoded)
1327 |     return 0;
1328 |   func(context, encoded, png_size);
1329 |   STBIW_FREE(encoded);
1330 |   return 1;
1331 | }
1332 | 
1333 | /* ***************************************************************************
1334 |  *
1335 |  * JPEG writer
1336 |  *
1337 |  * This is based on Jon Olick's jo_jpeg.cpp:
1338 |  * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
1339 |  */
1340 | 
1341 | static const unsigned char stbiw__jpg_ZigZag[] = { // entry j: output slot for row-major coefficient j of an 8x8 block
1342 |     0,  1,  5,  6,  14, 15, 27, 28, 2,  4,  7,  13, 16, 26, 29, 42, 3,  8,  12, 17, 25, 30,
1343 |     41, 43, 9,  11, 18, 24, 31, 40, 44, 53, 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38,
1344 |     46, 51, 55, 60, 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63};
1345 | 
1346 | static void stbiw__jpg_writeBits(stbi__write_context* s, int* bitBufP, int* bitCntP,
1347 |                                  const unsigned short* bs) {
         // Appends the bs[1]-bit code bs[0] to the bit accumulator and writes out every complete
         // byte, most-significant bits first. Each 255 byte written is followed by a 0 byte.
         // The leftover bits and their count are stored back through bitBufP / bitCntP.
1348 |   int acc = *bitBufP;
1349 |   int pending = *bitCntP + bs[1];
1350 |   acc |= bs[0] << (24 - pending);
1351 |   for (; pending >= 8; pending -= 8, acc <<= 8) {
1352 |     unsigned char out_byte = (acc >> 16) & 255;
1353 |     stbiw__putc(s, out_byte);
1354 |     if (out_byte == 255)
1355 |       stbiw__putc(s, 0);
1356 |   }
1357 |   *bitBufP = acc;
1358 |   *bitCntP = pending;
1359 | }
1363 | 
1364 | static void stbiw__jpg_DCT(float* d0p, float* d1p, float* d2p, float* d3p, float* d4p, float* d5p,
1365 |                            float* d6p, float* d7p) {
         // One 8-point transform pass, done in place: the eight inputs are read through the
         // pointers up front and the eight results are stored back through the same pointers.
         // Taking pointers rather than an array lets a caller pass elements with any spacing.
         // NOTE(review): the constants and "phase" comments look like a scaled fast DCT
         // factorisation -- confirm against the original jo_jpeg source.
1366 |   float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p;
1367 |   float z1, z2, z3, z4, z5, z11, z13;
1368 | 
         // first stage: sums and differences of mirrored input pairs
1369 |   float tmp0 = d0 + d7;
1370 |   float tmp7 = d0 - d7;
1371 |   float tmp1 = d1 + d6;
1372 |   float tmp6 = d1 - d6;
1373 |   float tmp2 = d2 + d5;
1374 |   float tmp5 = d2 - d5;
1375 |   float tmp3 = d3 + d4;
1376 |   float tmp4 = d3 - d4;
1377 | 
1378 |   // Even part
1379 |   float tmp10 = tmp0 + tmp3; // phase 2
1380 |   float tmp13 = tmp0 - tmp3;
1381 |   float tmp11 = tmp1 + tmp2;
1382 |   float tmp12 = tmp1 - tmp2;
1383 | 
1384 |   d0 = tmp10 + tmp11; // phase 3
1385 |   d4 = tmp10 - tmp11;
1386 | 
1387 |   z1 = (tmp12 + tmp13) * 0.707106781f; // c4
1388 |   d2 = tmp13 + z1;                     // phase 5
1389 |   d6 = tmp13 - z1;
1390 | 
1391 |   // Odd part
         // (tmp10..tmp12 are reused here; their even-part values are no longer needed)
1392 |   tmp10 = tmp4 + tmp5; // phase 2
1393 |   tmp11 = tmp5 + tmp6;
1394 |   tmp12 = tmp6 + tmp7;
1395 | 
1396 |   // The rotator is modified from fig 4-8 to avoid extra negations.
1397 |   z5 = (tmp10 - tmp12) * 0.382683433f; // c6
1398 |   z2 = tmp10 * 0.541196100f + z5;      // c2-c6
1399 |   z4 = tmp12 * 1.306562965f + z5;      // c2+c6
1400 |   z3 = tmp11 * 0.707106781f;           // c4
1401 | 
1402 |   z11 = tmp7 + z3; // phase 5
1403 |   z13 = tmp7 - z3;
1404 | 
1405 |   *d5p = z13 + z2; // phase 6
1406 |   *d3p = z13 - z2;
1407 |   *d1p = z11 + z4;
1408 |   *d7p = z11 - z4;
1409 | 
         // even outputs were computed into locals above; store them last
1410 |   *d0p = d0;
1411 |   *d2p = d2;
1412 |   *d4p = d4;
1413 |   *d6p = d6;
1414 | }
1415 | 
1416 | static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
         // Computes the bit pattern for `val`:
         //   bits[1] - number of bits needed for |val| (at least 1)
         //   bits[0] - the low bits[1] bits of val, or of val - 1 when val is negative
1417 |   int magnitude = val < 0 ? -val : val;
1418 |   int nbits = 1;
1419 |   if (val < 0)
1420 |     val -= 1;
1421 |   for (magnitude >>= 1; magnitude != 0; magnitude >>= 1)
1422 |     ++nbits;
1423 |   bits[1] = nbits;
1424 |   bits[0] = val & ((1 << nbits) - 1);
         }
1425 | 
1426 | static int stbiw__jpg_processDU(stbi__write_context* s, int* bitBuf, int* bitCnt, float* CDU,
1427 |                                 int du_stride, float* fdtbl, int DC,
1428 |                                 const unsigned short HTDC[256][2],
1429 |                                 const unsigned short HTAC[256][2]) {
1430 |   const unsigned short EOB[2] = {HTAC[0x00][0], HTAC[0x00][1]};
1431 |   const unsigned short M16zeroes[2] = {HTAC[0xF0][0], HTAC[0xF0][1]};
1432 |   int dataOff, i, j, n, diff, end0pos, x, y;
1433 |   int DU[64];
1434 | 
1435 |   // DCT rows
1436 |   for (dataOff = 0, n = du_stride * 8; dataOff < n; dataOff += du_stride) {
1437 |     stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 1], &CDU[dataOff + 2], &CDU[dataOff + 3],
1438 |                    &CDU[dataOff + 4], &CDU[dataOff + 5], &CDU[dataOff + 6], &CDU[dataOff + 7]);
1439 |   }
1440 |   // DCT columns
1441 |   for (dataOff = 0; dataOff < 8; ++dataOff) {
1442 |     stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + du_stride], &CDU[dataOff + du_stride * 2],
1443 |                    &CDU[dataOff + du_stride * 3], &CDU[dataOff + du_stride * 4],
1444 |                    &CDU[dataOff + du_stride * 5], &CDU[dataOff + du_stride * 6],
1445 |                    &CDU[dataOff + du_stride * 7]);
1446 |   }
1447 |   // Quantize/descale/zigzag the coefficients
1448 |   for (y = 0, j = 0; y < 8; ++y) {
1449 |     for (x = 0; x < 8; ++x, ++j) {
1450 |       float v;
1451 |       i = y * du_stride + x;
1452 |       v = CDU[i] * fdtbl[j];
1453 |       // DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
1454 |       // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
1455 |       DU[stbiw__jpg_ZigZag[j]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
1456 |     }
1457 |   }
1458 | 
1459 |   // Encode DC
1460 |   diff = DU[0] - DC;
1461 |   if (diff == 0) {
1462 |     stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
1463 |   } else {
1464 |     unsigned short bits[2];
1465 |     stbiw__jpg_calcBits(diff, bits);
1466 |     stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);
1467 |     stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
1468 |   }
1469 |   // Encode ACs
1470 |   end0pos = 63;
1471 |   for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos) {
1472 |   }
1473 |   // end0pos = first element in reverse order !=0
1474 |   if (end0pos == 0) {
1475 |     stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
1476 |     return DU[0];
1477 |   }
1478 |   for (i = 1; i <= end0pos; ++i) {
1479 |     int startpos = i;
1480 |     int nrzeroes;
1481 |     unsigned short bits[2];
1482 |     for (; DU[i] == 0 && i <= end0pos; ++i) {
1483 |     }
1484 |     nrzeroes = i - startpos;
1485 |     if (nrzeroes >= 16) {
1486 |       int lng = nrzeroes >> 4;
1487 |       int nrmarker;
1488 |       for (nrmarker = 1; nrmarker <= lng; ++nrmarker)
1489 |         stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
1490 |       nrzeroes &= 15;
1491 |     }
1492 |     stbiw__jpg_calcBits(DU[i], bits);
1493 |     stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes << 4) + bits[1]]);
1494 |     stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
1495 |   }
1496 |   if (end0pos != 63) {
1497 |     stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
1498 |   }
1499 |   return DU[0];
1500 | }
1501 | 
1502 | static int stbi_write_jpg_core(stbi__write_context* s, int width, int height, int comp,
1503 |                                const void* data, int quality) {
1504 |   // Constants that don't pollute global namespace
1505 |   static const unsigned char std_dc_luminance_nrcodes[] = {0, 0, 1, 5, 1, 1, 1, 1, 1,
1506 |                                                            1, 0, 0, 0, 0, 0, 0, 0};
1507 |   static const unsigned char std_dc_luminance_values[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
1508 |   static const unsigned char std_ac_luminance_nrcodes[] = {0, 0, 2, 1, 3, 3, 2, 4,   3,
1509 |                                                            5, 5, 4, 4, 0, 0, 1, 0x7d};
1510 |   static const unsigned char std_ac_luminance_values[] = {
1511 |       0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61,
1512 |       0x07, 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08, 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52,
1513 |       0xd1, 0xf0, 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x25,
1514 |       0x26, 0x27, 0x28, 0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45,
1515 |       0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x63, 0x64,
1516 |       0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x83,
1517 |       0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
1518 |       0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
1519 |       0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3,
1520 |       0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8,
1521 |       0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa};
1522 |   static const unsigned char std_dc_chrominance_nrcodes[] = {0, 0, 3, 1, 1, 1, 1, 1, 1,
1523 |                                                              1, 1, 1, 0, 0, 0, 0, 0};
1524 |   static const unsigned char std_dc_chrominance_values[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
1525 |   static const unsigned char std_ac_chrominance_nrcodes[] = {0, 0, 2, 1, 2, 4, 4, 3,   4,
1526 |                                                              7, 5, 4, 4, 0, 1, 2, 0x77};
1527 |   static const unsigned char std_ac_chrominance_values[] = {
1528 |       0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, 0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61,
1529 |       0x71, 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91, 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33,
1530 |       0x52, 0xf0, 0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34, 0xe1, 0x25, 0xf1, 0x17, 0x18,
1531 |       0x19, 0x1a, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44,
1532 |       0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x63,
1533 |       0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a,
1534 |       0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
1535 |       0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
1536 |       0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca,
1537 |       0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
1538 |       0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa};
1539 |   // Huffman tables
1540 |   static const unsigned short YDC_HT[256][2] = {{0, 2},  {2, 3},   {3, 3},   {4, 3},
1541 |                                                 {5, 3},  {6, 3},   {14, 4},  {30, 5},
1542 |                                                 {62, 6}, {126, 7}, {254, 8}, {510, 9}};
1543 |   static const unsigned short UVDC_HT[256][2] = {{0, 2},   {1, 2},   {2, 2},     {6, 3},
1544 |                                                  {14, 4},  {30, 5},  {62, 6},    {126, 7},
1545 |                                                  {254, 8}, {510, 9}, {1022, 10}, {2046, 11}};
1546 |   static const unsigned short YAC_HT[256][2] = {
1547 |       {10, 4},     {0, 2},      {1, 2},      {4, 3},      {11, 4},     {26, 5},     {120, 7},
1548 |       {248, 8},    {1014, 10},  {65410, 16}, {65411, 16}, {0, 0},      {0, 0},      {0, 0},
1549 |       {0, 0},      {0, 0},      {0, 0},      {12, 4},     {27, 5},     {121, 7},    {502, 9},
1550 |       {2038, 11},  {65412, 16}, {65413, 16}, {65414, 16}, {65415, 16}, {65416, 16}, {0, 0},
1551 |       {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {28, 5},     {249, 8},
1552 |       {1015, 10},  {4084, 12},  {65417, 16}, {65418, 16}, {65419, 16}, {65420, 16}, {65421, 16},
1553 |       {65422, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
1554 |       {58, 6},     {503, 9},    {4085, 12},  {65423, 16}, {65424, 16}, {65425, 16}, {65426, 16},
1555 |       {65427, 16}, {65428, 16}, {65429, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
1556 |       {0, 0},      {0, 0},      {59, 6},     {1016, 10},  {65430, 16}, {65431, 16}, {65432, 16},
1557 |       {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {0, 0},      {0, 0},
1558 |       {0, 0},      {0, 0},      {0, 0},      {0, 0},      {122, 7},    {2039, 11},  {65438, 16},
1559 |       {65439, 16}, {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16},
1560 |       {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {123, 7},
1561 |       {4086, 12},  {65446, 16}, {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16},
1562 |       {65452, 16}, {65453, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
1563 |       {0, 0},      {250, 8},    {4087, 12},  {65454, 16}, {65455, 16}, {65456, 16}, {65457, 16},
1564 |       {65458, 16}, {65459, 16}, {65460, 16}, {65461, 16}, {0, 0},      {0, 0},      {0, 0},
1565 |       {0, 0},      {0, 0},      {0, 0},      {504, 9},    {32704, 15}, {65462, 16}, {65463, 16},
1566 |       {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {0, 0},
1567 |       {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {505, 9},    {65470, 16},
1568 |       {65471, 16}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16},
1569 |       {65478, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
1570 |       {506, 9},    {65479, 16}, {65480, 16}, {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16},
1571 |       {65485, 16}, {65486, 16}, {65487, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
1572 |       {0, 0},      {0, 0},      {1017, 10},  {65488, 16}, {65489, 16}, {65490, 16}, {65491, 16},
1573 |       {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16}, {0, 0},      {0, 0},
1574 |       {0, 0},      {0, 0},      {0, 0},      {0, 0},      {1018, 10},  {65497, 16}, {65498, 16},
1575 |       {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16},
1576 |       {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {2040, 11},
1577 |       {65506, 16}, {65507, 16}, {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16},
1578 |       {65513, 16}, {65514, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
1579 |       {0, 0},      {65515, 16}, {65516, 16}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16},
1580 |       {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {0, 0},      {0, 0},      {0, 0},
1581 |       {0, 0},      {0, 0},      {2041, 11},  {65525, 16}, {65526, 16}, {65527, 16}, {65528, 16},
1582 |       {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0},
1583 |       {0, 0},      {0, 0},      {0, 0},      {0, 0}};
1584 |   static const unsigned short UVAC_HT[256][2] = {
1585 |       {0, 2},      {1, 2},      {4, 3},      {10, 4},     {24, 5},     {25, 5},     {56, 6},
1586 |       {120, 7},    {500, 9},    {1014, 10},  {4084, 12},  {0, 0},      {0, 0},      {0, 0},
1587 |       {0, 0},      {0, 0},      {0, 0},      {11, 4},     {57, 6},     {246, 8},    {501, 9},
1588 |       {2038, 11},  {4085, 12},  {65416, 16}, {65417, 16}, {65418, 16}, {65419, 16}, {0, 0},
1589 |       {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {26, 5},     {247, 8},
1590 |       {1015, 10},  {4086, 12},  {32706, 15}, {65420, 16}, {65421, 16}, {65422, 16}, {65423, 16},
1591 |       {65424, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
1592 |       {27, 5},     {248, 8},    {1016, 10},  {4087, 12},  {65425, 16}, {65426, 16}, {65427, 16},
1593 |       {65428, 16}, {65429, 16}, {65430, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
1594 |       {0, 0},      {0, 0},      {58, 6},     {502, 9},    {65431, 16}, {65432, 16}, {65433, 16},
1595 |       {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {65438, 16}, {0, 0},      {0, 0},
1596 |       {0, 0},      {0, 0},      {0, 0},      {0, 0},      {59, 6},     {1017, 10},  {65439, 16},
1597 |       {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, {65446, 16},
1598 |       {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {121, 7},
1599 |       {2039, 11},  {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16},
1600 |       {65453, 16}, {65454, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
1601 |       {0, 0},      {122, 7},    {2040, 11},  {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16},
1602 |       {65459, 16}, {65460, 16}, {65461, 16}, {65462, 16}, {0, 0},      {0, 0},      {0, 0},
1603 |       {0, 0},      {0, 0},      {0, 0},      {249, 8},    {65463, 16}, {65464, 16}, {65465, 16},
1604 |       {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {65470, 16}, {65471, 16}, {0, 0},
1605 |       {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {503, 9},    {65472, 16},
1606 |       {65473, 16}, {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, {65479, 16},
1607 |       {65480, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
1608 |       {504, 9},    {65481, 16}, {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16},
1609 |       {65487, 16}, {65488, 16}, {65489, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
1610 |       {0, 0},      {0, 0},      {505, 9},    {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16},
1611 |       {65494, 16}, {65495, 16}, {65496, 16}, {65497, 16}, {65498, 16}, {0, 0},      {0, 0},
1612 |       {0, 0},      {0, 0},      {0, 0},      {0, 0},      {506, 9},    {65499, 16}, {65500, 16},
1613 |       {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, {65505, 16}, {65506, 16}, {65507, 16},
1614 |       {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {2041, 11},
1615 |       {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16},
1616 |       {65515, 16}, {65516, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
1617 |       {0, 0},      {16352, 14}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16}, {65521, 16},
1618 |       {65522, 16}, {65523, 16}, {65524, 16}, {65525, 16}, {0, 0},      {0, 0},      {0, 0},
1619 |       {0, 0},      {0, 0},      {1018, 10},  {32707, 15}, {65526, 16}, {65527, 16}, {65528, 16},
1620 |       {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, {65534, 16}, {0, 0},
1621 |       {0, 0},      {0, 0},      {0, 0},      {0, 0}};
1622 |   static const int YQT[] = {16, 11, 10, 16, 24,  40,  51,  61,  12, 12, 14, 19, 26,  58,  60,  55,
1623 |                             14, 13, 16, 24, 40,  57,  69,  56,  14, 17, 22, 29, 51,  87,  80,  62,
1624 |                             18, 22, 37, 56, 68,  109, 103, 77,  24, 35, 55, 64, 81,  104, 113, 92,
1625 |                             49, 64, 78, 87, 103, 121, 120, 101, 72, 92, 95, 98, 112, 100, 103, 99};
1626 |   static const int UVQT[] = {17, 18, 24, 47, 99, 99, 99, 99, 18, 21, 26, 66, 99, 99, 99, 99,
1627 |                              24, 26, 56, 99, 99, 99, 99, 99, 47, 66, 99, 99, 99, 99, 99, 99,
1628 |                              99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
1629 |                              99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99};
1630 |   static const float aasf[] = {1.0f * 2.828427125f,         1.387039845f * 2.828427125f,
1631 |                                1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f,
1632 |                                1.0f * 2.828427125f,         0.785694958f * 2.828427125f,
1633 |                                0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f};
1634 | 
1635 |   int row, col, i, k, subsample;
1636 |   float fdtbl_Y[64], fdtbl_UV[64];
1637 |   unsigned char YTable[64], UVTable[64];
1638 | 
1639 |   if (!data || !width || !height || comp > 4 || comp < 1) {
1640 |     return 0;
1641 |   }
1642 | 
1643 |   quality = quality ? quality : 90;
1644 |   subsample = quality <= 90 ? 1 : 0;
1645 |   quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
1646 |   quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
1647 | 
1648 |   for (i = 0; i < 64; ++i) {
1649 |     int uvti, yti = (YQT[i] * quality + 50) / 100;
1650 |     YTable[stbiw__jpg_ZigZag[i]] = (unsigned char)(yti < 1 ? 1 : yti > 255 ? 255 : yti);
1651 |     uvti = (UVQT[i] * quality + 50) / 100;
1652 |     UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char)(uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
1653 |   }
1654 | 
1655 |   for (row = 0, k = 0; row < 8; ++row) {
1656 |     for (col = 0; col < 8; ++col, ++k) {
1657 |       fdtbl_Y[k] = 1 / (YTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
1658 |       fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
1659 |     }
1660 |   }
1661 | 
1662 |   // Write Headers
1663 |   {
1664 |     static const unsigned char head0[] = {0xFF, 0xD8, 0xFF, 0xE0, 0, 0x10, 'J', 'F', 'I',
1665 |                                           'F',  0,    1,    1,    0, 0,    1,   0,   1,
1666 |                                           0,    0,    0xFF, 0xDB, 0, 0x84, 0};
1667 |     static const unsigned char head2[] = {0xFF, 0xDA, 0, 0xC,  3, 1,    0,
1668 |                                           2,    0x11, 3, 0x11, 0, 0x3F, 0};
1669 |     const unsigned char head1[] = {0xFF,
1670 |                                    0xC0,
1671 |                                    0,
1672 |                                    0x11,
1673 |                                    8,
1674 |                                    (unsigned char)(height >> 8),
1675 |                                    STBIW_UCHAR(height),
1676 |                                    (unsigned char)(width >> 8),
1677 |                                    STBIW_UCHAR(width),
1678 |                                    3,
1679 |                                    1,
1680 |                                    (unsigned char)(subsample ? 0x22 : 0x11),
1681 |                                    0,
1682 |                                    2,
1683 |                                    0x11,
1684 |                                    1,
1685 |                                    3,
1686 |                                    0x11,
1687 |                                    1,
1688 |                                    0xFF,
1689 |                                    0xC4,
1690 |                                    0x01,
1691 |                                    0xA2,
1692 |                                    0};
1693 |     s->func(s->context, (void*)head0, sizeof(head0));
1694 |     s->func(s->context, (void*)YTable, sizeof(YTable));
1695 |     stbiw__putc(s, 1);
1696 |     s->func(s->context, UVTable, sizeof(UVTable));
1697 |     s->func(s->context, (void*)head1, sizeof(head1));
1698 |     s->func(s->context, (void*)(std_dc_luminance_nrcodes + 1),
1699 |             sizeof(std_dc_luminance_nrcodes) - 1);
1700 |     s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
1701 |     stbiw__putc(s, 0x10); // HTYACinfo
1702 |     s->func(s->context, (void*)(std_ac_luminance_nrcodes + 1),
1703 |             sizeof(std_ac_luminance_nrcodes) - 1);
1704 |     s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
1705 |     stbiw__putc(s, 1); // HTUDCinfo
1706 |     s->func(s->context, (void*)(std_dc_chrominance_nrcodes + 1),
1707 |             sizeof(std_dc_chrominance_nrcodes) - 1);
1708 |     s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
1709 |     stbiw__putc(s, 0x11); // HTUACinfo
1710 |     s->func(s->context, (void*)(std_ac_chrominance_nrcodes + 1),
1711 |             sizeof(std_ac_chrominance_nrcodes) - 1);
1712 |     s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
1713 |     s->func(s->context, (void*)head2, sizeof(head2));
1714 |   }
1715 | 
1716 |   // Encode 8x8 macroblocks
1717 |   {
1718 |     static const unsigned short fillBits[] = {0x7F, 7};
1719 |     int DCY = 0, DCU = 0, DCV = 0;
1720 |     int bitBuf = 0, bitCnt = 0;
1721 |     // comp == 2 is grey+alpha (alpha is ignored)
1722 |     int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
1723 |     const unsigned char* dataR = (const unsigned char*)data;
1724 |     const unsigned char* dataG = dataR + ofsG;
1725 |     const unsigned char* dataB = dataR + ofsB;
1726 |     int x, y, pos;
1727 |     if (subsample) {
1728 |       for (y = 0; y < height; y += 16) {
1729 |         for (x = 0; x < width; x += 16) {
1730 |           float Y[256], U[256], V[256];
1731 |           for (row = y, pos = 0; row < y + 16; ++row) {
1732 |             // row >= height => use last input row
1733 |             int clamped_row = (row < height) ? row : height - 1;
1734 |             int base_p =
1735 |                 (stbi__flip_vertically_on_write ? (height - 1 - clamped_row) : clamped_row) *
1736 |                 width * comp;
1737 |             for (col = x; col < x + 16; ++col, ++pos) {
1738 |               // if col >= width => use pixel from last input column
1739 |               int p = base_p + ((col < width) ? col : (width - 1)) * comp;
1740 |               float r = dataR[p], g = dataG[p], b = dataB[p];
1741 |               Y[pos] = +0.29900f * r + 0.58700f * g + 0.11400f * b - 128;
1742 |               U[pos] = -0.16874f * r - 0.33126f * g + 0.50000f * b;
1743 |               V[pos] = +0.50000f * r - 0.41869f * g - 0.08131f * b;
1744 |             }
1745 |           }
1746 |           DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y + 0, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
1747 |           DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y + 8, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
1748 |           DCY =
1749 |               stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y + 128, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
1750 |           DCY =
1751 |               stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y + 136, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT);
1752 | 
1753 |           // subsample U,V
1754 |           {
1755 |             float subU[64], subV[64];
1756 |             int yy, xx;
1757 |             for (yy = 0, pos = 0; yy < 8; ++yy) {
1758 |               for (xx = 0; xx < 8; ++xx, ++pos) {
1759 |                 int j = yy * 32 + xx * 2;
1760 |                 subU[pos] = (U[j + 0] + U[j + 1] + U[j + 16] + U[j + 17]) * 0.25f;
1761 |                 subV[pos] = (V[j + 0] + V[j + 1] + V[j + 16] + V[j + 17]) * 0.25f;
1762 |               }
1763 |             }
1764 |             DCU =
1765 |                 stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subU, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
1766 |             DCV =
1767 |                 stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subV, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
1768 |           }
1769 |         }
1770 |       }
1771 |     } else {
1772 |       for (y = 0; y < height; y += 8) {
1773 |         for (x = 0; x < width; x += 8) {
1774 |           float Y[64], U[64], V[64];
1775 |           for (row = y, pos = 0; row < y + 8; ++row) {
1776 |             // row >= height => use last input row
1777 |             int clamped_row = (row < height) ? row : height - 1;
1778 |             int base_p =
1779 |                 (stbi__flip_vertically_on_write ? (height - 1 - clamped_row) : clamped_row) *
1780 |                 width * comp;
1781 |             for (col = x; col < x + 8; ++col, ++pos) {
1782 |               // if col >= width => use pixel from last input column
1783 |               int p = base_p + ((col < width) ? col : (width - 1)) * comp;
1784 |               float r = dataR[p], g = dataG[p], b = dataB[p];
1785 |               Y[pos] = +0.29900f * r + 0.58700f * g + 0.11400f * b - 128;
1786 |               U[pos] = -0.16874f * r - 0.33126f * g + 0.50000f * b;
1787 |               V[pos] = +0.50000f * r - 0.41869f * g - 0.08131f * b;
1788 |             }
1789 |           }
1790 | 
1791 |           DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y, 8, fdtbl_Y, DCY, YDC_HT, YAC_HT);
1792 |           DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, U, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
1793 |           DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, V, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
1794 |         }
1795 |       }
1796 |     }
1797 | 
1798 |     // Do the bit alignment of the EOI marker
1799 |     stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
1800 |   }
1801 | 
1802 |   // EOI
1803 |   stbiw__putc(s, 0xFF);
1804 |   stbiw__putc(s, 0xD9);
1805 | 
1806 |   return 1;
1807 | }
1808 | 
1809 | STBIWDEF int stbi_write_jpg_to_func(stbi_write_func* func, void* context, int x, int y, int comp,
1810 |                                     const void* data, int quality) {
1811 |   stbi__write_context s = {0};
1812 |   stbi__start_write_callbacks(&s, func, context);
1813 |   return stbi_write_jpg_core(&s, x, y, comp, (void*)data, quality);
1814 | }
1815 | 
1816 | #ifndef STBI_WRITE_NO_STDIO
1817 | STBIWDEF int stbi_write_jpg(char const* filename, int x, int y, int comp, const void* data,
1818 |                             int quality) {
1819 |   stbi__write_context s = {0};
1820 |   if (stbi__start_write_file(&s, filename)) {
1821 |     int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
1822 |     stbi__end_write_file(&s);
1823 |     return r;
1824 |   } else
1825 |     return 0;
1826 | }
1827 | #endif
1828 | 
1829 | #endif // STB_IMAGE_WRITE_IMPLEMENTATION
1830 | 
1831 | /* Revision history
1832 |       1.16  (2021-07-11)
1833 |              make Deflate code emit uncompressed blocks when it would otherwise expand
1834 |              support writing BMPs with alpha channel
1835 |       1.15  (2020-07-13) unknown
1836 |       1.14  (2020-02-02) updated JPEG writer to downsample chroma channels
1837 |       1.13
1838 |       1.12
1839 |       1.11  (2019-08-11)
1840 | 
1841 |       1.10  (2019-02-07)
1842 |              support utf8 filenames in Windows; fix warnings and platform ifdefs
1843 |       1.09  (2018-02-11)
1844 |              fix typo in zlib quality API, improve STB_I_W_STATIC in C++
1845 |       1.08  (2018-01-29)
1846 |              add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter
1847 |       1.07  (2017-07-24)
1848 |              doc fix
1849 |       1.06 (2017-07-23)
1850 |              writing JPEG (using Jon Olick's code)
1851 |       1.05   ???
1852 |       1.04 (2017-03-03)
1853 |              monochrome BMP expansion
1854 |       1.03   ???
1855 |       1.02 (2016-04-02)
1856 |              avoid allocating large structures on the stack
1857 |       1.01 (2016-01-16)
1858 |              STBIW_REALLOC_SIZED: support allocators with no realloc support
1859 |              avoid race-condition in crc initialization
1860 |              minor compile issues
1861 |       1.00 (2015-09-14)
1862 |              installable file IO function
1863 |       0.99 (2015-09-13)
1864 |              warning fixes; TGA rle support
1865 |       0.98 (2015-04-08)
1866 |              added STBIW_MALLOC, STBIW_ASSERT etc
1867 |       0.97 (2015-01-18)
1868 |              fixed HDR asserts, rewrote HDR rle logic
1869 |       0.96 (2015-01-17)
1870 |              add HDR output
1871 |              fix monochrome BMP
1872 |       0.95 (2014-08-17)
1873 |              add monochrome TGA output
1874 |       0.94 (2014-05-31)
1875 |              rename private functions to avoid conflicts with stb_image.h
1876 |       0.93 (2014-05-27)
1877 |              warning fixes
1878 |       0.92 (2010-08-01)
1879 |              casts to unsigned char to fix warnings
1880 |       0.91 (2010-07-17)
1881 |              first public release
1882 |       0.90   first internal release
1883 | */
1884 | 
1885 | /*
1886 | ------------------------------------------------------------------------------
1887 | This software is available under 2 licenses -- choose whichever you prefer.
1888 | ------------------------------------------------------------------------------
1889 | ALTERNATIVE A - MIT License
1890 | Copyright (c) 2017 Sean Barrett
1891 | Permission is hereby granted, free of charge, to any person obtaining a copy of
1892 | this software and associated documentation files (the "Software"), to deal in
1893 | the Software without restriction, including without limitation the rights to
1894 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
1895 | of the Software, and to permit persons to whom the Software is furnished to do
1896 | so, subject to the following conditions:
1897 | The above copyright notice and this permission notice shall be included in all
1898 | copies or substantial portions of the Software.
1899 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1900 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1901 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1902 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1903 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1904 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1905 | SOFTWARE.
1906 | ------------------------------------------------------------------------------
1907 | ALTERNATIVE B - Public Domain (www.unlicense.org)
1908 | This is free and unencumbered software released into the public domain.
1909 | Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
1910 | software, either in source code form or as a compiled binary, for any purpose,
1911 | commercial or non-commercial, and by any means.
1912 | In jurisdictions that recognize copyright laws, the author or authors of this
1913 | software dedicate any and all copyright interest in the software to the public
1914 | domain. We make this dedication for the benefit of the public at large and to
1915 | the detriment of our heirs and successors. We intend this dedication to be an
1916 | overt act of relinquishment in perpetuity of all present and future rights to
1917 | this software under copyright law.
1918 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1919 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1920 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1921 | AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
1922 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
1923 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1924 | ------------------------------------------------------------------------------
1925 | */
1926 | 


--------------------------------------------------------------------------------