├── images ├── owl.jpg ├── asyncimg.png ├── imgrad.png ├── sobelX_img.png ├── sobelY_img.png └── sobelXY_img.png ├── include ├── cudaimproc │ ├── imutils.h │ ├── imgio.h │ ├── cudacheck.h │ ├── execonfig.h │ └── utils.h └── matrix │ └── matrix.cuh ├── src ├── blur │ └── gaussian.cpp ├── imutils.cpp ├── utils.cpp ├── imgio.cpp ├── execonfig.cpp ├── gengrad │ ├── gengrad.cu │ ├── trueasyncgrad.cu │ └── asyncgengrad.cu ├── edged │ ├── sobelx.cu │ ├── sobely.cu │ └── sobelxy.cu └── stb │ └── stb_image_write.h ├── .gitignore ├── LICENSE ├── README.md └── CMakeLists.txt /images/owl.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-K-E/cuda-improc/master/images/owl.jpg -------------------------------------------------------------------------------- /images/asyncimg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-K-E/cuda-improc/master/images/asyncimg.png -------------------------------------------------------------------------------- /images/imgrad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-K-E/cuda-improc/master/images/imgrad.png -------------------------------------------------------------------------------- /images/sobelX_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-K-E/cuda-improc/master/images/sobelX_img.png -------------------------------------------------------------------------------- /images/sobelY_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/D-K-E/cuda-improc/master/images/sobelY_img.png -------------------------------------------------------------------------------- /images/sobelXY_img.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/D-K-E/cuda-improc/master/images/sobelXY_img.png -------------------------------------------------------------------------------- /include/cudaimproc/imutils.h: -------------------------------------------------------------------------------- 1 | #ifndef IMUTILS_H 2 | #define IMUTILS_H 3 | 4 | #include 5 | 6 | namespace cudaimproc { 7 | img_info owl_img(); 8 | } 9 | #endif 10 | -------------------------------------------------------------------------------- /src/blur/gaussian.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | // 4 | #include 5 | // 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace cudaimproc {} 12 | 13 | int main() { 14 | 15 | cudaimproc::img_info info = cudaimproc::owl_img(); 16 | } 17 | -------------------------------------------------------------------------------- /src/imutils.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace cudaimproc { 4 | 5 | img_info owl_img() { 6 | std::filesystem::path img_dir(IMAGE_DIR); 7 | std::filesystem::path imname("owl.jpg"); 8 | std::filesystem::path imgp = img_dir / imname; 9 | 10 | // image config 11 | img_info info = cudaimproc::imread(imgp); 12 | return info; 13 | } 14 | } // namespace cudaimproc 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | # CUDA 35 | *.i 36 | *.ii 37 | *.gpu 
// Image buffer plus its dimensions.
//
// `data` is owned by the object: the destructor releases it with delete[].
// Because of that ownership the implicitly generated (shallow) copy would
// make two objects delete the same buffer, so copy, move and assignment are
// spelled out here.
struct img_info {
  unsigned char *data; // owned; width * height * channels bytes
  std::size_t width, height, channels;
  const char *name; // not owned, pointer is copied as-is
                    // NOTE(review): presumably a string literal - confirm

  img_info() = delete;

  // Defined in imgio.cpp: allocates a fresh buffer of w * h * c bytes and
  // copies that many bytes from `d` into it.
  img_info(unsigned char *d, std::size_t w, std::size_t h,
           std::size_t c, const char *n);

  // Deep copy: delegates to the copying constructor above so that each
  // object ends up with its own buffer.
  img_info(const img_info &other)
      : img_info(other.data, other.width, other.height,
                 other.channels, other.name) {}

  // Move: steal the buffer and leave `other` empty so that its destructor
  // becomes a no-op (delete[] nullptr).
  img_info(img_info &&other) noexcept
      : data(other.data), width(other.width),
        height(other.height), channels(other.channels),
        name(other.name) {
    other.data = nullptr;
    other.width = 0;
    other.height = 0;
    other.channels = 0;
  }

  // Unified copy/move assignment: `other` is already a private copy (or a
  // moved-into temporary); swap buffers and let its destructor free ours.
  img_info &operator=(img_info other) noexcept {
    unsigned char *previous = data;
    data = other.data;
    other.data = previous;
    width = other.width;
    height = other.height;
    channels = other.channels;
    name = other.name;
    return *this;
  }

  ~img_info() { delete[] data; }
};
const int line) { 10 | if (res != cudaSuccess) { 11 | std::cout << "CUDA ERROR :: " 12 | << static_cast(res) << " " 13 | << cudaGetErrorName(res) << " in function " 14 | << func_name << " at line " << line 15 | << " of file " << file_name << std::endl; 16 | cudaDeviceReset(); 17 | exit(99); 18 | } 19 | }; 20 | } // namespace cuda_imgproc 21 | 22 | #define CUDA_CHECK(v) \ 23 | cuda_imgproc::print_cuda_error((v), #v, __FILE__, \ 24 | __LINE__) 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /include/cudaimproc/execonfig.h: -------------------------------------------------------------------------------- 1 | #ifndef EXECONFIG_H 2 | #define EXECONFIG_H 3 | // execution config handler for cuda kernels 4 | #include 5 | #include 6 | 7 | namespace cudaimproc { 8 | 9 | struct ExecutionConfig1D { 10 | 11 | ExecutionConfig1D() = delete; 12 | 13 | //! \brief cuda stream based execution config 14 | ExecutionConfig1D(std::size_t nb_elements_to_process, 15 | const std::size_t threads_per_b = 32, 16 | const std::size_t nb_s = 1); 17 | 18 | std::size_t block_nb(std::size_t stream_index = 0) const; 19 | std::size_t nb_threads() const; 20 | std::size_t nb_streams() const; 21 | 22 | private: 23 | std::size_t 24 | find_nb_blocks(std::size_t nb_elements_to_process) const; 25 | 26 | void mk_stream_chunks( 27 | std::size_t total_nb_elements_to_process); 28 | 29 | const std::size_t threads_per_block{32}; 30 | const std::size_t stream_nb{1}; 31 | 32 | std::vector> 33 | stream_chunks; 34 | }; 35 | 36 | } // namespace cudaimproc 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 D-K-E 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in 
// adapted from https://stackoverflow.com/a/9943098
//
// Splits the index range [0, seq.size() - 1] into consecutive inclusive
// (start, end) pairs of k elements and reports each pair through f(start, end).
//
// The trailing elements are never emitted as a chunk of their own when only
// one (or none) of them would be left over: they are merged into the
// preceding chunk instead. Examples:
//   chunks([0..3], 3) -> (0,3)
//   chunks([0..5], 2) -> (0,1) (2,3) (4,5)
//   chunks([0..6], 2) -> (0,1) (2,3) (4,6)
// When k >= seq.size() the whole range is reported as one chunk.
//
// k == 0 is treated as "no chunking" and reports the whole range once;
// previously it never advanced and looped forever for sequences with more
// than one element.
//
// NOTE: for an empty sequence the single reported pair is (0, size - 1) with
// an unsigned wrap-around in `size - 1`; this pre-existing behaviour is kept
// unchanged.
//
// seq : any container exposing size(); only the size is used.
// k   : requested number of elements per chunk.
// f   : callable invoked as f(start, end) for every chunk, in order.
template <typename Container, typename Fn>
void chunks(const Container &seq, std::size_t k, Fn f) {
  const std::size_t size = seq.size();

  if (k == 0) {
    // a zero chunk size cannot make progress; report everything at once
    f(std::size_t{0}, size - 1);
    return;
  }

  std::size_t i = 0;
  bool last_added = false;

  if (size > k) {
    while (i < (size - k)) {
      const std::size_t start = i;
      const std::size_t end = i + k - 1;
      const std::size_t next_start = i + k;
      if (next_start >= (size - 1)) {
        // at most one element would remain: fold it into this chunk
        f(start, size - 1);
        last_added = true;
        break;
      }
      f(start, end);
      i = next_start;
    }
  }

  // whatever is left after the full-size chunks forms the last chunk
  if (!last_added) {
    f(i, size - 1);
  }
}
24 | - source image: owl image 25 | - `sobelx.cu` image: owl image with sobel filter applied in x direction 26 | - `sobely.cu` image: owl image with sobel filter applied in y direction 27 | - `sobelxy.cu` image: owl image with sobel filter applied in both x and y direction 28 | 29 | -------------------------------------------------------------------------------- /src/imgio.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | // 7 | #define STB_IMAGE_WRITE_IMPLEMENTATION 8 | #include "stb/stb_image_write.h" 9 | 10 | // 11 | #define STB_IMAGE_IMPLEMENTATION 12 | #include "stb/stb_image.h" 13 | 14 | namespace cudaimproc { 15 | 16 | void render(std::optional pixels_opt, 17 | int imheight, int imwidth, int channel, 18 | const char *imname) { 19 | if (pixels_opt.has_value()) { 20 | unsigned char *pixels_host = *pixels_opt; 21 | int bytes_per_pixel = channel; 22 | int bytes_per_line = imwidth * bytes_per_pixel; 23 | std::string name(imname); 24 | name += ".png"; 25 | const char *fname = name.c_str(); 26 | stbi_write_png(fname, imwidth, imheight, 27 | bytes_per_pixel, pixels_host, 28 | bytes_per_line); 29 | 30 | } else { 31 | std::cout << "P3" << std::endl; 32 | std::cout << imwidth << " " << imheight << std::endl; 33 | std::cout << "255" << std::endl; 34 | for (int j = imheight - 1; j >= 0; --j) { 35 | for (int i = 0; i < imwidth; ++i) { 36 | auto r = static_cast(i) / (imwidth - 1); 37 | auto g = static_cast(j) / (imheight - 1); 38 | auto b = 0.25; 39 | 40 | int ir = static_cast(255.999 * r); 41 | int ig = static_cast(255.999 * g); 42 | int ib = static_cast(255.999 * b); 43 | std::cout << ir << ' ' << ig << ' ' << ib 44 | << std::endl; 45 | } 46 | } 47 | } 48 | } 49 | 50 | img_info::img_info(unsigned char *d, std::size_t w, 51 | std::size_t h, std::size_t c, 52 | const char *n) 53 | : data{nullptr}, width(w), height(h), channels(c), 54 | name(n) { 55 | std::size_t imsize = width * height * 
channels; 56 | data = new unsigned char[imsize]; 57 | std::copy(d, d + (width * h * c), 58 | data); // copy the data into p2 59 | } 60 | 61 | img_info imread(std::filesystem::path impath) { 62 | // 63 | int w, h, c; 64 | std::filesystem::path im_p = impath.make_preferred(); 65 | const char *img_p = im_p.c_str(); 66 | unsigned char *img = stbi_load(img_p, &w, &h, &c, 0); 67 | img_info info(img, w, h, c, "in_image"); 68 | return info; 69 | } 70 | 71 | 72 | } // namespace cudaimproc 73 | -------------------------------------------------------------------------------- /src/execonfig.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | namespace cudaimproc { 7 | 8 | // find total number of blocks when give number of elements 9 | // to process 10 | std::size_t ExecutionConfig1D::find_nb_blocks( 11 | std::size_t nb_elements_to_process) const { 12 | int remainder = 13 | nb_elements_to_process % threads_per_block; 14 | 15 | std::size_t total_b = 1; 16 | if (remainder == 0) { 17 | total_b = static_cast( 18 | nb_elements_to_process / threads_per_block); 19 | } else if (remainder != 0) { 20 | std::size_t temp = 21 | nb_elements_to_process + threads_per_block; 22 | total_b = static_cast((temp - remainder) / 23 | remainder); 24 | } 25 | return total_b; 26 | } 27 | 28 | // finds number of elements to be processed by each stream 29 | void ExecutionConfig1D::mk_stream_chunks( 30 | std::size_t total_nb_elements_to_process) { 31 | std::size_t stream_size = static_cast( 32 | total_nb_elements_to_process / stream_nb); 33 | stream_chunks.clear(); 34 | stream_chunks = 35 | chunker(total_nb_elements_to_process, stream_size); 36 | } 37 | std::size_t ExecutionConfig1D::nb_threads() const { 38 | return threads_per_block; 39 | } 40 | std::size_t ExecutionConfig1D::nb_streams() const { 41 | return stream_nb; 42 | } 43 | 44 | ExecutionConfig1D::ExecutionConfig1D( 45 | std::size_t nb_elements_to_process, 
46 | const std::size_t threads_per_b, const std::size_t nb_s) 47 | : threads_per_block(threads_per_b), stream_nb(nb_s) { 48 | mk_stream_chunks(nb_elements_to_process); 49 | // distribute this to streams 50 | } 51 | 52 | // get block number given streams index 53 | std::size_t ExecutionConfig1D::block_nb( 54 | std::size_t stream_index) const { 55 | if (stream_index >= stream_nb) { 56 | std::string indx = std::to_string(stream_index); 57 | std::string nbs = std::to_string(stream_nb); 58 | std::string msg = "given stream index " + indx; 59 | msg += 60 | " is >= available number of streams " + nbs; 61 | throw std::runtime_error(msg); 62 | } 63 | std::pair stream_start_end = 64 | stream_chunks[stream_index]; 65 | auto start = stream_start_end.first; 66 | auto end = stream_start_end.second; 67 | std::size_t nb_elements_to_process_by_stream = 68 | (end + 1) - start; 69 | std::size_t block_size = 70 | find_nb_blocks(nb_elements_to_process_by_stream); 71 | return block_size; 72 | } 73 | } // namespace cudaimproc 74 | -------------------------------------------------------------------------------- /src/gengrad/gengrad.cu: -------------------------------------------------------------------------------- 1 | // simple 2 | #include 3 | #include 4 | #include 5 | // 6 | #include 7 | #include 8 | #include 9 | 10 | namespace cudaimproc { 11 | 12 | __global__ void gen_gradient(unsigned char *pixels, 13 | const int imwidth, 14 | const int imheight) { 15 | // 16 | int row = threadIdx.x; // local thread index 17 | row += 18 | blockIdx.x * blockDim.x; // thread block id * number 19 | // of threads_per_block 20 | int bytes_per_pixel = 3; 21 | int per_scanline = imwidth * bytes_per_pixel; 22 | 23 | float g = static_cast(row) / 24 | static_cast(imheight - 1); 25 | if (row >= imheight) 26 | return; 27 | for (int col = 0; col < imwidth; ++col) { 28 | // 29 | float r = static_cast(col) / 30 | static_cast(imwidth - 1); 31 | float b = 0.25f; 32 | 33 | unsigned char red = 34 | static_cast(r * 
255.99); 35 | unsigned char green = 36 | static_cast(g * 255.99); 37 | unsigned char blue = 38 | static_cast(b * 255.99); 39 | // 40 | int index = col * bytes_per_pixel + row * per_scanline; 41 | pixels[index] = red; 42 | pixels[index + 1] = green; 43 | pixels[index + 2] = blue; 44 | } 45 | } 46 | 47 | }; // namespace cudaimproc 48 | 49 | int main(void) { // yep this is (void) type of day 50 | // 51 | float aspect_ratio = 16.0f / 9.0f; 52 | const std::size_t imwidth = 640; 53 | const std::size_t imheight = static_cast( 54 | static_cast(imwidth) / aspect_ratio); 55 | // execution config 56 | std::size_t threads_per_block = 64; 57 | std::size_t nb_rows_to_process = imheight; 58 | cudaimproc::ExecutionConfig1D config(nb_rows_to_process, 59 | threads_per_block); 60 | std::size_t bytes_per_line = imwidth * 3; 61 | std::size_t imsize = bytes_per_line * imheight; 62 | unsigned char *pixels_device{nullptr}; 63 | // 64 | // cuda malloc 65 | CUDA_CHECK(cudaMalloc((void **)(&pixels_device), 66 | imsize * sizeof(unsigned char))); 67 | 68 | // 69 | std::cout << "block_nb: " << config.block_nb(0) << std::endl; 70 | cudaimproc::gen_gradient<<>>( 72 | pixels_device, imwidth, imheight); 73 | CUDA_CHECK(cudaGetLastError()); 74 | CUDA_CHECK(cudaDeviceSynchronize()); 75 | // 76 | unsigned char *pixels_host = new unsigned char[imsize]; 77 | CUDA_CHECK(cudaMemcpy(pixels_host, pixels_device, 78 | imsize * sizeof(unsigned char), 79 | cudaMemcpyDeviceToHost)); 80 | CUDA_CHECK(cudaFree(pixels_device)); 81 | 82 | // 83 | cudaimproc::render(std::make_optional(pixels_host), 84 | imheight, imwidth); 85 | // cuda_imgproc::render(std::nullopt, imheight, imwidth); 86 | delete[] pixels_host; 87 | return 0; 88 | } 89 | -------------------------------------------------------------------------------- /src/gengrad/trueasyncgrad.cu: -------------------------------------------------------------------------------- 1 | // simple 2 | #include 3 | #include 4 | #include 5 | // 6 | #include 7 | #include 8 | 
// Work-in-progress kernel: one thread per image column, walking down the
// rows of that column. It currently only computes the colour values; the
// store into `pixels` is still to be written (see TODO below).
//
// pixels        : output buffer (not written yet).
// imwidth       : image width in pixels; threads with col >= imwidth exit.
// imheight      : image height in pixels.
// stream_offset : value passed by the launching stream; unused so far.
//
// The empty `if (rgb_offset == 2) {}` placeholder was removed: `rgb_offset`
// is not declared in this kernel, so the file could not compile.
__global__ void
gen_gradient_async(unsigned char *pixels, const int imwidth,
                   const int imheight,
                   const std::size_t stream_offset) {
  //
  int col = threadIdx.x; // local thread index
  col +=
      blockIdx.x * blockDim.x; // thread block id * number
                               // of threads_per_block
  if (col >= imwidth) {
    return;
  }

  int bytes_per_pixel = 3;
  int per_scanline = imwidth * bytes_per_pixel;

  float g = static_cast<float>(col) /
            static_cast<float>(imwidth - 1);
  unsigned char green =
      static_cast<unsigned char>(g * 255.99);

  float b = 0.25f;
  unsigned char blue = static_cast<unsigned char>(b * 255.99);

  for (int row = 0; row < imheight; ++row) {
    //
    float r = static_cast<float>(row) /
              static_cast<float>(imheight - 1);
    unsigned char red =
        static_cast<unsigned char>(r * 255.99);
    // TODO write offset code
  }
}
80 | CUDA_CHECK( 81 | cudaMalloc((void **)(&pixels_device), imsizeInBytes)); 82 | 83 | // 84 | 85 | for (int i = 0; i < nb_streams; ++i) { 86 | std::size_t sblock_nb = 87 | streamSize / config.nb_threads(); 88 | cudaimproc::gen_gradient_async<<< 89 | sblock_nb, config.nb_threads(), 0, streams[i]>>>( 90 | pixels_device, imwidth, imheight, i); 91 | CUDA_CHECK(cudaGetLastError()); 92 | } 93 | CUDA_CHECK(cudaDeviceSynchronize()); 94 | // 95 | unsigned char *pixels_host = new unsigned char[imsize]; 96 | CUDA_CHECK(cudaMemcpy(pixels_host, pixels_device, 97 | imsizeInBytes, 98 | cudaMemcpyDeviceToHost)); 99 | CUDA_CHECK(cudaFree(pixels_device)); 100 | 101 | // 102 | cudaimproc::render(std::make_optional(pixels_host), 103 | imheight, imwidth, 3, "asyncimg"); 104 | // destroy resources 105 | for (int i = 0; i < nb_streams; ++i) { 106 | CUDA_CHECK(cudaStreamDestroy(streams[i])); 107 | } 108 | delete[] pixels_host; 109 | return 0; 110 | } 111 | -------------------------------------------------------------------------------- /src/gengrad/asyncgengrad.cu: -------------------------------------------------------------------------------- 1 | // simple 2 | #include 3 | #include 4 | #include 5 | // 6 | #include 7 | #include 8 | #include 9 | 10 | namespace cudaimproc { 11 | __global__ void gen_gradient_async(unsigned char *pixels, 12 | const int imwidth, 13 | const int imheight, 14 | const int rgb_offset) { 15 | // 16 | int col = threadIdx.x; // local thread index 17 | col += 18 | blockIdx.x * blockDim.x; // thread block id * number 19 | // of threads_per_block 20 | if (col >= imwidth) { 21 | return; 22 | } 23 | 24 | float pv = static_cast(col) / 25 | static_cast(imwidth - 1); 26 | unsigned char p = static_cast(pv * 255.99); 27 | if (rgb_offset == 2) { 28 | float b = 0.25f; 29 | p = static_cast(b * 255.99); 30 | } 31 | int bytes_per_pixel = 3; 32 | int per_scanline = imwidth * bytes_per_pixel; 33 | 34 | for (int row = 0; row < imheight; ++row) { 35 | // 36 | if (rgb_offset == 1) { 37 
| float pv = static_cast(row) / 38 | static_cast(imheight - 1); 39 | p = static_cast(pv * 255.99); 40 | } 41 | int index = col * bytes_per_pixel + row * per_scanline; 42 | pixels[index + rgb_offset] = p; 43 | } 44 | } 45 | }; // namespace cudaimproc 46 | 47 | int main(void) { // yep this is (void) type of day 48 | // 49 | 50 | // image config 51 | const float aspect_ratio = 16.0f / 9.0f; 52 | const std::size_t imwidth = 640; 53 | const std::size_t imheight = static_cast( 54 | static_cast(imwidth) / aspect_ratio); 55 | const std::size_t bytes_per_line = imwidth * 3; 56 | const std::size_t imsize = bytes_per_line * imheight; 57 | const std::size_t imsizeInBytes = 58 | imsize * sizeof(unsigned char); 59 | 60 | // kernel config 61 | 62 | const std::size_t threads_per_block = 64; 63 | const std::size_t nb_streams = 64 | 3; // 1 for each rgb component 65 | cudaimproc::ExecutionConfig1D config( 66 | imheight, threads_per_block, nb_streams); 67 | const std::size_t streamSize = 68 | imsize / config.nb_streams(); 69 | cudaStream_t streams[nb_streams]; 70 | 71 | // create cuda stream 72 | for (int i = 0; i < nb_streams; ++i) { 73 | CUDA_CHECK(cudaStreamCreate(&streams[i])); 74 | } 75 | 76 | unsigned char *pixels_device{nullptr}; 77 | // 78 | // cuda malloc 79 | CUDA_CHECK( 80 | cudaMalloc((void **)(&pixels_device), imsizeInBytes)); 81 | 82 | // 83 | 84 | for (int i = 0; i < nb_streams; ++i) { 85 | std::size_t sblock_nb = 86 | streamSize / config.nb_threads(); 87 | cudaimproc::gen_gradient_async<<< 88 | sblock_nb, config.nb_threads(), 0, streams[i]>>>( 89 | pixels_device, imwidth, imheight, i); 90 | CUDA_CHECK(cudaGetLastError()); 91 | } 92 | CUDA_CHECK(cudaDeviceSynchronize()); 93 | // 94 | unsigned char *pixels_host = new unsigned char[imsize]; 95 | CUDA_CHECK(cudaMemcpy(pixels_host, pixels_device, 96 | imsizeInBytes, 97 | cudaMemcpyDeviceToHost)); 98 | CUDA_CHECK(cudaFree(pixels_device)); 99 | 100 | // 101 | cudaimproc::render(std::make_optional(pixels_host), 102 | 
imheight, imwidth, 3, "asyncimg"); 103 | // destroy resources 104 | for (int i = 0; i < nb_streams; ++i) { 105 | CUDA_CHECK(cudaStreamDestroy(streams[i])); 106 | } 107 | delete[] pixels_host; 108 | return 0; 109 | } 110 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18 FATAL_ERROR) 2 | project("CUDA-imgproc" LANGUAGES CXX CUDA) 3 | 4 | find_package(CUDAToolkit REQUIRED) 5 | 6 | set(ExecutionSRC 7 | "${CMAKE_CURRENT_SOURCE_DIR}/src/execonfig.cpp" 8 | "${CMAKE_CURRENT_SOURCE_DIR}/src/utils.cpp" 9 | ) 10 | 11 | # single stream gradient 12 | add_executable(gengrad.out 13 | "${CMAKE_CURRENT_SOURCE_DIR}/src/gengrad/gengrad.cu" 14 | "${CMAKE_CURRENT_SOURCE_DIR}/src/imgio.cpp" 15 | ${ExecutionSRC} 16 | ) 17 | 18 | target_include_directories(gengrad.out PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") 19 | 20 | target_link_libraries(gengrad.out PRIVATE CUDA::cudart) 21 | target_link_libraries(gengrad.out PRIVATE CUDA::cuda_driver) 22 | target_link_libraries(gengrad.out PRIVATE CUDA::curand) 23 | 24 | set_target_properties(gengrad.out PROPERTIES LINKER_LANGUAGE CUDA) 25 | set_target_properties(gengrad.out PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 26 | 27 | # multi stream gradient 28 | add_executable(asyncgengrad.out 29 | "${CMAKE_CURRENT_SOURCE_DIR}/src/gengrad/asyncgengrad.cu" 30 | "${CMAKE_CURRENT_SOURCE_DIR}/src/imgio.cpp" 31 | ${ExecutionSRC} 32 | ) 33 | 34 | target_include_directories(asyncgengrad.out PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") 35 | 36 | target_link_libraries(asyncgengrad.out PRIVATE CUDA::cudart) 37 | target_link_libraries(asyncgengrad.out PRIVATE CUDA::cuda_driver) 38 | target_link_libraries(asyncgengrad.out PRIVATE CUDA::curand) 39 | 40 | set_target_properties(asyncgengrad.out PROPERTIES LINKER_LANGUAGE CUDA) 41 | set_target_properties(asyncgengrad.out PROPERTIES 
CUDA_SEPARABLE_COMPILATION ON) 42 | 43 | # 44 | # sobel 45 | set(ImDir "${CMAKE_CURRENT_SOURCE_DIR}/images") 46 | set(ImSRC 47 | "${CMAKE_CURRENT_SOURCE_DIR}/src/imgio.cpp" 48 | "${CMAKE_CURRENT_SOURCE_DIR}/src/imutils.cpp" 49 | ) 50 | 51 | add_executable(sobelx.out 52 | "${CMAKE_CURRENT_SOURCE_DIR}/src/edged/sobelx.cu" 53 | ${ImSRC} 54 | ${ExecutionSRC} 55 | ) 56 | 57 | target_include_directories(sobelx.out PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") 58 | 59 | target_link_libraries(sobelx.out PRIVATE CUDA::cudart) 60 | target_link_libraries(sobelx.out PRIVATE CUDA::cuda_driver) 61 | target_link_libraries(sobelx.out PRIVATE CUDA::curand) 62 | 63 | set_target_properties(sobelx.out PROPERTIES LINKER_LANGUAGE CUDA) 64 | set_target_properties(sobelx.out PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 65 | 66 | target_compile_definitions( 67 | sobelx.out 68 | PRIVATE 69 | IMAGE_DIR="${ImDir}" 70 | ) 71 | 72 | add_executable(sobely.out 73 | "${CMAKE_CURRENT_SOURCE_DIR}/src/edged/sobely.cu" 74 | ${ImSRC} 75 | ${ExecutionSRC} 76 | ) 77 | 78 | target_include_directories(sobely.out PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") 79 | 80 | target_link_libraries(sobely.out PRIVATE CUDA::cudart) 81 | target_link_libraries(sobely.out PRIVATE CUDA::cuda_driver) 82 | target_link_libraries(sobely.out PRIVATE CUDA::curand) 83 | 84 | set_target_properties(sobely.out PROPERTIES LINKER_LANGUAGE CUDA) 85 | set_target_properties(sobely.out PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 86 | 87 | target_compile_definitions( 88 | sobely.out 89 | PRIVATE 90 | IMAGE_DIR="${ImDir}" 91 | ) 92 | 93 | 94 | add_executable(sobelxy.out 95 | "${CMAKE_CURRENT_SOURCE_DIR}/src/edged/sobelxy.cu" 96 | ${ImSRC} 97 | ${ExecutionSRC} 98 | ) 99 | 100 | target_include_directories(sobelxy.out PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") 101 | 102 | target_link_libraries(sobelxy.out PRIVATE CUDA::cudart) 103 | target_link_libraries(sobelxy.out PRIVATE CUDA::cuda_driver) 104 | target_link_libraries(sobelxy.out PRIVATE 
CUDA::curand) 105 | 106 | set_target_properties(sobelxy.out PROPERTIES LINKER_LANGUAGE CUDA) 107 | set_target_properties(sobelxy.out PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 108 | 109 | target_compile_definitions( 110 | sobelxy.out 111 | PRIVATE 112 | IMAGE_DIR="${ImDir}" 113 | ) 114 | -------------------------------------------------------------------------------- /src/edged/sobelx.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | // 4 | #include 5 | // 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace cudaimproc { 12 | // 13 | __global__ void sobelX_3x3(unsigned char *in_img, 14 | unsigned char *out_img, 15 | const int imwidth, 16 | const int imheight, 17 | const int rgb_offset) { 18 | // 19 | float sobel_1[] = { 20 | 1, 0, -1, // first row 21 | }; 22 | float sobel_2[] = { 23 | 2, 0, -2, // second row 24 | }; 25 | float sobel_3[] = { 26 | 1, 0, -1 // third row 27 | }; 28 | unsigned int scol_nb = 0; 29 | unsigned int srow_nb = 0; 30 | cudamat::MatN sobel_m; 31 | sobel_m.col_nb(scol_nb); 32 | sobel_m.row_nb(srow_nb); 33 | sobel_m.set_row(0, sobel_1); 34 | sobel_m.set_row(1, sobel_2); 35 | sobel_m.set_row(2, sobel_3); 36 | int col = threadIdx.x; // local thread index 37 | col += 38 | blockIdx.x * blockDim.x; // thread block id * number 39 | // of threads_per_block 40 | if (col >= imwidth) { 41 | return; 42 | } 43 | if (col < scol_nb) { 44 | // do padding or nothing 45 | return; 46 | } 47 | int bytes_per_pixel = 3; 48 | int per_scanline = imwidth * bytes_per_pixel; 49 | 50 | for (int row = srow_nb; row < imheight; ++row) { 51 | // 52 | float g = 0.0f; 53 | for (unsigned int sr = 0; sr < srow_nb; ++sr) { 54 | for (unsigned int sc = 0; sc < scol_nb; ++sc) { 55 | // 56 | int index = (col - sc) * bytes_per_pixel + 57 | (row - sr) * per_scanline; 58 | float scell; 59 | sobel_m.get(sr, sc, scell); 60 | g += scell * in_img[index + rgb_offset]; 61 | } 62 | } 63 | int index = col * 
bytes_per_pixel + row * per_scanline; 64 | out_img[index + rgb_offset] = 65 | static_cast(g); 66 | } 67 | } 68 | }; // namespace cudaimproc 69 | 70 | int main() { 71 | 72 | // image config 73 | cudaimproc::img_info info = cudaimproc::owl_img(); 74 | const std::size_t imgSize = 75 | info.width * info.height * info.channels; 76 | const std::size_t imgSizeByte = 77 | imgSize * sizeof(unsigned char); 78 | // 79 | // execution config 80 | const std::size_t threads_per_block = 64; 81 | const std::size_t nb_streams = 82 | 3; // 1 for each rgb component 83 | const std::size_t streamSize = imgSize / nb_streams; 84 | cudaStream_t streams[nb_streams]; 85 | 86 | // create cuda stream 87 | for (int i = 0; i < nb_streams; ++i) { 88 | CUDA_CHECK(cudaStreamCreate(&streams[i])); 89 | } 90 | 91 | // 92 | unsigned char *d_out_img{nullptr}; 93 | unsigned char *d_in_img{nullptr}; 94 | CUDA_CHECK(cudaMalloc( 95 | reinterpret_cast(&d_out_img), imgSizeByte)); 96 | CUDA_CHECK(cudaMalloc( 97 | reinterpret_cast(&d_in_img), imgSizeByte)); 98 | 99 | CUDA_CHECK( 100 | cudaMemcpy(reinterpret_cast(d_in_img), 101 | reinterpret_cast(info.data), 102 | imgSizeByte, cudaMemcpyHostToDevice)); 103 | 104 | // launch kernel 105 | for (int i = 0; i < nb_streams; ++i) { 106 | cudaimproc:: 107 | sobelX_3x3<<>>( 109 | d_in_img, d_out_img, info.width, info.height, 110 | i); 111 | CUDA_CHECK(cudaGetLastError()); 112 | } 113 | CUDA_CHECK(cudaDeviceSynchronize()); 114 | 115 | // destroy cuda stream 116 | for (int i = 0; i < nb_streams; ++i) { 117 | CUDA_CHECK(cudaStreamDestroy(streams[i])); 118 | } 119 | CUDA_CHECK(cudaFree(d_in_img)); 120 | unsigned char *out_img = new unsigned char[imgSize]; 121 | CUDA_CHECK( 122 | cudaMemcpy(reinterpret_cast(out_img), 123 | reinterpret_cast(d_out_img), 124 | imgSizeByte, cudaMemcpyDeviceToHost)); 125 | CUDA_CHECK(cudaFree(d_out_img)); 126 | // 127 | cudaimproc::render(std::make_optional(out_img), 128 | info.height, info.width, info.channels, 129 | "sobelX_img"); 130 | // 
/**
 * Apply the 3x3 Sobel Y kernel to one colour channel of an interleaved
 * image that stores 3 bytes per pixel.
 *
 * Every thread owns one image column (threadIdx.x + blockIdx.x *
 * blockDim.x) and walks down the rows of that column. Filter taps are
 * read at (col - sc, row - sr), so columns and rows with an index below
 * the kernel size are skipped and their output bytes are never written.
 *
 * The accumulated response is saturated to [0, 255] before it is stored:
 * converting a negative or too-large float straight to unsigned char is
 * not a defined conversion, so negative responses are clipped to 0 and
 * strong edges to 255 instead of wrapping around.
 *
 * @param in_img     source image bytes
 * @param out_img    destination image bytes, same layout as in_img
 * @param imwidth    image width in pixels
 * @param imheight   image height in pixels
 * @param rgb_offset byte offset of the processed channel inside a pixel
 */
__global__ void sobelY_3x3(unsigned char *in_img,
                           unsigned char *out_img,
                           const int imwidth,
                           const int imheight,
                           const int rgb_offset) {
  const float sobel_1[] = {1.0f, 2.0f, 1.0f};    // first row
  const float sobel_2[] = {0.0f, 0.0f, 0.0f};    // second row
  const float sobel_3[] = {-1.0f, -2.0f, -1.0f}; // third row

  // NOTE(review): template arguments reconstructed as <float, 3, 3> from
  // the three 3-element rows set below -- confirm against matrix.cuh.
  cudamat::MatN<float, 3, 3> sobel_m;
  sobel_m.set_row(0, sobel_1);
  sobel_m.set_row(1, sobel_2);
  sobel_m.set_row(2, sobel_3);

  unsigned int scol_nb = 0;
  unsigned int srow_nb = 0;
  sobel_m.col_nb(scol_nb);
  sobel_m.row_nb(srow_nb);
  const int kcols = static_cast<int>(scol_nb);
  const int krows = static_cast<int>(srow_nb);

  // global column handled by this thread
  const int col =
      static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
  if (col >= imwidth || col < kcols) {
    // outside the image, or too close to the left border to filter
    return;
  }

  const int bytes_per_pixel = 3;
  const int per_scanline = imwidth * bytes_per_pixel;

  for (int row = krows; row < imheight; ++row) {
    float g = 0.0f;
    for (int sr = 0; sr < krows; ++sr) {
      for (int sc = 0; sc < kcols; ++sc) {
        const int tap = (col - sc) * bytes_per_pixel +
                        (row - sr) * per_scanline;
        float scell = 0.0f;
        sobel_m.get(sr, sc, scell);
        g += scell * in_img[tap + rgb_offset];
      }
    }
    const int index = col * bytes_per_pixel + row * per_scanline;
    // saturate before narrowing so the stored byte is well defined
    const float clipped = fminf(fmaxf(g, 0.0f), 255.0f);
    out_img[index + rgb_offset] =
        static_cast<unsigned char>(clipped);
  }
}
std::filesystem::path img_dir(IMAGE_DIR); 72 | std::filesystem::path imname("owl.jpg"); 73 | std::filesystem::path imgp = img_dir / imname; 74 | 75 | // image config 76 | cudaimproc::img_info info = cudaimproc::imread(imgp); 77 | const std::size_t imgSize = 78 | info.width * info.height * info.channels; 79 | const std::size_t imgSizeByte = 80 | imgSize * sizeof(unsigned char); 81 | // 82 | // execution config 83 | const std::size_t threads_per_block = 64; 84 | const std::size_t nb_streams = 85 | 3; // 1 for each rgb component 86 | const std::size_t streamSize = imgSize / nb_streams; 87 | cudaStream_t streams[nb_streams]; 88 | 89 | // create cuda stream 90 | for (int i = 0; i < nb_streams; ++i) { 91 | CUDA_CHECK(cudaStreamCreate(&streams[i])); 92 | } 93 | 94 | // 95 | unsigned char *d_out_img{nullptr}; 96 | unsigned char *d_in_img{nullptr}; 97 | CUDA_CHECK(cudaMalloc( 98 | reinterpret_cast(&d_out_img), imgSizeByte)); 99 | CUDA_CHECK(cudaMalloc( 100 | reinterpret_cast(&d_in_img), imgSizeByte)); 101 | 102 | CUDA_CHECK( 103 | cudaMemcpy(reinterpret_cast(d_in_img), 104 | reinterpret_cast(info.data), 105 | imgSizeByte, cudaMemcpyHostToDevice)); 106 | 107 | // launch kernel 108 | for (int i = 0; i < nb_streams; ++i) { 109 | cudaimproc:: 110 | sobelY_3x3<<>>( 112 | d_in_img, d_out_img, info.width, info.height, 113 | i); 114 | CUDA_CHECK(cudaGetLastError()); 115 | } 116 | CUDA_CHECK(cudaDeviceSynchronize()); 117 | 118 | // destroy cuda stream 119 | for (int i = 0; i < nb_streams; ++i) { 120 | CUDA_CHECK(cudaStreamDestroy(streams[i])); 121 | } 122 | CUDA_CHECK(cudaFree(d_in_img)); 123 | unsigned char *out_img = new unsigned char[imgSize]; 124 | CUDA_CHECK( 125 | cudaMemcpy(reinterpret_cast(out_img), 126 | reinterpret_cast(d_out_img), 127 | imgSizeByte, cudaMemcpyDeviceToHost)); 128 | CUDA_CHECK(cudaFree(d_out_img)); 129 | // 130 | cudaimproc::render(std::make_optional(out_img), 131 | info.height, info.width, info.channels, 132 | "sobelY_img"); 133 | // 
/**
 * Compute the Sobel gradient magnitude sqrt(gx^2 + gy^2) for one colour
 * channel of an interleaved image that stores 3 bytes per pixel.
 *
 * Every thread owns one image column (threadIdx.x + blockIdx.x *
 * blockDim.x) and walks down the rows of that column. Both 3x3 kernels
 * (from mk_xmat() and mk_ymat()) are evaluated on the same taps at
 * (col - sc, row - sr); columns and rows with an index below the kernel
 * size are skipped and their output bytes are never written.
 *
 * The magnitude can exceed 255, so it is saturated before being narrowed
 * to unsigned char instead of being allowed to wrap around.
 *
 * @param in_img     source image bytes
 * @param out_img    destination image bytes, same layout as in_img
 * @param imwidth    image width in pixels
 * @param imheight   image height in pixels
 * @param rgb_offset byte offset of the processed channel inside a pixel
 */
__global__ void sobelXY_3x3(unsigned char *in_img,
                            unsigned char *out_img,
                            const int imwidth,
                            const int imheight,
                            const int rgb_offset) {
  const auto sobel_y = mk_ymat();
  const auto sobel_x = mk_xmat();

  // both kernels have the same dimensions; query them once
  unsigned int scol_nb = 0;
  unsigned int srow_nb = 0;
  sobel_y.col_nb(scol_nb);
  sobel_y.row_nb(srow_nb);
  const int kcols = static_cast<int>(scol_nb);
  const int krows = static_cast<int>(srow_nb);

  // global column handled by this thread
  const int col =
      static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
  if (col >= imwidth || col < kcols) {
    // outside the image, or too close to the left border to filter
    return;
  }

  const int bytes_per_pixel = 3;
  const int per_scanline = imwidth * bytes_per_pixel;

  for (int row = krows; row < imheight; ++row) {
    float gx = 0.0f;
    float gy = 0.0f;
    for (int sr = 0; sr < krows; ++sr) {
      for (int sc = 0; sc < kcols; ++sc) {
        const int tap = (col - sc) * bytes_per_pixel +
                        (row - sr) * per_scanline;
        const float pixel = in_img[tap + rgb_offset];
        float scell = 0.0f;
        sobel_y.get(sr, sc, scell);
        gy += scell * pixel;
        sobel_x.get(sr, sc, scell);
        gx += scell * pixel;
      }
    }
    const int index = col * bytes_per_pixel + row * per_scanline;
    // magnitude is never negative; only the upper bound needs clipping
    const float g = fminf(sqrtf(gx * gx + gy * gy), 255.0f);
    out_img[index + rgb_offset] = static_cast<unsigned char>(g);
  }
}
/** Status codes reported by the matrix operations. */
enum mstatus_t : std::uint_least8_t {
  SUCCESS = 1,
  INDEX_ERROR = 2,
  SIZE_ERROR = 3,
  ARG_ERROR = 4,
  LU_ERROR = 5,
  NOT_IMPLEMENTED = 6
};

/**
 * Outcome of a matrix operation together with the call-site information
 * (line, file, enclosing function, operation name) that produced it.
 */
struct MResult {
  unsigned int line_info = 0;
  const char *file_name = "";
  const char *fn_name = "";
  const char *call_name = "";
  const char *duration_info = "";

  // Value-initialised so a default-constructed result never carries an
  // indeterminate status; the zero value matches no enumerator and
  // therefore compares unequal to SUCCESS, in line with success == false.
  mstatus_t status{};
  bool success = false;

  /** Build an empty, non-successful result. */
  __host__ __device__ MResult() {}

  /**
   * @param line     source line of the reporting call
   * @param fname    source file of the reporting call
   * @param funcname enclosing function name
   * @param cname    logical operation name
   * @param op       status of the operation; success is derived from it
   */
  __host__ __device__ MResult(unsigned int line,
                              const char *fname,
                              const char *funcname,
                              const char *cname,
                              mstatus_t op)
      : line_info(line), file_name(fname),
        fn_name(funcname), call_name(cname), status(op),
        success(op == SUCCESS) {}
};
/** Build a matrix with every element set to zero. */
__host__ __device__ MatN() {
  for (unsigned int i = 0; i < size; ++i) {
    data[i] = static_cast<T>(0);
  }
}

/**
 * `data` is an array stored inside the object, not a heap allocation, so
 * there is nothing to release; calling delete[] on it would be undefined
 * behaviour.
 */
__host__ __device__ ~MatN() = default;

/**
 * Build a matrix from a flat buffer.
 *
 * @param vd buffer holding at least RowNb * ColNb elements; they are
 *           copied in the same order in which `data` is indexed
 */
__host__ __device__ MatN(const T vd[RowNb * ColNb]) {
  // an array member cannot be initialised from a pointer, so the
  // elements are copied one by one
  for (unsigned int i = 0; i < size; ++i) {
    data[i] = vd[i];
  }
}

/** Build a matrix with every element set to `fill_value`. */
__host__ __device__ MatN(T fill_value) {
  for (unsigned int i = 0; i < size; ++i) {
    data[i] = fill_value;
  }
}
/**
 * Write the transpose of this matrix into `out`: element (r, c) of this
 * matrix becomes element (c, r) of `out`.
 *
 * NOTE(review): the parameter type is reconstructed as
 * MatN<T, ColNb, RowNb>; confirm the template argument order against the
 * class header.
 *
 * @param out destination matrix with swapped dimensions
 * @return always a SUCCESS result
 */
__host__ __device__ MResult
transpose(MatN<T, ColNb, RowNb> &out) const {
  for (unsigned int r = 0; r != nb_rows; ++r) {
    const unsigned int row_start = r * nb_cols;
    for (unsigned int c = 0; c != nb_cols; ++c) {
      out.set(c, r, data[row_start + c]);
    }
  }
  return MResult(__LINE__, __FILE__, __FUNCTION__,
                 "transpose", SUCCESS);
}
/**
 * Copy every element of the matrix into a caller-provided buffer.
 *
 * The array parameter is really a pointer, so assigning `data` to it
 * would only rebind the local copy and leave the caller's buffer
 * untouched; the elements have to be copied one by one.
 *
 * @param out buffer with room for RowNb * ColNb elements
 * @return always a SUCCESS result
 */
__host__ __device__ MResult
get(T out[RowNb * ColNb]) const {
  for (unsigned int i = 0; i < size; ++i) {
    out[i] = data[i];
  }
  return MResult(__LINE__, __FILE__, __FUNCTION__, "get",
                 SUCCESS);
}
/**
 * Copy one row of the matrix into `out`.
 *
 * @param index row to read; must be smaller than RowNb
 * @param out   buffer receiving ColNb elements
 * @return INDEX_ERROR when `index` is out of range (out is untouched),
 *         SUCCESS otherwise
 */
__host__ __device__ MResult row(unsigned int index,
                                T out[ColNb]) const {
  if (index < RowNb) {
    const T *src = data + index * ColNb;
    for (unsigned int c = 0; c != ColNb; ++c) {
      out[c] = src[c];
    }
    return MResult(__LINE__, __FILE__, __FUNCTION__,
                   "row", SUCCESS);
  }
  return MResult(__LINE__, __FILE__, __FUNCTION__,
                 "row", INDEX_ERROR);
}
/**
 * Divide every element of this matrix by the scalar `v`, storing the
 * quotients in `out`.
 *
 * @param v   divisor
 * @param out matrix receiving the element-wise quotients
 * @return ARG_ERROR when `v` equals zero, otherwise whatever apply()
 *         reports
 */
__host__ __device__ MResult
divide(T v, MatN &out) const {
  const bool divisor_is_zero = (v == static_cast<T>(0));
  if (!divisor_is_zero) {
    auto quotient = [](T lhs, T rhs) { return lhs / rhs; };
    return apply(v, quotient, out);
  }
  return MResult(__LINE__, __FILE__, __FUNCTION__,
                 "divide", ARG_ERROR);
}
/**
 * saxpy update y <- y + a * x over N elements (Golub, Van Loan 2013,
 * p. 4, alg. 1.1.2).
 *
 * NOTE(review): the template header is reconstructed as
 * <unsigned int N>; confirm against the original declaration.
 *
 * @param a scalar multiplier
 * @param x input vector with N elements
 * @param y vector with N elements, updated in place
 * @return SIZE_ERROR when N is zero, SUCCESS otherwise
 */
template <unsigned int N>
__host__ __device__ MResult saxpy(const T &a,
                                  const T x[N],
                                  T y[N]) const {
  if (N != 0) {
    unsigned int i = 0;
    while (i < N) {
      y[i] += x[i] * a;
      ++i;
    }
    return MResult(__LINE__, __FILE__, __FUNCTION__,
                   "saxpy", SUCCESS);
  }
  return MResult(__LINE__, __FILE__, __FUNCTION__,
                 "saxpy", SIZE_ERROR);
}
/**
 * Append InRow / ColNb rows, taken from `r_data`, below the rows of this
 * matrix and store the result in `out`.
 *
 * NOTE(review): the template header and the type of `out` are
 * reconstructed from the names used in the body; confirm both against
 * the original declaration.
 *
 * @tparam InRow number of elements in r_data; must be a multiple of
 *               ColNb
 * @param r_data elements of the new rows, one row after another
 * @param out    matrix receiving the current rows followed by the new
 *               ones
 * @return SIZE_ERROR when InRow is not a multiple of ColNb, SUCCESS
 *         otherwise
 */
template <unsigned int InRow>
__host__ __device__ MResult add_rows(
    const T r_data[InRow],
    MatN<T, RowNb + (InRow / ColNb), ColNb> &out) const {
  if ((InRow % ColNb) != 0) {
    return MResult(__LINE__, __FILE__, __FUNCTION__,
                   "add_rows", SIZE_ERROR);
  }
  // start from an all-zero output matrix
  from_row_cols(out);

  // copy the current matrix into the top rows of the output
  for (unsigned int i = 0; i < RowNb; i++) {
    for (unsigned int j = 0; j < ColNb; j++) {
      T value = static_cast<T>(0);
      get(i, j, value);
      out.set(i, j, value);
    }
  }

  // append the rows held in r_data; the bound is exclusive because
  // r_data holds exactly nb_of_rows_to_add * ColNb elements, so one more
  // iteration would read a full row past its end
  const unsigned int nb_of_rows_to_add = InRow / ColNb;
  for (unsigned int i = 0; i < nb_of_rows_to_add; i++) {
    const unsigned int row = RowNb + i;
    for (unsigned int j = 0; j < ColNb; j++) {
      out.set(row, j, r_data[i * ColNb + j]);
    }
  }
  return MResult(__LINE__, __FILE__, __FUNCTION__,
                 "add_rows", SUCCESS);
}
/**
 * Write the matrix to `out`, one matrix row per text line. Non-negative
 * values get a leading space so that they line up with negative ones.
 *
 * Elements are read through the bounds-checked indexed getter rather
 * than through a bulk copy into a scratch array, so no uninitialised
 * storage is ever printed.
 *
 * NOTE(review): the template header is reconstructed as
 * <typename T, unsigned int R, unsigned int C>; keep it in sync with the
 * friend declaration inside MatN.
 *
 * @param out destination stream
 * @param m   matrix to print
 * @return `out`, to allow chaining
 */
template <typename T, unsigned int R, unsigned int C>
__host__ __device__ std::stringstream &
operator<<(std::stringstream &out, MatN<T, R, C> m) {
  constexpr unsigned int arr_size = R * C;
  for (unsigned int i = 0; i < arr_size; i++) {
    T value = static_cast<T>(0);
    m.get(i, value);
    if (i % C == 0) {
      // a new matrix row starts here
      out << '\n';
    }
    if (value >= 0) {
      out << " " << value << " ";
    } else {
      out << value << " ";
    }
  }
  return out;
}
20 | This library is designed for source code compactness and simplicity, 21 | not optimal image file size or run-time performance. 22 | 23 | BUILDING: 24 | 25 | You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h. 26 | You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace 27 | malloc,realloc,free. 28 | You can #define STBIW_MEMMOVE() to replace memmove() 29 | You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function 30 | for PNG compression (instead of the builtin one), it must have the following signature: 31 | unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality); 32 | The returned data will be freed with STBIW_FREE() (free() by default), 33 | so it must be heap allocated with STBIW_MALLOC() (malloc() by default), 34 | 35 | UNICODE: 36 | 37 | If compiling for Windows and you wish to use Unicode filenames, compile 38 | with 39 | #define STBIW_WINDOWS_UTF8 40 | and pass utf8-encoded filenames. Call stbiw_convert_wchar_to_utf8 to convert 41 | Windows wchar_t filenames to utf8. 42 | 43 | USAGE: 44 | 45 | There are five functions, one for each image file format: 46 | 47 | int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes); 48 | int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data); 49 | int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data); 50 | int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality); 51 | int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data); 52 | 53 | void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically 54 | 55 | There are also five equivalent functions that use an arbitrary write function. 
You are 56 | expected to open/close your file-equivalent before and after calling these: 57 | 58 | int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data, int stride_in_bytes); 59 | int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data); 60 | int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data); 61 | int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data); 62 | int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality); 63 | 64 | where the callback is: 65 | void stbi_write_func(void *context, void *data, int size); 66 | 67 | You can configure it with these global variables: 68 | int stbi_write_tga_with_rle; // defaults to true; set to 0 to disable RLE 69 | int stbi_write_png_compression_level; // defaults to 8; set to higher for more compression 70 | int stbi_write_force_png_filter; // defaults to -1; set to 0..5 to force a filter mode 71 | 72 | 73 | You can define STBI_WRITE_NO_STDIO to disable the file variant of these 74 | functions, so the library will not use stdio.h at all. However, this will 75 | also disable HDR writing, because it requires stdio for formatted output. 76 | 77 | Each function returns 0 on failure and non-0 on success. 78 | 79 | The functions create an image file defined by the parameters. The image 80 | is a rectangle of pixels stored from left-to-right, top-to-bottom. 81 | Each pixel contains 'comp' channels of data stored interleaved with 8-bits 82 | per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is 83 | monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall. 84 | The *data pointer points to the first byte of the top-left-most pixel. 
85 | For PNG, "stride_in_bytes" is the distance in bytes from the first byte of 86 | a row of pixels to the first byte of the next row of pixels. 87 | 88 | PNG creates output files with the same number of components as the input. 89 | The BMP format expands Y to RGB in the file format and does not 90 | output alpha. 91 | 92 | PNG supports writing rectangles of data even when the bytes storing rows of 93 | data are not consecutive in memory (e.g. sub-rectangles of a larger image), 94 | by supplying the stride between the beginning of adjacent rows. The other 95 | formats do not. (Thus you cannot write a native-format BMP through the BMP 96 | writer, both because it is in BGR order and because it may have padding 97 | at the end of the line.) 98 | 99 | PNG allows you to set the deflate compression level by setting the global 100 | variable 'stbi_write_png_compression_level' (it defaults to 8). 101 | 102 | HDR expects linear float data. Since the format is always 32-bit rgb(e) 103 | data, alpha (if provided) is discarded, and for monochrome data it is 104 | replicated across all three channels. 105 | 106 | TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed 107 | data, set the global variable 'stbi_write_tga_with_rle' to 0. 108 | 109 | JPEG does ignore alpha channels in input data; quality is between 1 and 100. 110 | Higher quality looks better but results in a bigger image. 111 | JPEG baseline (no JPEG progressive). 
112 | 113 | CREDITS: 114 | 115 | 116 | Sean Barrett - PNG/BMP/TGA 117 | Baldur Karlsson - HDR 118 | Jean-Sebastien Guay - TGA monochrome 119 | Tim Kelsey - misc enhancements 120 | Alan Hickman - TGA RLE 121 | Emmanuel Julien - initial file IO callback implementation 122 | Jon Olick - original jo_jpeg.cpp code 123 | Daniel Gibson - integrate JPEG, allow external zlib 124 | Aarni Koskela - allow choosing PNG filter 125 | 126 | bugfixes: 127 | github:Chribba 128 | Guillaume Chereau 129 | github:jry2 130 | github:romigrou 131 | Sergio Gonzalez 132 | Jonas Karlsson 133 | Filip Wasil 134 | Thatcher Ulrich 135 | github:poppolopoppo 136 | Patrick Boettcher 137 | github:xeekworx 138 | Cap Petschulat 139 | Simon Rodriguez 140 | Ivan Tikhonov 141 | github:ignotion 142 | Adam Schackart 143 | Andrew Kensler 144 | 145 | LICENSE 146 | 147 | See end of file for license information. 148 | 149 | */ 150 | 151 | #ifndef INCLUDE_STB_IMAGE_WRITE_H 152 | #define INCLUDE_STB_IMAGE_WRITE_H 153 | 154 | #include 155 | 156 | // if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline' 157 | #ifndef STBIWDEF 158 | #ifdef STB_IMAGE_WRITE_STATIC 159 | #define STBIWDEF static 160 | #else 161 | #ifdef __cplusplus 162 | #define STBIWDEF extern "C" 163 | #else 164 | #define STBIWDEF extern 165 | #endif 166 | #endif 167 | #endif 168 | 169 | #ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations 170 | STBIWDEF int stbi_write_tga_with_rle; 171 | STBIWDEF int stbi_write_png_compression_level; 172 | STBIWDEF int stbi_write_force_png_filter; 173 | #endif 174 | 175 | #ifndef STBI_WRITE_NO_STDIO 176 | STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes); 177 | STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data); 178 | STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data); 179 | STBIWDEF int stbi_write_hdr(char const 
*filename, int w, int h, int comp, const float *data); 180 | STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality); 181 | 182 | #ifdef STBIW_WINDOWS_UTF8 183 | STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input); 184 | #endif 185 | #endif 186 | 187 | typedef void stbi_write_func(void *context, void *data, int size); 188 | 189 | STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data, int stride_in_bytes); 190 | STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data); 191 | STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data); 192 | STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data); 193 | STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality); 194 | 195 | STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean); 196 | 197 | #endif//INCLUDE_STB_IMAGE_WRITE_H 198 | 199 | #ifdef STB_IMAGE_WRITE_IMPLEMENTATION 200 | 201 | #ifdef _WIN32 202 | #ifndef _CRT_SECURE_NO_WARNINGS 203 | #define _CRT_SECURE_NO_WARNINGS 204 | #endif 205 | #ifndef _CRT_NONSTDC_NO_DEPRECATE 206 | #define _CRT_NONSTDC_NO_DEPRECATE 207 | #endif 208 | #endif 209 | 210 | #ifndef STBI_WRITE_NO_STDIO 211 | #include 212 | #endif // STBI_WRITE_NO_STDIO 213 | 214 | #include 215 | #include 216 | #include 217 | #include 218 | 219 | #if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED)) 220 | // ok 221 | #elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED) 222 | // ok 223 | #else 224 | #error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or 
STBIW_REALLOC_SIZED)." 225 | #endif 226 | 227 | #ifndef STBIW_MALLOC 228 | #define STBIW_MALLOC(sz) malloc(sz) 229 | #define STBIW_REALLOC(p,newsz) realloc(p,newsz) 230 | #define STBIW_FREE(p) free(p) 231 | #endif 232 | 233 | #ifndef STBIW_REALLOC_SIZED 234 | #define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz) 235 | #endif 236 | 237 | 238 | #ifndef STBIW_MEMMOVE 239 | #define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz) 240 | #endif 241 | 242 | 243 | #ifndef STBIW_ASSERT 244 | #include 245 | #define STBIW_ASSERT(x) assert(x) 246 | #endif 247 | 248 | #define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff) 249 | 250 | #ifdef STB_IMAGE_WRITE_STATIC 251 | static int stbi_write_png_compression_level = 8; 252 | static int stbi_write_tga_with_rle = 1; 253 | static int stbi_write_force_png_filter = -1; 254 | #else 255 | int stbi_write_png_compression_level = 8; 256 | int stbi_write_tga_with_rle = 1; 257 | int stbi_write_force_png_filter = -1; 258 | #endif 259 | 260 | static int stbi__flip_vertically_on_write = 0; 261 | 262 | STBIWDEF void stbi_flip_vertically_on_write(int flag) 263 | { 264 | stbi__flip_vertically_on_write = flag; 265 | } 266 | 267 | typedef struct 268 | { 269 | stbi_write_func *func; 270 | void *context; 271 | unsigned char buffer[64]; 272 | int buf_used; 273 | } stbi__write_context; 274 | 275 | // initialize a callback-based context 276 | static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context) 277 | { 278 | s->func = c; 279 | s->context = context; 280 | } 281 | 282 | #ifndef STBI_WRITE_NO_STDIO 283 | 284 | static void stbi__stdio_write(void *context, void *data, int size) 285 | { 286 | fwrite(data,1,size,(FILE*) context); 287 | } 288 | 289 | #if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8) 290 | #ifdef __cplusplus 291 | #define STBIW_EXTERN extern "C" 292 | #else 293 | #define STBIW_EXTERN extern 294 | #endif 295 | STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned 
long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide); 296 | STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default); 297 | 298 | STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input) 299 | { 300 | return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); 301 | } 302 | #endif 303 | 304 | static FILE *stbiw__fopen(char const *filename, char const *mode) 305 | { 306 | FILE *f; 307 | #if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8) 308 | wchar_t wMode[64]; 309 | wchar_t wFilename[1024]; 310 | if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename))) 311 | return 0; 312 | 313 | if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode))) 314 | return 0; 315 | 316 | #if defined(_MSC_VER) && _MSC_VER >= 1400 317 | if (0 != _wfopen_s(&f, wFilename, wMode)) 318 | f = 0; 319 | #else 320 | f = _wfopen(wFilename, wMode); 321 | #endif 322 | 323 | #elif defined(_MSC_VER) && _MSC_VER >= 1400 324 | if (0 != fopen_s(&f, filename, mode)) 325 | f=0; 326 | #else 327 | f = fopen(filename, mode); 328 | #endif 329 | return f; 330 | } 331 | 332 | static int stbi__start_write_file(stbi__write_context *s, const char *filename) 333 | { 334 | FILE *f = stbiw__fopen(filename, "wb"); 335 | stbi__start_write_callbacks(s, stbi__stdio_write, (void *) f); 336 | return f != NULL; 337 | } 338 | 339 | static void stbi__end_write_file(stbi__write_context *s) 340 | { 341 | fclose((FILE *)s->context); 342 | } 343 | 344 | #endif // !STBI_WRITE_NO_STDIO 345 | 346 | typedef unsigned int stbiw_uint32; 347 | typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 
1 : -1]; 348 | 349 | static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v) 350 | { 351 | while (*fmt) { 352 | switch (*fmt++) { 353 | case ' ': break; 354 | case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int)); 355 | s->func(s->context,&x,1); 356 | break; } 357 | case '2': { int x = va_arg(v,int); 358 | unsigned char b[2]; 359 | b[0] = STBIW_UCHAR(x); 360 | b[1] = STBIW_UCHAR(x>>8); 361 | s->func(s->context,b,2); 362 | break; } 363 | case '4': { stbiw_uint32 x = va_arg(v,int); 364 | unsigned char b[4]; 365 | b[0]=STBIW_UCHAR(x); 366 | b[1]=STBIW_UCHAR(x>>8); 367 | b[2]=STBIW_UCHAR(x>>16); 368 | b[3]=STBIW_UCHAR(x>>24); 369 | s->func(s->context,b,4); 370 | break; } 371 | default: 372 | STBIW_ASSERT(0); 373 | return; 374 | } 375 | } 376 | } 377 | 378 | static void stbiw__writef(stbi__write_context *s, const char *fmt, ...) 379 | { 380 | va_list v; 381 | va_start(v, fmt); 382 | stbiw__writefv(s, fmt, v); 383 | va_end(v); 384 | } 385 | 386 | static void stbiw__write_flush(stbi__write_context *s) 387 | { 388 | if (s->buf_used) { 389 | s->func(s->context, &s->buffer, s->buf_used); 390 | s->buf_used = 0; 391 | } 392 | } 393 | 394 | static void stbiw__putc(stbi__write_context *s, unsigned char c) 395 | { 396 | s->func(s->context, &c, 1); 397 | } 398 | 399 | static void stbiw__write1(stbi__write_context *s, unsigned char a) 400 | { 401 | if ((size_t)s->buf_used + 1 > sizeof(s->buffer)) 402 | stbiw__write_flush(s); 403 | s->buffer[s->buf_used++] = a; 404 | } 405 | 406 | static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c) 407 | { 408 | int n; 409 | if ((size_t)s->buf_used + 3 > sizeof(s->buffer)) 410 | stbiw__write_flush(s); 411 | n = s->buf_used; 412 | s->buf_used = n+3; 413 | s->buffer[n+0] = a; 414 | s->buffer[n+1] = b; 415 | s->buffer[n+2] = c; 416 | } 417 | 418 | static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d) 
419 | { 420 | unsigned char bg[3] = { 255, 0, 255}, px[3]; 421 | int k; 422 | 423 | if (write_alpha < 0) 424 | stbiw__write1(s, d[comp - 1]); 425 | 426 | switch (comp) { 427 | case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as 1-channel case 428 | case 1: 429 | if (expand_mono) 430 | stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp 431 | else 432 | stbiw__write1(s, d[0]); // monochrome TGA 433 | break; 434 | case 4: 435 | if (!write_alpha) { 436 | // composite against pink background 437 | for (k = 0; k < 3; ++k) 438 | px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255; 439 | stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]); 440 | break; 441 | } 442 | /* FALLTHROUGH */ 443 | case 3: 444 | stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]); 445 | break; 446 | } 447 | if (write_alpha > 0) 448 | stbiw__write1(s, d[comp - 1]); 449 | } 450 | 451 | static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono) 452 | { 453 | stbiw_uint32 zero = 0; 454 | int i,j, j_end; 455 | 456 | if (y <= 0) 457 | return; 458 | 459 | if (stbi__flip_vertically_on_write) 460 | vdir *= -1; 461 | 462 | if (vdir < 0) { 463 | j_end = -1; j = y-1; 464 | } else { 465 | j_end = y; j = 0; 466 | } 467 | 468 | for (; j != j_end; j += vdir) { 469 | for (i=0; i < x; ++i) { 470 | unsigned char *d = (unsigned char *) data + (j*x+i)*comp; 471 | stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d); 472 | } 473 | stbiw__write_flush(s); 474 | s->func(s->context, &zero, scanline_pad); 475 | } 476 | } 477 | 478 | static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...) 
{
   // Body of stbiw__outfile: reject negative dimensions, write the header
   // described by `fmt` and the variadic arguments, then write the pixel rows.
   // Returns 1 on success and 0 when a dimension is negative.
   if (y < 0 || x < 0) {
      return 0;
   } else {
      va_list v;
      va_start(v, fmt);
      stbiw__writefv(s, fmt, v);
      va_end(v);
      stbiw__write_pixels(s,rgb_dir,vdir,x,y,comp,data,alpha,pad, expand_mono);
      return 1;
   }
}

// Write a BMP through `s`. Inputs with comp != 4 become a 24-bit bitmap with
// a 40-byte info header and rows padded to a multiple of 4 bytes; comp == 4
// becomes a 32-bit bitmap with a 108-byte V4 header carrying channel masks.
// Returns the result of stbiw__outfile.
static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data)
{
   if (comp != 4) {
      // write RGB bitmap
      int pad = (-x*3) & 3;
      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
              "11 4 22 4" "4 44 22 444444",
              'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
               40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
   } else {
      // RGBA bitmaps need a v4 header
      // use BI_BITFIELDS mode with 32bpp and alpha mask
      // (straight BI_RGB with alpha mask doesn't work in most readers)
      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0,
         "11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444",
         'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header
         108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header
   }
}

// Callback variant of the BMP writer: output goes to `func(context, ...)`.
STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
{
   stbi__write_context s = { 0 };
   stbi__start_write_callbacks(&s, func, context);
   return stbi_write_bmp_core(&s, x, y, comp, data);
}

#ifndef STBI_WRITE_NO_STDIO
// File variant of the BMP writer. Returns 0 when the file cannot be opened,
// otherwise the result of stbi_write_bmp_core.
STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
{
   stbi__write_context s = { 0 };
   if (stbi__start_write_file(&s,filename)) {
      int r = stbi_write_bmp_core(&s, x, y, comp, data);
      stbi__end_write_file(&s);
      return r;
   } else
      return 0;
}
#endif //!STBI_WRITE_NO_STDIO

// Write a TGA through `s`. When the global stbi_write_tga_with_rle is zero
// the image is written uncompressed via stbiw__outfile; otherwise each row is
// emitted as packets of at most 128 pixels (literal packets and repeat
// packets). Returns 0 for negative dimensions; the RLE path returns 1.
static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data)
{
   int has_alpha = (comp == 2 || comp == 4);
   int colorbytes = has_alpha ? comp-1 : comp;
   int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3

   if (y < 0 || x < 0)
      return 0;

   if (!stbi_write_tga_with_rle) {
      return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0,
         "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
   } else {
      int i,j,k;
      int jend, jdir;

      // same header layout as above, with 8 added to the image-type field
      stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8);

      // rows go out last-to-first unless the global flip flag is set
      if (stbi__flip_vertically_on_write) {
         j = 0;
         jend = y;
         jdir = 1;
      } else {
         j = y-1;
         jend = -1;
         jdir = -1;
      }
      for (; j != jend; j += jdir) {
         unsigned char *row = (unsigned char *) data + j * x * comp;
         int len;

         // each iteration emits one packet covering `len` pixels from column i
         for (i = 0; i < x; i += len) {
            unsigned char *begin = row + i * comp;
            int diff = 1;   // non-zero: literal packet, zero: repeat packet
            len = 1;

            if (i < x - 1) {
               ++len;
               diff = memcmp(begin, row + (i + 1) * comp, comp);
               if (diff) {
                  // literal run: extend while consecutive pixels differ; when
                  // two equal pixels are found, give the first of them back
                  const unsigned char *prev = begin;
                  for (k = i + 2; k < x && len < 128; ++k) {
                     if (memcmp(prev, row + k * comp, comp)) {
                        prev += comp;
                        ++len;
                     } else {
                        --len;
                        break;
                     }
                  }
               } else {
                  // repeat run: extend while pixels equal the first one
                  for (k = i + 2; k < x && len < 128; ++k) {
                     if (!memcmp(begin, row + k * comp, comp)) {
                        ++len;
                     } else {
                        break;
                     }
                  }
               }
            }

            if (diff) {
               // literal packet: header byte is len-1, followed by len pixels
               unsigned char header = STBIW_UCHAR(len - 1);
               stbiw__write1(s, header);
               for (k = 0; k < len; ++k) {
                  stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
               }
            } else {
               // repeat packet: header byte is the low 8 bits of len-129
               // (i.e. 128 + len-1), followed by a single pixel
               unsigned char header = STBIW_UCHAR(len - 129);
               stbiw__write1(s, header);
               stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
            }
         }
      }
      stbiw__write_flush(s);
   }
   return 1;
}

// Callback variant of the TGA writer: output goes to `func(context, ...)`.
STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
{
   stbi__write_context s = { 0 };
   stbi__start_write_callbacks(&s, func, context);
   return stbi_write_tga_core(&s, x, y, comp, (void *) data);
}

#ifndef STBI_WRITE_NO_STDIO
// File variant of the TGA writer. Returns 0 when the file cannot be opened,
// otherwise the result of stbi_write_tga_core.
STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
{
   stbi__write_context s = { 0 };
   if (stbi__start_write_file(&s,filename)) {
      int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
      stbi__end_write_file(&s);
      return r;
   } else
      return 0;
}
#endif

// *************************************************************************************************
// Radiance RGBE HDR writer
// by Baldur Karlsson

#define stbiw__max(a, b)  ((a) > (b) ? (a) : (b))

#ifndef STBI_WRITE_NO_STDIO

// Convert three linear floats to four RGBE bytes: three mantissas scaled by a
// factor derived from the largest component, plus the frexp exponent biased
// by 128. A largest component below 1e-32 produces four zero bytes.
static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
{
   int exponent;
   float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));

   if (maxcomp < 1e-32f) {
      rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
   } else {
      float normalize = (float) frexp(maxcomp, &exponent) * 256.0f/maxcomp;

      rgbe[0] = (unsigned char)(linear[0] * normalize);
      rgbe[1] = (unsigned char)(linear[1] * normalize);
      rgbe[2] = (unsigned char)(linear[2] * normalize);
      rgbe[3] = (unsigned char)(exponent + 128);
   }
}

// Emit a run packet: one byte holding length+128, then the repeated byte.
// `length` must not exceed 127 (asserted).
static void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte)
{
   unsigned char lengthbyte = STBIW_UCHAR(length+128);
   STBIW_ASSERT(length+128 <= 255);
   s->func(s->context, &lengthbyte, 1);
   s->func(s->context, &databyte, 1);
}

static void
stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data) 665 | { 666 | unsigned char lengthbyte = STBIW_UCHAR(length); 667 | STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code 668 | s->func(s->context, &lengthbyte, 1); 669 | s->func(s->context, data, length); 670 | } 671 | 672 | static void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline) 673 | { 674 | unsigned char scanlineheader[4] = { 2, 2, 0, 0 }; 675 | unsigned char rgbe[4]; 676 | float linear[3]; 677 | int x; 678 | 679 | scanlineheader[2] = (width&0xff00)>>8; 680 | scanlineheader[3] = (width&0x00ff); 681 | 682 | /* skip RLE for images too small or large */ 683 | if (width < 8 || width >= 32768) { 684 | for (x=0; x < width; x++) { 685 | switch (ncomp) { 686 | case 4: /* fallthrough */ 687 | case 3: linear[2] = scanline[x*ncomp + 2]; 688 | linear[1] = scanline[x*ncomp + 1]; 689 | linear[0] = scanline[x*ncomp + 0]; 690 | break; 691 | default: 692 | linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0]; 693 | break; 694 | } 695 | stbiw__linear_to_rgbe(rgbe, linear); 696 | s->func(s->context, rgbe, 4); 697 | } 698 | } else { 699 | int c,r; 700 | /* encode into scratch buffer */ 701 | for (x=0; x < width; x++) { 702 | switch(ncomp) { 703 | case 4: /* fallthrough */ 704 | case 3: linear[2] = scanline[x*ncomp + 2]; 705 | linear[1] = scanline[x*ncomp + 1]; 706 | linear[0] = scanline[x*ncomp + 0]; 707 | break; 708 | default: 709 | linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0]; 710 | break; 711 | } 712 | stbiw__linear_to_rgbe(rgbe, linear); 713 | scratch[x + width*0] = rgbe[0]; 714 | scratch[x + width*1] = rgbe[1]; 715 | scratch[x + width*2] = rgbe[2]; 716 | scratch[x + width*3] = rgbe[3]; 717 | } 718 | 719 | s->func(s->context, scanlineheader, 4); 720 | 721 | /* RLE each component separately */ 722 | for (c=0; c < 4; c++) { 723 | unsigned char *comp = &scratch[width*c]; 724 | 725 
| x = 0; 726 | while (x < width) { 727 | // find first run 728 | r = x; 729 | while (r+2 < width) { 730 | if (comp[r] == comp[r+1] && comp[r] == comp[r+2]) 731 | break; 732 | ++r; 733 | } 734 | if (r+2 >= width) 735 | r = width; 736 | // dump up to first run 737 | while (x < r) { 738 | int len = r-x; 739 | if (len > 128) len = 128; 740 | stbiw__write_dump_data(s, len, &comp[x]); 741 | x += len; 742 | } 743 | // if there's a run, output it 744 | if (r+2 < width) { // same test as what we break out of in search loop, so only true if we break'd 745 | // find next byte after run 746 | while (r < width && comp[r] == comp[x]) 747 | ++r; 748 | // output run up to r 749 | while (x < r) { 750 | int len = r-x; 751 | if (len > 127) len = 127; 752 | stbiw__write_run_data(s, len, comp[x]); 753 | x += len; 754 | } 755 | } 756 | } 757 | } 758 | } 759 | } 760 | 761 | static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data) 762 | { 763 | if (y <= 0 || x <= 0 || data == NULL) 764 | return 0; 765 | else { 766 | // Each component is stored separately. Allocate scratch space for full output scanline. 767 | unsigned char *scratch = (unsigned char *) STBIW_MALLOC(x*4); 768 | int i, len; 769 | char buffer[128]; 770 | char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n"; 771 | s->func(s->context, header, sizeof(header)-1); 772 | 773 | #ifdef __STDC_LIB_EXT1__ 774 | len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x); 775 | #else 776 | len = sprintf(buffer, "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x); 777 | #endif 778 | s->func(s->context, buffer, len); 779 | 780 | for(i=0; i < y; i++) 781 | stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*x*(stbi__flip_vertically_on_write ? 
y-1-i : i)); 782 | STBIW_FREE(scratch); 783 | return 1; 784 | } 785 | } 786 | 787 | STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data) 788 | { 789 | stbi__write_context s = { 0 }; 790 | stbi__start_write_callbacks(&s, func, context); 791 | return stbi_write_hdr_core(&s, x, y, comp, (float *) data); 792 | } 793 | 794 | STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data) 795 | { 796 | stbi__write_context s = { 0 }; 797 | if (stbi__start_write_file(&s,filename)) { 798 | int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data); 799 | stbi__end_write_file(&s); 800 | return r; 801 | } else 802 | return 0; 803 | } 804 | #endif // STBI_WRITE_NO_STDIO 805 | 806 | 807 | ////////////////////////////////////////////////////////////////////////////// 808 | // 809 | // PNG writer 810 | // 811 | 812 | #ifndef STBIW_ZLIB_COMPRESS 813 | // stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size() 814 | #define stbiw__sbraw(a) ((int *) (void *) (a) - 2) 815 | #define stbiw__sbm(a) stbiw__sbraw(a)[0] 816 | #define stbiw__sbn(a) stbiw__sbraw(a)[1] 817 | 818 | #define stbiw__sbneedgrow(a,n) ((a)==0 || stbiw__sbn(a)+n >= stbiw__sbm(a)) 819 | #define stbiw__sbmaybegrow(a,n) (stbiw__sbneedgrow(a,(n)) ? stbiw__sbgrow(a,n) : 0) 820 | #define stbiw__sbgrow(a,n) stbiw__sbgrowf((void **) &(a), (n), sizeof(*(a))) 821 | 822 | #define stbiw__sbpush(a, v) (stbiw__sbmaybegrow(a,1), (a)[stbiw__sbn(a)++] = (v)) 823 | #define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0) 824 | #define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)),0 : 0) 825 | 826 | static void *stbiw__sbgrowf(void **arr, int increment, int itemsize) 827 | { 828 | int m = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1; 829 | void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? 
(stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2);
   STBIW_ASSERT(p);
   if (p) {
      // a freshly allocated buffer starts with an element count of zero
      if (!*arr) ((int *) p)[1] = 0;
      *arr = (void *) ((int *) p + 2);
      stbiw__sbm(*arr) = m;
   }
   // when the reallocation failed, *arr is returned unchanged
   return *arr;
}

// Move every complete byte from the bit accumulator into the stretchy buffer
// `data`, low byte first. Returns the buffer pointer, which may have moved.
static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount)
{
   while (*bitcount >= 8) {
      stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
      *bitbuffer >>= 8;
      *bitcount -= 8;
   }
   return data;
}

// Return the low `codebits` bits of `code` in reversed bit order.
static int stbiw__zlib_bitrev(int code, int codebits)
{
   int res=0;
   while (codebits--) {
      res = (res << 1) | (code & 1);
      code >>= 1;
   }
   return res;
}

// Count how many leading bytes of `a` and `b` are equal, looking at no more
// than `limit` bytes and never reporting more than 258.
static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, int limit)
{
   int i;
   for (i=0; i < limit && i < 258; ++i)
      if (a[i] != b[i]) break;
   return i;
}

// Hash the three bytes at `data`; callers mask the result to the table size.
static unsigned int stbiw__zhash(unsigned char *data)
{
   stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16);
   hash ^= hash << 3;
   hash += hash >> 5;
   hash ^= hash << 4;
   hash += hash >> 17;
   hash ^= hash << 25;
   hash += hash >> 6;
   return hash;
}

// The macros below operate on the locals `out`, `bitbuf` and `bitcount` of
// the function that uses them (stbi_zlib_compress).
#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))
#define stbiw__zlib_add(code,codebits) \
      (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
#define stbiw__zlib_huffa(b,c)  stbiw__zlib_add(stbiw__zlib_bitrev(b,c),c)
// default huffman tables
#define stbiw__zlib_huff1(n)  stbiw__zlib_huffa(0x30 + (n), 8)
#define stbiw__zlib_huff2(n)  stbiw__zlib_huffa(0x190 + (n)-144, 9)
#define stbiw__zlib_huff3(n)  stbiw__zlib_huffa(0 + (n)-256,7)
#define stbiw__zlib_huff4(n)  stbiw__zlib_huffa(0xc0 + (n)-280,8)
#define stbiw__zlib_huff(n)  ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n))
#define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))

// number of hash buckets; used as a mask, so it must stay a power of two
#define stbiw__ZHASH   16384

#endif // STBIW_ZLIB_COMPRESS

// Compress `data_len` bytes at `data` into a zlib stream.
// Delegates to STBIW_ZLIB_COMPRESS when that macro is defined. Otherwise the
// builtin encoder writes a single fixed-Huffman block, and falls back to
// stored (uncompressed) blocks when those would be smaller.
//   quality : values below 5 are raised to 5; a hash bucket is halved once it
//             holds 2*quality entries
//   out_len : receives the size of the returned stream
// Returns a buffer the caller releases with STBIW_FREE, or NULL when the hash
// table cannot be allocated.
STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
{
#ifdef STBIW_ZLIB_COMPRESS
   // user provided a zlib compress implementation, use that
   return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
#else // use builtin
   // base values and extra-bit counts for the length and distance codes;
   // the last entry of lengthc/distc only terminates the search loops below
   static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 };
   static unsigned char  lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0 };
   static unsigned short distc[]   = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 };
   static unsigned char  disteb[]  = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };
   unsigned int bitbuf=0;
   int i,j, bitcount=0;
   unsigned char *out = NULL;
   // each bucket is a stretchy buffer of pointers into `data`
   unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char**));
   if (hash_table == NULL)
      return NULL;
   if (quality < 5) quality = 5;

   stbiw__sbpush(out, 0x78);   // DEFLATE 32K window
   stbiw__sbpush(out, 0x5e);   // FLEVEL = 1
   stbiw__zlib_add(1,1);  // BFINAL = 1
   stbiw__zlib_add(1,2);  // BTYPE = 1 -- fixed huffman

   for (i=0; i < stbiw__ZHASH; ++i)
      hash_table[i] = NULL;

   i=0;
   while (i < data_len-3) {
      // hash next 3 bytes of data to be compressed
      int h = stbiw__zhash(data+i)&(stbiw__ZHASH-1), best=3;
      unsigned char *bestloc = 0;
      unsigned char **hlist = hash_table[h];
      int n = stbiw__sbcount(hlist);
      for (j=0; j < n; ++j) {
         if (hlist[j]-data > i-32768) { // if entry lies within window
            int d = stbiw__zlib_countm(hlist[j], data+i, data_len-i);
            if (d >= best) { best=d; bestloc=hlist[j]; }
         }
      }
      // when hash table entry is too long, delete half the entries
      if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2*quality) {
         STBIW_MEMMOVE(hash_table[h], hash_table[h]+quality, sizeof(hash_table[h][0])*quality);
         stbiw__sbn(hash_table[h]) = quality;
      }
      stbiw__sbpush(hash_table[h],data+i);

      if (bestloc) {
         // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal
         h = stbiw__zhash(data+i+1)&(stbiw__ZHASH-1);
         hlist = hash_table[h];
         n = stbiw__sbcount(hlist);
         for (j=0; j < n; ++j) {
            if (hlist[j]-data > i-32767) {
               int e = stbiw__zlib_countm(hlist[j], data+i+1, data_len-i-1);
               if (e > best) { // if next match is better, bail on current match
                  bestloc = NULL;
                  break;
               }
            }
         }
      }

      if (bestloc) {
         int d = (int) (data+i - bestloc); // distance back
         STBIW_ASSERT(d <= 32767 && best <= 258);
         // find the length code whose range contains `best`, emit it and its extra bits
         for (j=0; best > lengthc[j+1]-1; ++j);
         stbiw__zlib_huff(j+257);
         if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
         // same for the distance: a 5-bit code followed by its extra bits
         for (j=0; d > distc[j+1]-1; ++j);
         stbiw__zlib_add(stbiw__zlib_bitrev(j,5),5);
         if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]);
         i += best;
      } else {
         stbiw__zlib_huffb(data[i]);
         ++i;
      }
   }
   // write out final bytes
   for (;i < data_len; ++i)
      stbiw__zlib_huffb(data[i]);
   stbiw__zlib_huff(256); // end of block
   // pad with 0 bits to byte boundary
   while (bitcount)
      stbiw__zlib_add(0,1);

   for (i=0; i < stbiw__ZHASH; ++i)
      (void) stbiw__sbfree(hash_table[i]);
   STBIW_FREE(hash_table);

   // store uncompressed instead if compression was worse
   if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) {
      stbiw__sbn(out) = 2;  // truncate to DEFLATE 32K window and FLEVEL = 1
      for (j = 0; j < data_len;) {
         int blocklen = data_len - j;
         if (blocklen > 32767) blocklen = 32767;
         stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression
         stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN
         stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8));
         stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN
         stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8));
         // the payload is copied without growing the buffer: this branch only
         // runs when the buffer already held more bytes than the stored form needs
         memcpy(out+stbiw__sbn(out), data+j, blocklen);
         stbiw__sbn(out) += blocklen;
         j += blocklen;
      }
   }

   {
      // compute adler32 on input
      unsigned int s1=1, s2=0;
      // the first chunk takes the remainder so that all later chunks are
      // exactly 5552 bytes; the sums are reduced mod 65521 after every chunk
      int blocklen = (int) (data_len % 5552);
      j=0;
      while (j < data_len) {
         for (i=0; i < blocklen; ++i) { s1 += data[j+i]; s2 += s1; }
         s1 %= 65521; s2 %= 65521;
         j += blocklen;
         blocklen = 5552;
      }
      // checksum is appended as s2 then s1, high byte first
      stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
      stbiw__sbpush(out, STBIW_UCHAR(s2));
      stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
      stbiw__sbpush(out, STBIW_UCHAR(s1));
   }
   *out_len = stbiw__sbn(out);
   // make returned pointer freeable
   // (slides the payload down over the two-int stretchy-buffer header)
   STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
   return (unsigned char *) stbiw__sbraw(out);
#endif // STBIW_ZLIB_COMPRESS
}

// CRC of `len` bytes at `buffer`. Delegates to STBIW_CRC32 when that macro is
// defined; otherwise computed one byte at a time through a 256-entry table,
// starting from ~0 and returning the complemented register.
static unsigned int stbiw__crc32(unsigned char *buffer, int len)
{
#ifdef STBIW_CRC32
    return STBIW_CRC32(buffer, len);
#else
   static unsigned int crc_table[256] =
   {
      0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
      0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
      0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
      0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
      0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
      0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
      0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
      0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
      0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
      0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
      0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
      0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
      0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
      0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
      0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
      0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
      0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
      0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
      0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
      0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
      0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
      0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
      0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
      0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
      0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
      0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
      0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
      0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
      0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
      0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
      0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
      0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
   };

   unsigned int crc = ~0u;
   int i;
   for (i=0; i < len; ++i)
      crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
   return ~crc;
#endif
}

// Store four bytes at `o` and advance `o` by 4; stbiw__wp32 stores a 32-bit
// value high byte first, stbiw__wptag stores the first four chars of `s`.
#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4)
#define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v));
#define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3])

// Append the CRC of the len+4 bytes that end at *data (the 4 bytes before the
// `len` payload bytes are included), advancing *data by 4.
static void stbiw__wpcrc(unsigned char **data, int len)
{
   unsigned int crc = stbiw__crc32(*data - len - 4, len+4);
   stbiw__wp32(*data, crc);
}

// Paeth predictor: of a, b and c, return the one closest to a + b - c,
// preferring a, then b, on ties.
static unsigned char stbiw__paeth(int a, int b, int c)
{
   int p = a + b - c, pa = abs(p-a), pb = abs(p-b), pc = abs(p-c);
   if (pa <= pb && pa <= pc) return STBIW_UCHAR(a);
   if (pb <= pc) return STBIW_UCHAR(b);
   return STBIW_UCHAR(c);
}

// @OPTIMIZE: provide an option that always forces left-predict or paeth
predict 1092 | static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer) 1093 | { 1094 | static int mapping[] = { 0,1,2,3,4 }; 1095 | static int firstmap[] = { 0,1,0,5,6 }; 1096 | int *mymap = (y != 0) ? mapping : firstmap; 1097 | int i; 1098 | int type = mymap[filter_type]; 1099 | unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height-1-y : y); 1100 | int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes; 1101 | 1102 | if (type==0) { 1103 | memcpy(line_buffer, z, width*n); 1104 | return; 1105 | } 1106 | 1107 | // first loop isn't optimized since it's just one pixel 1108 | for (i = 0; i < n; ++i) { 1109 | switch (type) { 1110 | case 1: line_buffer[i] = z[i]; break; 1111 | case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break; 1112 | case 3: line_buffer[i] = z[i] - (z[i-signed_stride]>>1); break; 1113 | case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-signed_stride],0)); break; 1114 | case 5: line_buffer[i] = z[i]; break; 1115 | case 6: line_buffer[i] = z[i]; break; 1116 | } 1117 | } 1118 | switch (type) { 1119 | case 1: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-n]; break; 1120 | case 2: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - z[i-signed_stride]; break; 1121 | case 3: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - ((z[i-n] + z[i-signed_stride])>>1); break; 1122 | case 4: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-signed_stride], z[i-signed_stride-n]); break; 1123 | case 5: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - (z[i-n]>>1); break; 1124 | case 6: for (i=n; i < width*n; ++i) line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break; 1125 | } 1126 | } 1127 | 1128 | STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len) 1129 | { 1130 | int 
force_filter = stbi_write_force_png_filter; 1131 | int ctype[5] = { -1, 0, 4, 2, 6 }; 1132 | unsigned char sig[8] = { 137,80,78,71,13,10,26,10 }; 1133 | unsigned char *out,*o, *filt, *zlib; 1134 | signed char *line_buffer; 1135 | int j,zlen; 1136 | 1137 | if (stride_bytes == 0) 1138 | stride_bytes = x * n; 1139 | 1140 | if (force_filter >= 5) { 1141 | force_filter = -1; 1142 | } 1143 | 1144 | filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0; 1145 | line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; } 1146 | for (j=0; j < y; ++j) { 1147 | int filter_type; 1148 | if (force_filter > -1) { 1149 | filter_type = force_filter; 1150 | stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, force_filter, line_buffer); 1151 | } else { // Estimate the best filter by running through all of them: 1152 | int best_filter = 0, best_filter_val = 0x7fffffff, est, i; 1153 | for (filter_type = 0; filter_type < 5; filter_type++) { 1154 | stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, filter_type, line_buffer); 1155 | 1156 | // Estimate the entropy of the line using this filter; the less, the better. 
1157 | est = 0; 1158 | for (i = 0; i < x*n; ++i) { 1159 | est += abs((signed char) line_buffer[i]); 1160 | } 1161 | if (est < best_filter_val) { 1162 | best_filter_val = est; 1163 | best_filter = filter_type; 1164 | } 1165 | } 1166 | if (filter_type != best_filter) { // If the last iteration already got us the best filter, don't redo it 1167 | stbiw__encode_png_line((unsigned char*)(pixels), stride_bytes, x, y, j, n, best_filter, line_buffer); 1168 | filter_type = best_filter; 1169 | } 1170 | } 1171 | // when we get here, filter_type contains the filter type, and line_buffer contains the data 1172 | filt[j*(x*n+1)] = (unsigned char) filter_type; 1173 | STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n); 1174 | } 1175 | STBIW_FREE(line_buffer); 1176 | zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, stbi_write_png_compression_level); 1177 | STBIW_FREE(filt); 1178 | if (!zlib) return 0; 1179 | 1180 | // each tag requires 12 bytes of overhead 1181 | out = (unsigned char *) STBIW_MALLOC(8 + 12+13 + 12+zlen + 12); 1182 | if (!out) return 0; 1183 | *out_len = 8 + 12+13 + 12+zlen + 12; 1184 | 1185 | o=out; 1186 | STBIW_MEMMOVE(o,sig,8); o+= 8; 1187 | stbiw__wp32(o, 13); // header length 1188 | stbiw__wptag(o, "IHDR"); 1189 | stbiw__wp32(o, x); 1190 | stbiw__wp32(o, y); 1191 | *o++ = 8; 1192 | *o++ = STBIW_UCHAR(ctype[n]); 1193 | *o++ = 0; 1194 | *o++ = 0; 1195 | *o++ = 0; 1196 | stbiw__wpcrc(&o,13); 1197 | 1198 | stbiw__wp32(o, zlen); 1199 | stbiw__wptag(o, "IDAT"); 1200 | STBIW_MEMMOVE(o, zlib, zlen); 1201 | o += zlen; 1202 | STBIW_FREE(zlib); 1203 | stbiw__wpcrc(&o, zlen); 1204 | 1205 | stbiw__wp32(o,0); 1206 | stbiw__wptag(o, "IEND"); 1207 | stbiw__wpcrc(&o,0); 1208 | 1209 | STBIW_ASSERT(o == out + *out_len); 1210 | 1211 | return out; 1212 | } 1213 | 1214 | #ifndef STBI_WRITE_NO_STDIO 1215 | STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes) 1216 | { 1217 | FILE *f; 1218 | int len; 1219 | unsigned char *png = 
stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len); 1220 | if (png == NULL) return 0; 1221 | 1222 | f = stbiw__fopen(filename, "wb"); 1223 | if (!f) { STBIW_FREE(png); return 0; } 1224 | fwrite(png, 1, len, f); 1225 | fclose(f); 1226 | STBIW_FREE(png); 1227 | return 1; 1228 | } 1229 | #endif 1230 | 1231 | STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes) 1232 | { 1233 | int len; 1234 | unsigned char *png = stbi_write_png_to_mem((const unsigned char *) data, stride_bytes, x, y, comp, &len); 1235 | if (png == NULL) return 0; 1236 | func(context, png, len); 1237 | STBIW_FREE(png); 1238 | return 1; 1239 | } 1240 | 1241 | 1242 | /* *************************************************************************** 1243 | * 1244 | * JPEG writer 1245 | * 1246 | * This is based on Jon Olick's jo_jpeg.cpp: 1247 | * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html 1248 | */ 1249 | 1250 | static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18, 1251 | 24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 }; 1252 | 1253 | static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs) { 1254 | int bitBuf = *bitBufP, bitCnt = *bitCntP; 1255 | bitCnt += bs[1]; 1256 | bitBuf |= bs[0] << (24 - bitCnt); 1257 | while(bitCnt >= 8) { 1258 | unsigned char c = (bitBuf >> 16) & 255; 1259 | stbiw__putc(s, c); 1260 | if(c == 255) { 1261 | stbiw__putc(s, 0); 1262 | } 1263 | bitBuf <<= 8; 1264 | bitCnt -= 8; 1265 | } 1266 | *bitBufP = bitBuf; 1267 | *bitCntP = bitCnt; 1268 | } 1269 | 1270 | static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p) { 1271 | float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = 
*d5p, d6 = *d6p, d7 = *d7p; 1272 | float z1, z2, z3, z4, z5, z11, z13; 1273 | 1274 | float tmp0 = d0 + d7; 1275 | float tmp7 = d0 - d7; 1276 | float tmp1 = d1 + d6; 1277 | float tmp6 = d1 - d6; 1278 | float tmp2 = d2 + d5; 1279 | float tmp5 = d2 - d5; 1280 | float tmp3 = d3 + d4; 1281 | float tmp4 = d3 - d4; 1282 | 1283 | // Even part 1284 | float tmp10 = tmp0 + tmp3; // phase 2 1285 | float tmp13 = tmp0 - tmp3; 1286 | float tmp11 = tmp1 + tmp2; 1287 | float tmp12 = tmp1 - tmp2; 1288 | 1289 | d0 = tmp10 + tmp11; // phase 3 1290 | d4 = tmp10 - tmp11; 1291 | 1292 | z1 = (tmp12 + tmp13) * 0.707106781f; // c4 1293 | d2 = tmp13 + z1; // phase 5 1294 | d6 = tmp13 - z1; 1295 | 1296 | // Odd part 1297 | tmp10 = tmp4 + tmp5; // phase 2 1298 | tmp11 = tmp5 + tmp6; 1299 | tmp12 = tmp6 + tmp7; 1300 | 1301 | // The rotator is modified from fig 4-8 to avoid extra negations. 1302 | z5 = (tmp10 - tmp12) * 0.382683433f; // c6 1303 | z2 = tmp10 * 0.541196100f + z5; // c2-c6 1304 | z4 = tmp12 * 1.306562965f + z5; // c2+c6 1305 | z3 = tmp11 * 0.707106781f; // c4 1306 | 1307 | z11 = tmp7 + z3; // phase 5 1308 | z13 = tmp7 - z3; 1309 | 1310 | *d5p = z13 + z2; // phase 6 1311 | *d3p = z13 - z2; 1312 | *d1p = z11 + z4; 1313 | *d7p = z11 - z4; 1314 | 1315 | *d0p = d0; *d2p = d2; *d4p = d4; *d6p = d6; 1316 | } 1317 | 1318 | static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) { 1319 | int tmp1 = val < 0 ? -val : val; 1320 | val = val < 0 ? 
val-1 : val; 1321 | bits[1] = 1; 1322 | while(tmp1 >>= 1) { 1323 | ++bits[1]; 1324 | } 1325 | bits[0] = val & ((1<0)&&(DU[end0pos]==0); --end0pos) { 1368 | } 1369 | // end0pos = first element in reverse order !=0 1370 | if(end0pos == 0) { 1371 | stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB); 1372 | return DU[0]; 1373 | } 1374 | for(i = 1; i <= end0pos; ++i) { 1375 | int startpos = i; 1376 | int nrzeroes; 1377 | unsigned short bits[2]; 1378 | for (; DU[i]==0 && i<=end0pos; ++i) { 1379 | } 1380 | nrzeroes = i-startpos; 1381 | if ( nrzeroes >= 16 ) { 1382 | int lng = nrzeroes>>4; 1383 | int nrmarker; 1384 | for (nrmarker=1; nrmarker <= lng; ++nrmarker) 1385 | stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes); 1386 | nrzeroes &= 15; 1387 | } 1388 | stbiw__jpg_calcBits(DU[i], bits); 1389 | stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes<<4)+bits[1]]); 1390 | stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits); 1391 | } 1392 | if(end0pos != 63) { 1393 | stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB); 1394 | } 1395 | return DU[0]; 1396 | } 1397 | 1398 | static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) { 1399 | // Constants that don't pollute global namespace 1400 | static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0}; 1401 | static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11}; 1402 | static const unsigned char std_ac_luminance_nrcodes[] = {0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d}; 1403 | static const unsigned char std_ac_luminance_values[] = { 1404 | 0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08, 1405 | 0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28, 1406 | 0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59, 1407 | 
0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89, 1408 | 0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6, 1409 | 0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2, 1410 | 0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa 1411 | }; 1412 | static const unsigned char std_dc_chrominance_nrcodes[] = {0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0}; 1413 | static const unsigned char std_dc_chrominance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11}; 1414 | static const unsigned char std_ac_chrominance_nrcodes[] = {0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77}; 1415 | static const unsigned char std_ac_chrominance_values[] = { 1416 | 0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91, 1417 | 0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26, 1418 | 0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58, 1419 | 0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87, 1420 | 0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4, 1421 | 0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda, 1422 | 0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa 1423 | }; 1424 | // Huffman tables 1425 | static const unsigned short YDC_HT[256][2] = { {0,2},{2,3},{3,3},{4,3},{5,3},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9}}; 1426 | static const unsigned short UVDC_HT[256][2] = { 
{0,2},{1,2},{2,2},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9},{1022,10},{2046,11}}; 1427 | static const unsigned short YAC_HT[256][2] = { 1428 | {10,4},{0,2},{1,2},{4,3},{11,4},{26,5},{120,7},{248,8},{1014,10},{65410,16},{65411,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1429 | {12,4},{27,5},{121,7},{502,9},{2038,11},{65412,16},{65413,16},{65414,16},{65415,16},{65416,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1430 | {28,5},{249,8},{1015,10},{4084,12},{65417,16},{65418,16},{65419,16},{65420,16},{65421,16},{65422,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1431 | {58,6},{503,9},{4085,12},{65423,16},{65424,16},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1432 | {59,6},{1016,10},{65430,16},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1433 | {122,7},{2039,11},{65438,16},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1434 | {123,7},{4086,12},{65446,16},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1435 | {250,8},{4087,12},{65454,16},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1436 | {504,9},{32704,15},{65462,16},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1437 | {505,9},{65470,16},{65471,16},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1438 | {506,9},{65479,16},{65480,16},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1439 | {1017,10},{65488,16},{65489,16},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1440 | 
{1018,10},{65497,16},{65498,16},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1441 | {2040,11},{65506,16},{65507,16},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1442 | {65515,16},{65516,16},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{0,0},{0,0},{0,0},{0,0},{0,0}, 1443 | {2041,11},{65525,16},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0} 1444 | }; 1445 | static const unsigned short UVAC_HT[256][2] = { 1446 | {0,2},{1,2},{4,3},{10,4},{24,5},{25,5},{56,6},{120,7},{500,9},{1014,10},{4084,12},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1447 | {11,4},{57,6},{246,8},{501,9},{2038,11},{4085,12},{65416,16},{65417,16},{65418,16},{65419,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1448 | {26,5},{247,8},{1015,10},{4086,12},{32706,15},{65420,16},{65421,16},{65422,16},{65423,16},{65424,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1449 | {27,5},{248,8},{1016,10},{4087,12},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{65430,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1450 | {58,6},{502,9},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{65438,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1451 | {59,6},{1017,10},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{65446,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1452 | {121,7},{2039,11},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{65454,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1453 | {122,7},{2040,11},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{65462,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1454 | {249,8},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{65470,16},{65471,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1455 | 
{503,9},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{65479,16},{65480,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1456 | {504,9},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{65488,16},{65489,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1457 | {505,9},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{65497,16},{65498,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1458 | {506,9},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{65506,16},{65507,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1459 | {2041,11},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{65515,16},{65516,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}, 1460 | {16352,14},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{65525,16},{0,0},{0,0},{0,0},{0,0},{0,0}, 1461 | {1018,10},{32707,15},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0} 1462 | }; 1463 | static const int YQT[] = {16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22, 1464 | 37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99}; 1465 | static const int UVQT[] = {17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99, 1466 | 99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99}; 1467 | static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f, 1468 | 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f }; 1469 | 1470 | int row, col, i, k, subsample; 1471 | float fdtbl_Y[64], fdtbl_UV[64]; 1472 | unsigned char YTable[64], UVTable[64]; 1473 | 1474 | if(!data || !width || !height || comp > 4 || 
comp < 1) { 1475 | return 0; 1476 | } 1477 | 1478 | quality = quality ? quality : 90; 1479 | subsample = quality <= 90 ? 1 : 0; 1480 | quality = quality < 1 ? 1 : quality > 100 ? 100 : quality; 1481 | quality = quality < 50 ? 5000 / quality : 200 - quality * 2; 1482 | 1483 | for(i = 0; i < 64; ++i) { 1484 | int uvti, yti = (YQT[i]*quality+50)/100; 1485 | YTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (yti < 1 ? 1 : yti > 255 ? 255 : yti); 1486 | uvti = (UVQT[i]*quality+50)/100; 1487 | UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (uvti < 1 ? 1 : uvti > 255 ? 255 : uvti); 1488 | } 1489 | 1490 | for(row = 0, k = 0; row < 8; ++row) { 1491 | for(col = 0; col < 8; ++col, ++k) { 1492 | fdtbl_Y[k] = 1 / (YTable [stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]); 1493 | fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]); 1494 | } 1495 | } 1496 | 1497 | // Write Headers 1498 | { 1499 | static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 }; 1500 | static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 }; 1501 | const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width), 1502 | 3,1,(unsigned char)(subsample?0x22:0x11),0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 }; 1503 | s->func(s->context, (void*)head0, sizeof(head0)); 1504 | s->func(s->context, (void*)YTable, sizeof(YTable)); 1505 | stbiw__putc(s, 1); 1506 | s->func(s->context, UVTable, sizeof(UVTable)); 1507 | s->func(s->context, (void*)head1, sizeof(head1)); 1508 | s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1); 1509 | s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values)); 1510 | stbiw__putc(s, 0x10); // HTYACinfo 1511 | s->func(s->context, (void*)(std_ac_luminance_nrcodes+1), sizeof(std_ac_luminance_nrcodes)-1); 1512 | s->func(s->context, 
(void*)std_ac_luminance_values, sizeof(std_ac_luminance_values)); 1513 | stbiw__putc(s, 1); // HTUDCinfo 1514 | s->func(s->context, (void*)(std_dc_chrominance_nrcodes+1), sizeof(std_dc_chrominance_nrcodes)-1); 1515 | s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values)); 1516 | stbiw__putc(s, 0x11); // HTUACinfo 1517 | s->func(s->context, (void*)(std_ac_chrominance_nrcodes+1), sizeof(std_ac_chrominance_nrcodes)-1); 1518 | s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values)); 1519 | s->func(s->context, (void*)head2, sizeof(head2)); 1520 | } 1521 | 1522 | // Encode 8x8 macroblocks 1523 | { 1524 | static const unsigned short fillBits[] = {0x7F, 7}; 1525 | int DCY=0, DCU=0, DCV=0; 1526 | int bitBuf=0, bitCnt=0; 1527 | // comp == 2 is grey+alpha (alpha is ignored) 1528 | int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0; 1529 | const unsigned char *dataR = (const unsigned char *)data; 1530 | const unsigned char *dataG = dataR + ofsG; 1531 | const unsigned char *dataB = dataR + ofsB; 1532 | int x, y, pos; 1533 | if(subsample) { 1534 | for(y = 0; y < height; y += 16) { 1535 | for(x = 0; x < width; x += 16) { 1536 | float Y[256], U[256], V[256]; 1537 | for(row = y, pos = 0; row < y+16; ++row) { 1538 | // row >= height => use last input row 1539 | int clamped_row = (row < height) ? row : height - 1; 1540 | int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp; 1541 | for(col = x; col < x+16; ++col, ++pos) { 1542 | // if col >= width => use pixel from last input column 1543 | int p = base_p + ((col < width) ? 
col : (width-1))*comp; 1544 | float r = dataR[p], g = dataG[p], b = dataB[p]; 1545 | Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128; 1546 | U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b; 1547 | V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b; 1548 | } 1549 | } 1550 | DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+0, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); 1551 | DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+8, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); 1552 | DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+128, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); 1553 | DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y+136, 16, fdtbl_Y, DCY, YDC_HT, YAC_HT); 1554 | 1555 | // subsample U,V 1556 | { 1557 | float subU[64], subV[64]; 1558 | int yy, xx; 1559 | for(yy = 0, pos = 0; yy < 8; ++yy) { 1560 | for(xx = 0; xx < 8; ++xx, ++pos) { 1561 | int j = yy*32+xx*2; 1562 | subU[pos] = (U[j+0] + U[j+1] + U[j+16] + U[j+17]) * 0.25f; 1563 | subV[pos] = (V[j+0] + V[j+1] + V[j+16] + V[j+17]) * 0.25f; 1564 | } 1565 | } 1566 | DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subU, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT); 1567 | DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subV, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT); 1568 | } 1569 | } 1570 | } 1571 | } else { 1572 | for(y = 0; y < height; y += 8) { 1573 | for(x = 0; x < width; x += 8) { 1574 | float Y[64], U[64], V[64]; 1575 | for(row = y, pos = 0; row < y+8; ++row) { 1576 | // row >= height => use last input row 1577 | int clamped_row = (row < height) ? row : height - 1; 1578 | int base_p = (stbi__flip_vertically_on_write ? (height-1-clamped_row) : clamped_row)*width*comp; 1579 | for(col = x; col < x+8; ++col, ++pos) { 1580 | // if col >= width => use pixel from last input column 1581 | int p = base_p + ((col < width) ? 
col : (width-1))*comp; 1582 | float r = dataR[p], g = dataG[p], b = dataB[p]; 1583 | Y[pos]= +0.29900f*r + 0.58700f*g + 0.11400f*b - 128; 1584 | U[pos]= -0.16874f*r - 0.33126f*g + 0.50000f*b; 1585 | V[pos]= +0.50000f*r - 0.41869f*g - 0.08131f*b; 1586 | } 1587 | } 1588 | 1589 | DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y, 8, fdtbl_Y, DCY, YDC_HT, YAC_HT); 1590 | DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, U, 8, fdtbl_UV, DCU, UVDC_HT, UVAC_HT); 1591 | DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, V, 8, fdtbl_UV, DCV, UVDC_HT, UVAC_HT); 1592 | } 1593 | } 1594 | } 1595 | 1596 | // Do the bit alignment of the EOI marker 1597 | stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits); 1598 | } 1599 | 1600 | // EOI 1601 | stbiw__putc(s, 0xFF); 1602 | stbiw__putc(s, 0xD9); 1603 | 1604 | return 1; 1605 | } 1606 | 1607 | STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality) 1608 | { 1609 | stbi__write_context s = { 0 }; 1610 | stbi__start_write_callbacks(&s, func, context); 1611 | return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality); 1612 | } 1613 | 1614 | 1615 | #ifndef STBI_WRITE_NO_STDIO 1616 | STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality) 1617 | { 1618 | stbi__write_context s = { 0 }; 1619 | if (stbi__start_write_file(&s,filename)) { 1620 | int r = stbi_write_jpg_core(&s, x, y, comp, data, quality); 1621 | stbi__end_write_file(&s); 1622 | return r; 1623 | } else 1624 | return 0; 1625 | } 1626 | #endif 1627 | 1628 | #endif // STB_IMAGE_WRITE_IMPLEMENTATION 1629 | 1630 | /* Revision history 1631 | 1.16 (2021-07-11) 1632 | make Deflate code emit uncompressed blocks when it would otherwise expand 1633 | support writing BMPs with alpha channel 1634 | 1.15 (2020-07-13) unknown 1635 | 1.14 (2020-02-02) updated JPEG writer to downsample chroma channels 1636 | 1.13 1637 | 1.12 1638 | 1.11 (2019-08-11) 1639 | 1640 | 1.10 
(2019-02-07) 1641 | support utf8 filenames in Windows; fix warnings and platform ifdefs 1642 | 1.09 (2018-02-11) 1643 | fix typo in zlib quality API, improve STB_I_W_STATIC in C++ 1644 | 1.08 (2018-01-29) 1645 | add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter 1646 | 1.07 (2017-07-24) 1647 | doc fix 1648 | 1.06 (2017-07-23) 1649 | writing JPEG (using Jon Olick's code) 1650 | 1.05 ??? 1651 | 1.04 (2017-03-03) 1652 | monochrome BMP expansion 1653 | 1.03 ??? 1654 | 1.02 (2016-04-02) 1655 | avoid allocating large structures on the stack 1656 | 1.01 (2016-01-16) 1657 | STBIW_REALLOC_SIZED: support allocators with no realloc support 1658 | avoid race-condition in crc initialization 1659 | minor compile issues 1660 | 1.00 (2015-09-14) 1661 | installable file IO function 1662 | 0.99 (2015-09-13) 1663 | warning fixes; TGA rle support 1664 | 0.98 (2015-04-08) 1665 | added STBIW_MALLOC, STBIW_ASSERT etc 1666 | 0.97 (2015-01-18) 1667 | fixed HDR asserts, rewrote HDR rle logic 1668 | 0.96 (2015-01-17) 1669 | add HDR output 1670 | fix monochrome BMP 1671 | 0.95 (2014-08-17) 1672 | add monochrome TGA output 1673 | 0.94 (2014-05-31) 1674 | rename private functions to avoid conflicts with stb_image.h 1675 | 0.93 (2014-05-27) 1676 | warning fixes 1677 | 0.92 (2010-08-01) 1678 | casts to unsigned char to fix warnings 1679 | 0.91 (2010-07-17) 1680 | first public release 1681 | 0.90 first internal release 1682 | */ 1683 | 1684 | /* 1685 | ------------------------------------------------------------------------------ 1686 | This software is available under 2 licenses -- choose whichever you prefer. 
1687 | ------------------------------------------------------------------------------ 1688 | ALTERNATIVE A - MIT License 1689 | Copyright (c) 2017 Sean Barrett 1690 | Permission is hereby granted, free of charge, to any person obtaining a copy of 1691 | this software and associated documentation files (the "Software"), to deal in 1692 | the Software without restriction, including without limitation the rights to 1693 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 1694 | of the Software, and to permit persons to whom the Software is furnished to do 1695 | so, subject to the following conditions: 1696 | The above copyright notice and this permission notice shall be included in all 1697 | copies or substantial portions of the Software. 1698 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1699 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1700 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 1701 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1702 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 1703 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 1704 | SOFTWARE. 1705 | ------------------------------------------------------------------------------ 1706 | ALTERNATIVE B - Public Domain (www.unlicense.org) 1707 | This is free and unencumbered software released into the public domain. 1708 | Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 1709 | software, either in source code form or as a compiled binary, for any purpose, 1710 | commercial or non-commercial, and by any means. 1711 | In jurisdictions that recognize copyright laws, the author or authors of this 1712 | software dedicate any and all copyright interest in the software to the public 1713 | domain. 
We make this dedication for the benefit of the public at large and to 1714 | the detriment of our heirs and successors. We intend this dedication to be an 1715 | overt act of relinquishment in perpetuity of all present and future rights to 1716 | this software under copyright law. 1717 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1718 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1719 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 1720 | AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 1721 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 1722 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 1723 | ------------------------------------------------------------------------------ 1724 | */ 1725 | --------------------------------------------------------------------------------