├── Makefile ├── README.md ├── LICENSE ├── src ├── main.cpp └── jpeg_encoder.cpp ├── include └── jpeg_encoder.hpp └── kernel └── jpeg-encoder.cl /Makefile: -------------------------------------------------------------------------------- 1 | all: jpeg_encoder.o main.o 2 | g++ -O3 -Wall -Werror -pedantic jpeg_encoder.o main.o -o jpeg_enc -lOpenCL 3 | 4 | main.o: 5 | g++ -O3 -Wall -Werror -pedantic -c src/main.cpp 6 | 7 | jpeg_encoder.o: 8 | g++ -O3 -Wall -Werror -pedantic -c src/jpeg_encoder.cpp 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenCL JPEG Encoder 2 | 3 | Encodes an RGB image buffer to *JPEG* using *OpenCL*. Beside the final run length encoding, all previous steps such as color transformation, downsampling, discrete cosine transformation and quantification are performed on the *OpenCL* Device. 4 | 5 | ## Building 6 | Beside the *OpenCL C++ Wrapper* and *make* there are no dependencies. The program can be built calling 7 | ``` 8 | make 9 | ``` 10 | 11 | ## Running 12 | The program encodes a raw ppm image to an jpeg image 13 | ``` 14 | ./jpeg_enc src.ppm out.jpg 15 | ``` 16 | 17 | ## Usage 18 | ```c++ 19 | /* Create the encoder */ 20 | jpeg::JPEGEncoder encoder(, ); 21 | 22 | /* Encode the image */ 23 | encoder.encode_image(, , , ); 24 | ``` 25 | 26 | # Performance 27 | The performance was measured running on a Radeon R9 290 encoding an image with 12079x7025 pixels showing a cove. 
28 | - Color conversion: 8.4ms 29 | - Downsampling: 9.5ms 30 | - DCT + Quantification: 20ms 31 | - Time to copy buffers: 40ms 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Oliver 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "../include/jpeg_encoder.hpp" 12 | 13 | struct rgb { 14 | unsigned char r; 15 | unsigned char g; 16 | unsigned char b; 17 | }; 18 | typedef struct rgb rgb_t; 19 | 20 | struct PPMimage { 21 | size_t w, h; 22 | rgb_t *pixel; 23 | }; 24 | typedef struct PPMimage ppm_t; 25 | 26 | int readPPMImage(const char * const file, size_t *width, size_t *height, rgb_t **buffer); 27 | 28 | int readPPMImage(const char * const file, size_t *width, size_t *height, rgb_t **buffer) 29 | { 30 | char line[0x80]; 31 | char *tok; 32 | int ret; 33 | 34 | ret = 0; 35 | FILE *fp = fopen(file, "rb"); 36 | if(fp == NULL) { 37 | //fprintf(stderr, "Could not open file"); 38 | return 0x1; 39 | } 40 | 41 | if(fgets(line, 0x80, fp) == NULL) { 42 | //fprintf(stderr, "Could not get content from file"); 43 | ret = 0x2; 44 | goto end; 45 | } 46 | 47 | if(strcmp(line, "P6\n")) { 48 | //fprintf(stderr, "Illegal file format"); 49 | ret = 0x3; 50 | goto end; 51 | } 52 | while(fgets(line, 0x80, fp)) { 53 | if(line[0] == '#') 54 | continue; 55 | else { 56 | tok = strtok(line, " "); 57 | *width = atoi(tok); 58 | tok = strtok(NULL, " "); 59 | *height = atoi(tok); 60 | (void)fgets(line, 0x80, fp); 61 | break; 62 | } 63 | } 64 | 65 | #ifdef __cplusplus 66 | *buffer = (rgb_t*)malloc(*width * *height * sizeof(rgb_t)); 67 | #else 68 | *buffer = malloc(*width * *height * sizeof(rgb_t)); 69 | #endif 70 | if(*buffer == NULL) { 71 | //fprintf(stderr, "Memory Allocation failed"); 72 | ret = 0x4; 73 | goto end; 74 | } 75 | 76 | (void)fread(*buffer, sizeof(rgb_t), *width * *height, fp); 77 | 78 | end: 79 | fclose(fp); 80 | return ret; 81 | } 82 | 83 | 
////////////////////////////////////////////////////////////////////////////// 84 | // Main function 85 | ////////////////////////////////////////////////////////////////////////////// 86 | int main(int argc, char** argv) { 87 | ppm_t image; 88 | 89 | if(argc != 4) 90 | return 1; 91 | 92 | /* Create the encoder */ 93 | jpeg::JPEGEncoder encoder(CL_DEVICE_TYPE_ALL, atoi(argv[3])); 94 | 95 | /* Read input image */ 96 | if(readPPMImage(argv[1], &image.w, &image.h, &image.pixel)) 97 | { 98 | fprintf(stderr, "Error Reading input file\naborting...\n"); 99 | return 0x2; 100 | } 101 | 102 | /* Encode the image */ 103 | encoder.encode_image((unsigned char*)image.pixel, image.w, image.h, argv[2]); 104 | 105 | /* Free image memory */ 106 | free(image.pixel); 107 | 108 | return 0; 109 | } 110 | -------------------------------------------------------------------------------- /include/jpeg_encoder.hpp: -------------------------------------------------------------------------------- 1 | #ifndef _JPEG_ENCODER_ 2 | #define _JPEG_ENCODER_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "tables.h" 11 | 12 | namespace jpeg 13 | { 14 | 15 | struct derived_huffman_table 16 | { 17 | unsigned int code[0x100]; 18 | unsigned char length[0x100]; 19 | }; 20 | typedef derived_huffman_table derived_huffman_table_t; 21 | 22 | struct huffman_table 23 | { 24 | unsigned char bits[0x11]; 25 | unsigned char value[0x100]; 26 | }; 27 | typedef struct huffman_table huffman_table_t; 28 | 29 | struct entropy_state 30 | { 31 | size_t buffer; 32 | int bits; 33 | int last_dc_val[0x3]; 34 | }; 35 | typedef struct entropy_state entropy_state_t; 36 | 37 | struct quantification_table 38 | { 39 | unsigned char value[0x40]; 40 | }; 41 | typedef struct quantification_table quantification_table_t; 42 | 43 | 44 | 45 | class JPEGEncoder 46 | { 47 | private: 48 | /* Quantification Table, one for luminance, one for chrominance */ 49 | quantification_table_t 
m_quant_tbls[0x2]; 50 | 51 | /* Division Lookup table for DCT, one for luminance, one for chrominance */ 52 | short m_fdct_divisors[0x2][0x100]; 53 | 54 | /* entropy encoding */ 55 | derived_huffman_table_t m_dc_derived_tbls[0x2]; 56 | derived_huffman_table_t m_ac_derived_tbls[0x2]; 57 | huffman_table_t m_dc_huff_tbls[0x2]; 58 | huffman_table_t m_ac_huff_tbls[0x2]; 59 | 60 | /* OpenCL Context used in this class */ 61 | cl::Context m_context; 62 | 63 | /* OpenCL device to use */ 64 | cl::Device m_device; 65 | 66 | /* OpenCL command queue to use */ 67 | cl::CommandQueue m_queue; 68 | 69 | /* Program containing the kernels */ 70 | cl::Program m_program; 71 | 72 | /* Kernels */ 73 | cl::Kernel m_transformation_kernel; 74 | cl::Kernel m_downsample_full_kernel; 75 | cl::Kernel m_downsample_2v2_kernel; 76 | cl::Kernel m_dct_quant; 77 | cl::Kernel m_zero_out_right; 78 | cl::Kernel m_zero_out_bottom; 79 | 80 | /* 81 | Look up tables 82 | with size of 3 * 3 * 256 83 | 84 | the partial tables are in the following ranges 85 | 0 <= x < 768: red 86 | 768 <= x < 1536: green 87 | 1536 <= x < 2304: blue 88 | 89 | each of the following tables consists of 256 elements, 90 | where each element is a structure as the follows 91 | struct { 92 | unsigned int y; 93 | unsigned int cr; 94 | unsigned int cb; 95 | }; 96 | 97 | The values are stored as integers since they are shifted by 16 98 | to not lose precision. 
After summing up all the elements for 99 | a channel the result needs to be right-shifted again 100 | and then fits into a single unsigned char (1 Byte) value 101 | */ 102 | cl::Buffer md_color_conversion_table; 103 | 104 | /* Divisor table for the quantification */ 105 | cl::Buffer md_fdct_divisors; 106 | cl::Buffer md_fdct_multiplier; 107 | cl::Buffer md_fdct_sign; 108 | cl::Buffer md_fdct_indices; 109 | cl::Buffer md_fdct_descaler; 110 | cl::Buffer md_fdct_descaler_offset; 111 | 112 | 113 | /** 114 | * Encode the given image 115 | * 116 | * @param image pointer to the image data in flat row major layout 117 | * @param width of the image 118 | * @param height of the image 119 | * @param file the output file to store the image at 120 | * @param cpu 0 iff cpu shall not be used to compare and validate 121 | * @return 0 on success 122 | */ 123 | int encode_image(unsigned char* image, size_t width, size_t height, const char * const file, int cpu); 124 | 125 | /** 126 | * Prepare the device that runs the encoding process by uploading 127 | * the color conversion table and preparing dct, huffman, ... 
128 | */ 129 | void prepare_device(void); 130 | 131 | 132 | /** 133 | * Create the encoder 134 | * 135 | * @param quality the quality to use (clamped between 1 and 100) 136 | */ 137 | void create_encoder(unsigned char); 138 | 139 | /** 140 | * Set the quality in the quantification tables 141 | * 142 | * @param quality quality the quality to use (clamped between 1 and 100) 143 | */ 144 | void set_quality_setting(unsigned char); 145 | 146 | /** 147 | * Create the huffman tables for the given encoder 148 | */ 149 | void create_huffman_tables(void); 150 | 151 | /** 152 | * Create the dct divisor tables 153 | */ 154 | void create_dct_division_tables(void); 155 | 156 | /** 157 | * Create the derived huffman tables 158 | */ 159 | void create_derived_huffman_tables(void); 160 | 161 | /** 162 | * Create the quantification tables for the encoder based on the given scale factor 163 | * 164 | * @param table_id the id of the quantification table in the encoder 165 | * @param scale the scale to use (quality setting) 166 | * @param base_table the base table to scale 167 | */ 168 | void create_quant_table(int, unsigned char, const unsigned int *); 169 | 170 | /** 171 | * Add the huffman table to the encoder, count the number of bits 172 | * and copy the number of values accordingly to the huffman table 173 | * 174 | * @param tblptr huffman table to use 175 | * @param bits length 176 | * @param values the values 177 | */ 178 | void add_huffman_table(huffman_table_t *, const unsigned char *, const unsigned char *); 179 | 180 | /** 181 | * Derive the huffman tables 182 | * 183 | * @param is_dc flag whether current table is dc or ac 184 | * @param table_idx table index to use 185 | * @param pointer to the derived huffman table to be filled 186 | */ 187 | void derive_huffman_table(unsigned char, huffman_table_t *, derived_huffman_table_t *); 188 | 189 | /** 190 | * Encode the entropy of a single block 191 | * 192 | * @param block the block 193 | * @param table_index the table index for 
the huffman tables to use 194 | * @param last_dc_val the last dc value from the previous block 195 | * @param outputbuf the output buffer to use 196 | * @param state current bits to be exported from the entropy 197 | */ 198 | void encode_entropy_single_block(short *, int, int, std::vector&, entropy_state_t&); 199 | 200 | /** 201 | * Do a entropy encoding for a super block containing of four luminance blocks and 202 | * for the cb/cr one chrominance block each 203 | * 204 | * @param mcu_buffer pointer to the blocks 205 | * @param outputbuf the output buffer 206 | * @param state the entropy state 207 | */ 208 | void encode_entropy(short *mcu_buffer[0x6], std::vector&, entropy_state_t&); 209 | 210 | /** 211 | * Write the file header 212 | * 213 | * @param output_buf the output buffer to use 214 | */ 215 | void write_file_header(std::vector&); 216 | 217 | /** 218 | * Write the frame header containing the quantification tables used 219 | * 220 | * @param output_buf the output buffer 221 | * @param w the width of the image 222 | * @param h the height of the image 223 | */ 224 | void write_frame_header(std::vector& output_buf, size_t w, size_t h); 225 | 226 | /** 227 | * Export the quantification table 228 | * 229 | * @param output_buf the output buffer 230 | * @param index the index of the quantification table 231 | */ 232 | void write_quant_table(std::vector& output_buf, int index); 233 | 234 | /** 235 | * Export the huffman tables 236 | * 237 | * @param output_buf the output buffer 238 | * @param index the index of the quantification table 239 | * @param is_ac flag, true if ac table shall be exported, false if dc 240 | */ 241 | void write_huffman_table(std::vector& output_buf, int index, unsigned char is_ac); 242 | 243 | /** 244 | * Write the sos marker 245 | * 246 | * @param output_buf the output buffer to use 247 | */ 248 | void write_sos(std::vector& output_buf); 249 | 250 | /** 251 | * Write the scan header containing the huffman tables 252 | * 253 | * @param 
output_buf the output buffer to use 254 | */ 255 | void write_scan_header(std::vector& output_buf); 256 | 257 | /** 258 | * Write the SOF Part containing the sampling parameters and image size 259 | * 260 | * @param output_buf the output buffer to use 261 | * @param w image width 262 | * @param h image height 263 | */ 264 | void write_sof(std::vector& outputbuf, size_t w, size_t h); 265 | 266 | public: 267 | 268 | /** 269 | * Create a new encoder 270 | * 271 | * @param type the device type to use 272 | * @param quality the quality setting to use (clamped between 1 and 100) 273 | */ 274 | JPEGEncoder(cl_device_type type, unsigned char quality); 275 | 276 | /** 277 | * Encode the given image 278 | * 279 | * @param image pointer to the image data in flat row major layout 280 | * @param width of the image 281 | * @param height of the image 282 | * @param file the output file to store the image at 283 | * @return 0 on success 284 | */ 285 | int encode_image(unsigned char* image, size_t width, size_t height, const char * const file); 286 | }; 287 | } 288 | 289 | #endif 290 | -------------------------------------------------------------------------------- /kernel/jpeg-encoder.cl: -------------------------------------------------------------------------------- 1 | #define RED_OFFSET 0x0 2 | #define GREEN_OFFSET 0x300 3 | #define BLUE_OFFSET 0x600 4 | 5 | __kernel void color_space_transform(__global unsigned int *color_conversion_table, 6 | __global unsigned char *image, unsigned int sz) 7 | { 8 | size_t gx = get_global_id(0); 9 | 10 | /* only execute inside of the image range */ 11 | if(gx < sz) 12 | { 13 | gx *= 3; 14 | 15 | /* read RGB values */ 16 | unsigned char r = image[gx + 0]; 17 | unsigned char g = image[gx + 1]; 18 | unsigned char b = image[gx + 2]; 19 | 20 | /* convert them into yCbCr */ 21 | unsigned int ry = color_conversion_table[0 + r * 3 + 0]; 22 | unsigned int rcr = color_conversion_table[0 + r * 3 + 1]; 23 | unsigned int rcb = color_conversion_table[0 + r 
* 3 + 2]; 24 | 25 | unsigned int gy = color_conversion_table[GREEN_OFFSET + g * 3 + 0]; 26 | unsigned int gcr = color_conversion_table[GREEN_OFFSET + g * 3 + 1]; 27 | unsigned int gcb = color_conversion_table[GREEN_OFFSET + g * 3 + 2]; 28 | 29 | unsigned int by = color_conversion_table[BLUE_OFFSET + b * 3 + 0]; 30 | unsigned int bcr = color_conversion_table[BLUE_OFFSET + b * 3 + 1]; 31 | unsigned int bcb = color_conversion_table[BLUE_OFFSET + b * 3 + 2]; 32 | 33 | /* store them back */ 34 | image[gx + 0] = ((unsigned char)((ry + gy + by) >> 0x10)); 35 | image[gx + 1] = ((unsigned char)((rcb + gcb + bcb) >> 0x10)); 36 | image[gx + 2] = ((unsigned char)((rcr + gcr + bcr) >> 0x10)); 37 | } 38 | } 39 | 40 | 41 | __kernel void downsample_full(__global short *buffer, __global unsigned char *image, 42 | unsigned int nsbw, unsigned int nbw, 43 | unsigned int nbh, unsigned int width, unsigned int height) 44 | { 45 | size_t gx = get_global_id(0); 46 | 47 | /* compute id of super block */ 48 | size_t super_block_id = gx >> 0x8; 49 | 50 | /* compute x and y of super block */ 51 | size_t super_block_x = super_block_id % nsbw; 52 | size_t super_block_y = super_block_id / nsbw; 53 | 54 | /* super sub block id and x and y position */ 55 | size_t sub_block_id = (gx & 0xFF) >> 0x6; 56 | size_t sub_block_x = sub_block_id & 0x1; 57 | size_t sub_block_y = sub_block_id >> 0x1; 58 | 59 | /* compute in block index and x and y index */ 60 | size_t field_id = gx & 0x3F; 61 | size_t field_x = field_id & 0x7; 62 | size_t field_y = field_id >> 0x3; 63 | 64 | /* Global x and y image position */ 65 | size_t image_x = (super_block_x << 0x4) | (sub_block_x << 0x3) | field_x; 66 | size_t image_y = (super_block_y << 0x4) | (sub_block_y << 0x3) | field_y; 67 | 68 | /* Clamp */ 69 | if(image_x >= width) image_x = width - 1; 70 | if(image_y >= height) image_y = height - 1; 71 | 72 | /* Copy the pixel */ 73 | buffer[gx] = (short)image[(image_x + (image_y * width)) * 3] - (short)0x80; 74 | } 75 | 76 | 
__kernel void downsample_2v2(__global short *cb, __global short *cr, 77 | __global unsigned char *image, unsigned int nsbw, 78 | unsigned int nbw, unsigned int nbh, 79 | unsigned int width, unsigned int height) 80 | { 81 | size_t gx = get_global_id(0); 82 | 83 | /* compute id of super block */ 84 | size_t super_block_id = gx >> 0x6; /* divide by 64 */ 85 | 86 | /* compute x and y of super block */ 87 | size_t super_block_x = super_block_id % nsbw; 88 | size_t super_block_y = super_block_id / nsbw; 89 | 90 | /* super sub block id and x and y position */ 91 | size_t sub_block_x = (gx & 0x7) > 0x3; 92 | size_t sub_block_y = (gx & 0x3F) > 0x1F; 93 | 94 | /* compute in block index and x and y index */ 95 | size_t field_id = gx & 0x3F; 96 | size_t field_x = (field_id & 0x7) << 0x1; 97 | size_t field_y = (field_id >> 0x3) << 0x1; 98 | 99 | /* Global x and y image position */ 100 | size_t image_x = (super_block_x << 0x4) | (sub_block_x << 0x3) | field_x; 101 | size_t image_y = (super_block_y << 0x4) | (sub_block_y << 0x3) | field_y; 102 | 103 | /* Compute pixels x and y values */ 104 | size_t pixel_x0 = image_x; 105 | size_t pixel_x1 = image_x + 1; 106 | size_t pixel_y0 = image_y; 107 | size_t pixel_y1 = image_y + 1; 108 | 109 | /* Clamp */ 110 | if(pixel_x0 >= width) pixel_x0 = width - 1; 111 | if(pixel_x1 >= width) pixel_x1 = width - 1; 112 | if(pixel_y0 >= height) pixel_y0 = height - 1; 113 | if(pixel_y1 >= height) pixel_y1 = height - 1; 114 | 115 | /* compute pixel ids */ 116 | size_t pixel00 = (pixel_x0 + (pixel_y0 * width)); 117 | size_t pixel10 = (pixel_x1 + (pixel_y0 * width)); 118 | size_t pixel01 = (pixel_x0 + (pixel_y1 * width)); 119 | size_t pixel11 = (pixel_x1 + (pixel_y1 * width)); 120 | 121 | /* Sum up the components */ 122 | long cb_sum = 0; 123 | long cr_sum = 0; 124 | 125 | size_t pixel = pixel00 * 3; 126 | cb_sum += (long)image[pixel + 1]; 127 | cr_sum += (long)image[pixel + 2]; 128 | 129 | pixel = pixel10 * 3; 130 | cb_sum += (long)image[pixel + 1]; 131 
| cr_sum += (long)image[pixel + 2]; 132 | 133 | pixel = pixel01 * 3; 134 | cb_sum += (long)image[pixel + 1]; 135 | cr_sum += (long)image[pixel + 2]; 136 | 137 | pixel = pixel11 * 3; 138 | cb_sum += (long)image[pixel + 1]; 139 | cr_sum += (long)image[pixel + 2]; 140 | 141 | int bias = 0x1 << (gx & 0x1); 142 | cb_sum += bias; 143 | cr_sum += bias; 144 | 145 | /* Store the result */ 146 | cb[gx] = (short)(cb_sum >> 0x2) - (short)0x80; 147 | cr[gx] = (short)(cr_sum >> 0x2) - (short)0x80; 148 | } 149 | 150 | 151 | /* 152 | * NOTE: this algorithm is described in C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT 153 | * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, 154 | * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. 155 | */ 156 | #define LEFT_SHIFT(a, b) ((int)((unsigned int)(a) << (b))) 157 | #define DESCALE(x,n) RIGHT_SHIFT((x) + (1 << ((n)-1)), n) 158 | #define RIGHT_SHIFT(x,shft) ((x) >> (shft)) 159 | __kernel void dct_quant(__global short *block, __global short *divisors, unsigned int divisor_offset, 160 | __global short *multiplier, __global int *sign, __global int *indices, 161 | __global char *descaler, __global short *descaler_offset) 162 | { 163 | unsigned int product; 164 | unsigned short recip, corr; 165 | short ioffset, moffset, soffset, doffset; 166 | short t0, t1, t2, t3, res, neg; 167 | int value; 168 | __local short *dataptr; 169 | int shift; 170 | 171 | size_t gx = get_global_id(0); 172 | size_t lx = get_local_id(0); 173 | 174 | short row = lx >> 0x3; 175 | short row_offset = (row) << 0x3; 176 | short column = lx & 0x7; 177 | 178 | __local short lblock[0x40]; 179 | lblock[lx] = block[gx]; 180 | barrier(CLK_LOCAL_MEM_FENCE); 181 | dataptr = &lblock[row_offset]; 182 | 183 | /* Pass 1: process rows. 
*/ 184 | ioffset = column << 0x3; 185 | moffset = column << 0x2; 186 | soffset = column << 0x1; 187 | doffset = column << 0x1; 188 | t0 = dataptr[indices[ioffset + 0]] + (dataptr[indices[ioffset + 1]] * sign[soffset + 0]); 189 | t1 = dataptr[indices[ioffset + 2]] + (dataptr[indices[ioffset + 3]] * sign[soffset + 0]); 190 | t2 = dataptr[indices[ioffset + 4]] + (dataptr[indices[ioffset + 5]] * sign[soffset + 0]); 191 | t3 = dataptr[indices[ioffset + 6]] + (dataptr[indices[ioffset + 7]] * sign[soffset + 0]); 192 | value = t0 * multiplier[moffset + 0] + (t1 + t0) * multiplier[moffset + 1] + (t2 + t0) 193 | * multiplier[moffset + 2] + ((t0 + t1) + ((t2 + t3) * sign[soffset + 1])) * multiplier[moffset + 3]; 194 | res = (short)DESCALE(value, 0xB) * descaler[doffset + 0] + LEFT_SHIFT(value, 0x2) * descaler[doffset + 1]; 195 | 196 | /* Wait for all rows in the local execution to complete */ 197 | barrier(CLK_LOCAL_MEM_FENCE); 198 | lblock[lx] = res; 199 | barrier(CLK_LOCAL_MEM_FENCE); 200 | 201 | /* Pass 2: process columns */ 202 | dataptr = &lblock[column]; 203 | 204 | ioffset = row << 0x3; 205 | moffset = row << 0x2; 206 | soffset = row << 0x1; 207 | t0 = dataptr[indices[ioffset + 0] << 0x3] + (dataptr[indices[ioffset + 1] << 0x3] * sign[soffset + 0]); 208 | t1 = dataptr[indices[ioffset + 2] << 0x3] + (dataptr[indices[ioffset + 3] << 0x3] * sign[soffset + 0]); 209 | t2 = dataptr[indices[ioffset + 4] << 0x3] + (dataptr[indices[ioffset + 5] << 0x3] * sign[soffset + 0]); 210 | t3 = dataptr[indices[ioffset + 6] << 0x3] + (dataptr[indices[ioffset + 7] << 0x3] * sign[soffset + 0]); 211 | value = t0 * multiplier[moffset + 0] + (t1 + t0) * multiplier[moffset + 1] + (t2 + t0) 212 | * multiplier[moffset + 2] + ((t0 + t1) + ((t2 + t3) * sign[soffset + 1])) * multiplier[moffset + 3]; 213 | res = DESCALE(value, 0x2 + descaler_offset[row]); 214 | 215 | /* Pass 3: quantize */ 216 | recip = divisors[divisor_offset + lx + 0x40 * 0]; 217 | corr = divisors[divisor_offset + lx + 0x40 * 1]; 
218 | shift = divisors[divisor_offset + lx + 0x40 * 3]; 219 | neg = res < 0 ? -1 : 1; 220 | res *= neg; 221 | product = (unsigned int) (res + corr) * recip; 222 | product >>= shift + sizeof(short) * 8; 223 | res = (short) product; 224 | res *= neg; 225 | block[gx] = (short)res; 226 | } 227 | 228 | __kernel void zero_out_right(__global short *buffer, unsigned int nsbw, unsigned int nsbh, unsigned int nbw) 229 | { 230 | size_t gx = get_global_id(0); 231 | size_t super_block_x = nsbw - 1; 232 | if ((super_block_x << 0x1) + 1 >= nbw) { 233 | size_t super_block_y = gx >> 0x7; 234 | size_t super_block_id = (super_block_y * nsbw) + super_block_x; 235 | size_t local_block_id = (gx & 0x7F) > 0x3F ? 3 : 1; 236 | size_t field_id = (gx & 0x3F); 237 | size_t block_id = ((super_block_id << 0x8) | (local_block_id << 0x6) | field_id); 238 | if (field_id == 0) 239 | { 240 | size_t left_block_0_id = ((super_block_id << 0x8)| ((local_block_id - 1) << 0x6) | field_id); 241 | buffer[block_id] = buffer[left_block_0_id]; 242 | } 243 | else 244 | { 245 | buffer[block_id] = 0; 246 | } 247 | } 248 | } 249 | 250 | __kernel void zero_out_bottom(__global short *buffer, unsigned int nsbw, unsigned int nsbh, unsigned int nbh) 251 | { 252 | size_t gx = get_global_id(0); 253 | size_t super_block_y = nsbh - 1; 254 | if ((super_block_y << 0x1) + 1 >= nbh) { 255 | size_t super_block_x = gx >> 0x7; 256 | size_t super_block_id = (super_block_y * nsbw) + super_block_x; 257 | size_t local_block_id = (gx & 0x7F) > 0x3F ? 3 : 2; 258 | size_t field_id = (gx & 0x3F); 259 | size_t block_id = ((super_block_id << 0x8) | (local_block_id << 0x6) | field_id); 260 | 261 | /* Note we do NOT copy the value from the neighbor field, since the value in the neighbor 262 | * field is not yet guaranteed to be entered. 
Therefore fill out with zeroes and copy single 263 | * values back on the host before performing entropy encoding */ 264 | buffer[block_id] = 0; 265 | } 266 | } 267 | 268 | -------------------------------------------------------------------------------- /src/jpeg_encoder.cpp: -------------------------------------------------------------------------------- 1 | #include "../include/jpeg_encoder.hpp" 2 | 3 | namespace jpeg 4 | { 5 | 6 | #define SCALEBITS 0x10 7 | #define FIX(x) ((unsigned int) ((x) * (1L<& output_buf, int value) 18 | { 19 | output_buf.push_back((char)value); 20 | } 21 | 22 | /** 23 | * Write two bytes to the output buffer by splitting it and writing those two bytes 24 | * seperately 25 | * 26 | * @param output_buf the buffer 27 | * @param value the value 28 | */ 29 | static void write_2byte(std::vector& output_buf, int value) 30 | { 31 | write_byte(output_buf, (value >> 0x8) & 0xFF); 32 | write_byte(output_buf, value & 0xFF); 33 | } 34 | 35 | /** 36 | * Export a JPEG marker 37 | * 38 | * @param output_buf the buffer 39 | * @param value the marker 40 | */ 41 | static void write_marker(std::vector& output_buf, int value) 42 | { 43 | write_byte(output_buf, 0xFF); 44 | write_byte(output_buf, value); 45 | } 46 | 47 | /** 48 | * Compute the reciprocal for the divisor and save in the given table 49 | * the reciprocal, length and shift values 50 | * Taken from: https://github.com/libjpeg-turbo/libjpeg-turbo/ 51 | * 52 | * @param divisor the divisor 53 | * @param dtbl table to store 54 | */ 55 | static int compute_reciprocal (unsigned short divisor, short *dtbl) 56 | { 57 | unsigned int fq, fr; 58 | unsigned short c; 59 | int b, r; 60 | 61 | if (divisor == 1) 62 | { 63 | dtbl[0x40 * 0] = (short) 1; /* reciprocal */ 64 | dtbl[0x40 * 1] = (short) 0; /* correction */ 65 | dtbl[0x40 * 2] = (short) 1; /* scale */ 66 | dtbl[0x40 * 3] = -(short) (sizeof(short) * 8); /* shift */ 67 | return 0; 68 | } 69 | 70 | b = nbits_table[divisor] - 1; 71 | r = sizeof(short) * 8 + 
b; 72 | 73 | fq = ((unsigned int)1 << r) / divisor; 74 | fr = ((unsigned int)1 << r) % divisor; 75 | 76 | c = divisor >> 0x1; 77 | 78 | if (fr == 0) 79 | { 80 | fq >>= 1; 81 | r--; 82 | } 83 | else if (fr <= (divisor / 2U)) 84 | { 85 | c++; 86 | } 87 | else 88 | { 89 | fq++; 90 | } 91 | 92 | dtbl[0x40 * 0] = (short) fq; 93 | dtbl[0x40 * 1] = (short) c; 94 | dtbl[0x40 * 2] = (short) (1 << (sizeof(short)*8*2 - r)); 95 | dtbl[0x40 * 3] = (short) r - sizeof(short)*8; 96 | 97 | return r <= 16 ? 0 : 1; 98 | } 99 |

/**
 * Build the program from the given file
 *
 * The whole file is read into memory and handed to OpenCL as program source.
 * A kernel file that cannot be opened is reported on stderr and raises
 * std::ios_base::failure. If the build reports an error status, the device's
 * build log is printed to stderr and the (unbuilt) program is still returned,
 * as before.
 *
 * @param context the OpenCL context to build the program in
 * @param device the device to build for
 * @param file the kernel file
 * @return the created program
 */
static cl::Program build_from_file(cl::Context &context, cl::Device &device, const char* const file)
{
    std::ifstream source_stream(file);
    if(!source_stream.is_open())
    {
        /* Without this check tellg() yields -1 and reserve() throws an
         * unrelated std::length_error that hides the real cause */
        fprintf(stderr, "The kernel file \'%s\' could not be opened\n", file);
        throw std::ios_base::failure("kernel source file could not be opened");
    }

    /* Determine the file size up front so the string is allocated only once */
    source_stream.seekg(0, std::ios::end);
    const std::streamoff length = source_stream.tellg();
    source_stream.seekg(0, std::ios::beg);

    std::string source;
    if(length > 0)
    {
        source.reserve(static_cast<std::string::size_type>(length));
    }
    source.assign((std::istreambuf_iterator<char>(source_stream)),
                  std::istreambuf_iterator<char>());

    cl::Program program(context, source);
    if(program.build({device}) != CL_SUCCESS)
    {
        /* Surface the compiler diagnostics; otherwise the failure only shows
         * up later when the kernels are created */
        fprintf(stderr, "Building the kernel file \'%s\' failed:\n%s\n", file,
                program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str());
    }
    return program;
}

124 | /** 125 | * Create a new encoder 126 | * 127 | * @param type the device type to use 128 | * @param quality the quality setting to use (clamped between 1 and 100) 129 | */ 130 | JPEGEncoder::JPEGEncoder(cl_device_type type, unsigned char quality) : 131 | m_context(type), 132 | m_device(m_context.getInfo()[0]), 133 | m_queue(m_context, m_device, CL_QUEUE_PROFILING_ENABLE), 134 | m_program(build_from_file(m_context, m_device, "kernel/jpeg-encoder.cl")), 135 | md_color_conversion_table(m_context, CL_MEM_READ_ONLY, sizeof(color_conversion_table)), 136 | md_fdct_divisors(m_context, CL_MEM_READ_ONLY, sizeof(m_fdct_divisors)), 137 | md_fdct_multiplier(m_context, CL_MEM_READ_ONLY, sizeof(MULTIPLIER)), 138 | md_fdct_sign(m_context, CL_MEM_READ_ONLY, sizeof(SIGN)), 139
| md_fdct_indices(m_context, CL_MEM_READ_ONLY, sizeof(INDICES)), 140 | md_fdct_descaler(m_context, CL_MEM_READ_ONLY, sizeof(DESCALER)), 141 | md_fdct_descaler_offset(m_context, CL_MEM_READ_ONLY, sizeof(DESCALER_OFFSET)) 142 | { 143 | this->create_encoder(quality); 144 | this->prepare_device(); 145 | } 146 | 147 | /** 148 | * Prepare the device, create kernels and write conversion table and divisor table to device 149 | */ 150 | void JPEGEncoder::prepare_device(void) 151 | { 152 | /* copy tables to device */ 153 | this->m_queue.enqueueWriteBuffer(this->md_color_conversion_table, false, 0, sizeof(color_conversion_table), color_conversion_table); 154 | this->m_queue.enqueueWriteBuffer(this->md_fdct_divisors, false, 0, sizeof(m_fdct_divisors), &this->m_fdct_divisors); 155 | this->m_queue.enqueueWriteBuffer(this->md_fdct_multiplier, false, 0, sizeof(MULTIPLIER), &MULTIPLIER); 156 | this->m_queue.enqueueWriteBuffer(this->md_fdct_sign, false, 0, sizeof(SIGN), &SIGN); 157 | this->m_queue.enqueueWriteBuffer(this->md_fdct_indices, false, 0, sizeof(INDICES), &INDICES); 158 | this->m_queue.enqueueWriteBuffer(this->md_fdct_descaler, false, 0, sizeof(DESCALER), &DESCALER); 159 | this->m_queue.enqueueWriteBuffer(this->md_fdct_descaler_offset, false, 0, sizeof(DESCALER_OFFSET), &DESCALER_OFFSET); 160 | 161 | /* create kernels */ 162 | this->m_transformation_kernel = cl::Kernel(this->m_program, "color_space_transform"); 163 | this->m_downsample_full_kernel = cl::Kernel(this->m_program, "downsample_full"); 164 | this->m_downsample_2v2_kernel = cl::Kernel(this->m_program, "downsample_2v2"); 165 | this->m_dct_quant = cl::Kernel(this->m_program, "dct_quant"); 166 | this->m_zero_out_right = cl::Kernel(this->m_program, "zero_out_right"); 167 | this->m_zero_out_bottom = cl::Kernel(this->m_program, "zero_out_bottom"); 168 | } 169 | 170 | /** 171 | * Encode the given image 172 | * 173 | * @param image pointer to the image data in flat row major layout 174 | * @param width of the image 175 | 
* @param height of the image 176 | * @param file the output file to store the image at 177 | * @return 0 on success 178 | */ 179 | int JPEGEncoder::encode_image(unsigned char *image, size_t width, size_t height, const char * const file) 180 | { 181 | size_t wg; 182 | std::vector output_buffer; 183 | FILE *fp; 184 | 185 | /* Make sure the image pointer is valid */ 186 | if(image == NULL) 187 | { 188 | fprintf(stderr, "Image data needs to be provided\n"); 189 | return 0x2; 190 | } 191 | 192 | /* Validate file handler */ 193 | fp = fopen(file, "wb"); 194 | if(fp == NULL) 195 | { 196 | fprintf(stderr, "The file \'%s\' could not be opened, aborting compressing\n", file); 197 | return 0x1; 198 | } 199 | 200 | /* Write the file, frame and scan header to the output buffer */ 201 | this->write_file_header(output_buffer); 202 | this->write_frame_header(output_buffer, width, height); 203 | this->write_scan_header(output_buffer); 204 | 205 | // 206 | // Color Space Transformation 207 | // 208 | /* Initialize image buffer */ 209 | cl::Buffer image_buffer(this->m_context, CL_MEM_READ_WRITE, sizeof(unsigned char) * 3 * width * height); 210 | this->m_queue.enqueueWriteBuffer(image_buffer, true, 0, sizeof(unsigned char) * 3 * width * height, image); 211 | 212 | /* Set arguments */ 213 | this->m_transformation_kernel.setArg(0, this->md_color_conversion_table); 214 | this->m_transformation_kernel.setArg(1, image_buffer); 215 | this->m_transformation_kernel.setArg(2, (cl_uint)(width * height)); 216 | 217 | /* Compute work group size to be the closest bigger multiple of 64 to the number of pixels in the image */ 218 | wg = (((width * height) + 0x3F) >> 0x6) << 0x6; 219 | this->m_queue.enqueueNDRangeKernel(this->m_transformation_kernel, 0, wg, 0x40); 220 | 221 | 222 | // 223 | // Downsampling 224 | // 225 | /* Do the downsampling for the y channel 226 | * This does a full downsample, which means keeping all the pixels we already have 227 | * For future processing the downsampling splits 
the image, which is currently in flat 228 | * row layout into super blocks containing of 4 sub blocks each which represent a full 229 | * MCU block 230 | * 231 | * Where each rx is a row and each ax/bx/.../zx is a super block 232 | * +--------- 233 | * | a1 a2 a3 ... 234 | * [r1.....][r2.....]....[rn.....] => | b1 b2 b3 ... 235 | * .............. 236 | * | z1 z2 z3 ... 237 | * +--------- The super block is stored in 238 | * | in flat layout hierarchy 239 | * v 240 | * [a1 a2 a3...][b1 b2 b3...]...[z1 z2 z3...] 241 | * 242 | * [0][1][2][3] 243 | * ^ 244 | * | 245 | * +-----+ Each superblock contains 4 sub blocks which are 246 | * | 0 1 | are ordered as displayed, where each of the four 247 | * | 2 3 | sub blocks represents a full MCU block 248 | * +-----+ the are stored in a flat layout in memory 249 | */ 250 | 251 | /* Compute the number of blocks in x and y direction */ 252 | cl_uint nbw = (width + 0x7) >> 0x3; 253 | cl_uint nbh = (height + 0x7) >> 0x3; 254 | 255 | /* Compute the number of super blocks in x and y direction */ 256 | cl_uint nsbw = (width + 0xF) >> 0x4; 257 | cl_uint nsbh = (height + 0xF) >> 0x4; 258 | 259 | /* Compute work group size */ 260 | wg = (nsbw * nsbh) << 0x8; 261 | 262 | /* Initialize the block buffer */ 263 | cl::Buffer y_block_buffer(this->m_context, CL_MEM_READ_WRITE, wg * sizeof(cl_short)); 264 | 265 | /* Set the kernel arguments */ 266 | this->m_downsample_full_kernel.setArg(0, y_block_buffer); 267 | this->m_downsample_full_kernel.setArg(1, image_buffer); 268 | this->m_downsample_full_kernel.setArg(2, nsbw); 269 | this->m_downsample_full_kernel.setArg(3, nbw); 270 | this->m_downsample_full_kernel.setArg(4, nbh); 271 | this->m_downsample_full_kernel.setArg(5, (cl_uint)width); 272 | this->m_downsample_full_kernel.setArg(6, (cl_uint)height); 273 | 274 | /* Execute kernel */ 275 | this->m_queue.enqueueNDRangeKernel(this->m_downsample_full_kernel, 0, wg, 0x40); 276 | 277 | 278 | /* Downsample Cb/Cr Channels */ 279 | /* The number of 
blocks and super blocks stays the same, 280 | * since we do a 2:2 downsample only a fourth of the number 281 | * of original items are stored. */ 282 | wg = (nsbw * nsbh) << 0x6; 283 | 284 | /* Create buffer for the cb and cr channels */ 285 | cl::Buffer cb_block_buffer(this->m_context, CL_MEM_READ_WRITE, wg * sizeof(cl_short)); 286 | cl::Buffer cr_block_buffer(this->m_context, CL_MEM_READ_WRITE, wg * sizeof(cl_short)); 287 | 288 | /* Set the kernel arguments */ 289 | this->m_downsample_2v2_kernel.setArg(0, cb_block_buffer); 290 | this->m_downsample_2v2_kernel.setArg(1, cr_block_buffer); 291 | this->m_downsample_2v2_kernel.setArg(2, image_buffer); 292 | this->m_downsample_2v2_kernel.setArg(3, nsbw); 293 | this->m_downsample_2v2_kernel.setArg(4, nbw); 294 | this->m_downsample_2v2_kernel.setArg(5, nbh); 295 | this->m_downsample_2v2_kernel.setArg(6, (cl_uint) width); 296 | this->m_downsample_2v2_kernel.setArg(7, (cl_uint) height); 297 | 298 | /* Execute the kernel */ 299 | this->m_queue.enqueueNDRangeKernel(this->m_downsample_2v2_kernel, 0, wg, 0x40); 300 | 301 | // 302 | // DCT and Quantification 303 | // 304 | /* Prepare and execute kernel for y channel */ 305 | wg = (nsbw * nsbh) << 0x8; 306 | this->m_dct_quant.setArg(0, y_block_buffer); 307 | this->m_dct_quant.setArg(1, this->md_fdct_divisors); 308 | this->m_dct_quant.setArg(2, 0); 309 | this->m_dct_quant.setArg(3, this->md_fdct_multiplier); 310 | this->m_dct_quant.setArg(4, this->md_fdct_sign); 311 | this->m_dct_quant.setArg(5, this->md_fdct_indices); 312 | this->m_dct_quant.setArg(6, this->md_fdct_descaler); 313 | this->m_dct_quant.setArg(7, this->md_fdct_descaler_offset); 314 | this->m_queue.enqueueNDRangeKernel(this->m_dct_quant, 0x0, wg, 0x40); 315 | 316 | /* Prepare and execute kernel for cb channel */ 317 | wg = (nsbw * nsbh) << 0x6; 318 | this->m_dct_quant.setArg(0, cb_block_buffer); 319 | this->m_dct_quant.setArg(1, this->md_fdct_divisors); 320 | this->m_dct_quant.setArg(2, 0x100); 321 | 
this->m_dct_quant.setArg(3, this->md_fdct_multiplier); 322 | this->m_dct_quant.setArg(4, this->md_fdct_sign); 323 | this->m_dct_quant.setArg(5, this->md_fdct_indices); 324 | this->m_dct_quant.setArg(6, this->md_fdct_descaler); 325 | this->m_dct_quant.setArg(7, this->md_fdct_descaler_offset); 326 | this->m_queue.enqueueNDRangeKernel(this->m_dct_quant, 0x0, wg, 0x40); 327 | 328 | /* Prepare and execute kernel for cr channel */ 329 | this->m_dct_quant.setArg(0, cr_block_buffer); 330 | this->m_dct_quant.setArg(1, this->md_fdct_divisors); 331 | this->m_dct_quant.setArg(2, 0x100); 332 | this->m_dct_quant.setArg(3, this->md_fdct_multiplier); 333 | this->m_dct_quant.setArg(4, this->md_fdct_sign); 334 | this->m_dct_quant.setArg(5, this->md_fdct_indices); 335 | this->m_dct_quant.setArg(6, this->md_fdct_descaler); 336 | this->m_dct_quant.setArg(7, this->md_fdct_descaler_offset); 337 | this->m_queue.enqueueNDRangeKernel(this->m_dct_quant, 0x0, wg, 0x40); 338 | 339 | /* Zero out unused blocks on the right side */ 340 | wg = (nbh << 0x6); 341 | this->m_zero_out_right.setArg(0, y_block_buffer); 342 | this->m_zero_out_right.setArg(1, (cl_uint)nsbw); 343 | this->m_zero_out_right.setArg(2, (cl_uint)nsbh); 344 | this->m_zero_out_right.setArg(3, (cl_uint)nbw); 345 | this->m_queue.enqueueNDRangeKernel(this->m_zero_out_right, 0, wg, 0x40); 346 | 347 | /* Zero out unsued blocks on the bottom of the image */ 348 | wg = (nsbw << 0x7); 349 | this->m_zero_out_bottom.setArg(0, y_block_buffer); 350 | this->m_zero_out_bottom.setArg(1, (cl_uint)nsbw); 351 | this->m_zero_out_bottom.setArg(2, (cl_uint)nsbh); 352 | this->m_zero_out_bottom.setArg(3, (cl_uint)nbh); 353 | this->m_queue.enqueueNDRangeKernel(this->m_zero_out_bottom, 0, wg, 0x80); 354 | 355 | /* Copy result back to host to perform entropy on host device */ 356 | short *y_buffer = (short*)malloc(sizeof(short) * (nsbw * nsbh) << 0x8); 357 | short *cb_buffer = (short*)malloc(sizeof(short) * (nsbw * nsbh) << 0x6); 358 | short *cr_buffer = 
(short*)malloc(sizeof(short) * (nsbw * nsbh) << 0x6); 359 | this->m_queue.enqueueReadBuffer(y_block_buffer, true, 0, sizeof(short) * (nsbw * nsbh) << 0x8, y_buffer); 360 | this->m_queue.enqueueReadBuffer(cb_block_buffer, true, 0, sizeof(short) * (nsbw * nsbh) << 0x6, cb_buffer); 361 | this->m_queue.enqueueReadBuffer(cr_block_buffer, true, 0, sizeof(short) * (nsbw * nsbh) << 0x6, cr_buffer); 362 | 363 | /* For convenient access, cast to 3D/2D arrays */ 364 | short (*y_blocks)[0x4][0x40] = (short (*)[0x4][0x40])y_buffer; 365 | short (*cb_blocks)[0x40] = (short (*)[0x40])cb_buffer; 366 | short (*cr_blocks)[0x40] = (short (*)[0x40])cr_buffer; 367 | 368 | /* As mentioned in the kernel code we can not the field zero values in the kernel 369 | * since neighboring blocks are processed concurrently. 370 | * Since the entropy encoding is performed on the host the data needs to be copied 371 | * anyways, so setting these values on the host does not introduce an extra 372 | * copy operation */ 373 | size_t super_block_y = nsbh - 1; 374 | size_t super_block_id_base = (super_block_y * nsbw); 375 | for(size_t gx = 0; gx < nsbw; ++gx) 376 | { 377 | if ((super_block_y << 0x1) + 1 >= nbh) { 378 | size_t super_block_x = gx; 379 | size_t super_block_id = super_block_id_base + super_block_x; 380 | short value = y_blocks[super_block_id][1][0]; 381 | y_blocks[super_block_id][2][0] = value; 382 | y_blocks[super_block_id][3][0] = value; 383 | } 384 | } 385 | 386 | // 387 | // Entropy coding 388 | // 389 | wg = (nsbw * nsbh); /* number of super blocks */ 390 | short *mcu_buffer[0x6]; 391 | entropy_state_t state; 392 | memset(state.last_dc_val, 0, sizeof(state.last_dc_val)); 393 | state.bits = 0; 394 | for(size_t i = 0; i < wg; ++i) 395 | { 396 | /* Perform entropy encoding on each block */ 397 | mcu_buffer[0] = y_blocks[i][0]; 398 | mcu_buffer[1] = y_blocks[i][1]; 399 | mcu_buffer[2] = y_blocks[i][2]; 400 | mcu_buffer[3] = y_blocks[i][3]; 401 | mcu_buffer[4] = cb_blocks[i]; 402 | 
mcu_buffer[5] = cr_blocks[i]; 403 | this->encode_entropy(mcu_buffer, output_buffer, state); 404 | } 405 | 406 | 407 | /* Flush Entropy */ 408 | size_t bits = state.bits; 409 | size_t buffer = state.buffer; 410 | bits += 7; 411 | buffer = (buffer << 0x7) | 0x7F; 412 | while(bits > 0x7) 413 | { 414 | bits -= 0x8; 415 | unsigned char c = (unsigned char)(buffer >> bits); 416 | output_buffer.push_back(c); 417 | if(c == 0xFF) 418 | output_buffer.push_back(c); 419 | } 420 | 421 | /* Write the file tailor to the output buffer */ 422 | write_marker(output_buffer, 0xD9); 423 | 424 | /* write the content to file */ 425 | (void)fwrite(output_buffer.data(), sizeof(char), output_buffer.size(), fp); 426 | fclose(fp); 427 | 428 | /* Release allocated memory */ 429 | free(y_buffer); 430 | free(cb_buffer); 431 | free(cr_buffer); 432 | 433 | return 0x0; 434 | } 435 | 436 | /** 437 | * Write the file header 438 | * 439 | * @param output_buf the output buffer to use 440 | */ 441 | void JPEGEncoder::write_file_header(std::vector& output) 442 | { 443 | static unsigned char headMagic[] = {0xFF, 0xD8, 0xFF, 0xE0}; 444 | static unsigned char jfifApp0[] = {0x00, 0x10, 'J', 'F', 'I', 'F', 0x0, 0x1, 0x1, 0x0, 0x0, 0x1, 0x0, 0x1, 0x0, 0x0}; 445 | 446 | /* Copy the head to the output buffer */ 447 | for(size_t i = 0; i < 0x4; ++i) 448 | { 449 | output.push_back(headMagic[i]); 450 | } 451 | 452 | /* Copy the jfif app0 marker to the output buffer */ 453 | for(size_t i = 0; i < 0x10; ++i) 454 | { 455 | output.push_back(jfifApp0[i]); 456 | } 457 | } 458 | 459 | /** 460 | * Write the frame header containing the quantification tables used 461 | * 462 | * @param output_buf the output buffer 463 | * @param w the width of the image 464 | * @param h the height of the image 465 | */ 466 | void JPEGEncoder::write_frame_header(std::vector& output_buf, size_t w, size_t h) 467 | { 468 | write_quant_table(output_buf, 0); /* Y Channel */ 469 | write_quant_table(output_buf, 1); /* Cb/Cr Channel */ 470 | 
write_sof(output_buf, w, h); 471 | } 472 | 473 | /** 474 | * Export the quantification table 475 | * 476 | * @param output_buf the output buffer 477 | * @param index the index of the quantification table 478 | */ 479 | void JPEGEncoder::write_quant_table(std::vector& output_buf, int index) 480 | { 481 | quantification_table_t *qtblptr; 482 | size_t i; 483 | qtblptr = &this->m_quant_tbls[index]; 484 | 485 | write_marker(output_buf, 0xDB); 486 | write_2byte(output_buf, 0x40 + 1 + 2); 487 | write_byte(output_buf, index); 488 | for(i = 0; i < 0x40; ++i) 489 | { 490 | unsigned int qval = qtblptr->value[jpeg_natural_order[i]]; 491 | write_byte(output_buf, (int)(qval & 0xFF)); 492 | } 493 | } 494 | 495 | 496 | /** 497 | * Export the huffman tables 498 | * 499 | * @param output_buf the output buffer 500 | * @param index the index of the quantification table 501 | * @param is_ac flag, true if ac table shall be exported, false if dc 502 | */ 503 | void JPEGEncoder::write_huffman_table(std::vector& output_buf, int index, unsigned char is_ac) 504 | { 505 | huffman_table_t *htblptr; 506 | size_t length, i; 507 | 508 | if(is_ac) { 509 | htblptr = &this->m_ac_huff_tbls[index]; 510 | index += 0x10; 511 | } else { 512 | htblptr = &this->m_dc_huff_tbls[index]; 513 | } 514 | 515 | write_marker(output_buf, 0xC4); 516 | length = 0; 517 | for(i = 1; i < 0x11; ++i) 518 | length += htblptr->bits[i]; 519 | 520 | /* Write section header containing number of bytes the secion contains */ 521 | write_2byte(output_buf, length + 2 + 1 + 0x10); 522 | 523 | /* output the number of bytes consisting of the index, the bits and the values */ 524 | write_byte(output_buf, index); 525 | for(i = 1; i < 0x11; ++i) 526 | write_byte(output_buf, htblptr->bits[i]); 527 | for(i = 0; i < length; ++i) 528 | write_byte(output_buf, htblptr->value[i]); 529 | } 530 | 531 | 532 | /** 533 | * Write the sos marker 534 | * 535 | * @param output_buf the output buffer to use 536 | */ 537 | void 
JPEGEncoder::write_sos(std::vector& output_buf) 538 | { 539 | write_marker(output_buf, 0xDA); 540 | write_2byte(output_buf, 2 * 0x3 + 2 + 1 + 3); 541 | write_byte(output_buf, 0x3); /* number of components */ 542 | 543 | /* Y Channel */ 544 | write_byte(output_buf, 1); /* component id */ 545 | write_byte(output_buf, (0 << 0x4) + 0); /* ( dc_tbl_no << 0x4 ) + ac_tbl_no */ 546 | 547 | /* Cb Channel */ 548 | write_byte(output_buf, 2); /* component id */ 549 | write_byte(output_buf, (1 << 0x4) + 1); /* ( dc_tbl_no << 0x4 ) + ac_tbl_no */ 550 | 551 | /* Cr Channel */ 552 | write_byte(output_buf, 3); /* component id */ 553 | write_byte(output_buf, (1 << 0x4) + 1); /* ( dc_tbl_no << 0x4 ) + ac_tbl_no */ 554 | 555 | /* End of sos section */ 556 | write_byte(output_buf, 0); 557 | write_byte(output_buf, 0x3F); 558 | write_byte(output_buf, 0); 559 | } 560 | 561 | /** 562 | * Write the scan header containing the huffman tables 563 | * 564 | * @param output_buf the output buffer to use 565 | */ 566 | void JPEGEncoder::write_scan_header(std::vector& output_buf) 567 | { 568 | /* Y Channel */ 569 | this->write_huffman_table(output_buf, 0, 0); 570 | this->write_huffman_table(output_buf, 0, 1); 571 | 572 | /* Cb / Cr Channel */ 573 | this->write_huffman_table(output_buf, 1, 0); 574 | this->write_huffman_table(output_buf, 1, 1); 575 | 576 | /* Write sos marker */ 577 | this->write_sos(output_buf); 578 | } 579 | 580 | /** 581 | * Write the SOF Part containing the sampling parameters and image size 582 | * 583 | * @param output_buf the output buffer to use 584 | * @param w image width 585 | * @param h image height 586 | */ 587 | void JPEGEncoder::write_sof(std::vector& output_buf, size_t w, size_t h) 588 | { 589 | write_marker(output_buf, 0xC0); 590 | write_2byte(output_buf, 3 * 0x3 + 2 + 5 + 1); 591 | write_byte(output_buf, 0x8); 592 | write_2byte(output_buf, h); 593 | write_2byte(output_buf, w); 594 | write_byte(output_buf, 0x3); 595 | 596 | /* Y Channel */ 597 | 
write_byte(output_buf, 0x1); 598 | write_byte(output_buf, (0x2 << 4) + 0x2); 599 | write_byte(output_buf, 0); 600 | 601 | /* Cb Channel */ 602 | write_byte(output_buf, 0x2); 603 | write_byte(output_buf, (0x1 << 4) + 0x1); 604 | write_byte(output_buf, 1); 605 | 606 | /* Cr Channel */ 607 | write_byte(output_buf, 0x3); 608 | write_byte(output_buf, (0x1 << 4) + 0x1); 609 | write_byte(output_buf, 1); 610 | } 611 | 612 | /* 613 | * NOTE: this algorithm is part of the libjpeg-turbo project 614 | * https://github.com/libjpeg-turbo/libjpeg-turbo/ 615 | */ 616 | #define EMIT_BYTE() { \ 617 | unsigned char c; \ 618 | put_bits -= 8; \ 619 | c = (unsigned char)(put_buffer >> put_bits); \ 620 | outputbuf.push_back(c); \ 621 | if (c == 0xFF) /* need to stuff a zero byte? */ \ 622 | outputbuf.push_back((char)0); \ 623 | } 624 | #define CHECKBUF15() { \ 625 | if (put_bits > 0xF) { \ 626 | EMIT_BYTE() \ 627 | EMIT_BYTE() \ 628 | } \ 629 | } 630 | #define PUT_BITS(code, size) { \ 631 | put_bits += size; \ 632 | put_buffer = (put_buffer << size) | code; \ 633 | } 634 | #define EMIT_BITS(code, size) { \ 635 | PUT_BITS(code, size) \ 636 | CHECKBUF15() \ 637 | } 638 | #define EMIT_CODE(code, size) { \ 639 | temp2 &= (((int) 1)<& outputbuf, entropy_state_t& state) 656 | { 657 | int temp, temp2, temp3, r, code, size, put_bits, nbits, code_0xf0, size_0xf0; 658 | size_t put_buffer; 659 | derived_huffman_table_t *dcd; 660 | derived_huffman_table_t *acd; 661 | 662 | /* init values */ 663 | dcd = &this->m_dc_derived_tbls[table_index]; 664 | acd = &this->m_ac_derived_tbls[table_index]; 665 | code_0xf0 = acd->code[0xf0]; 666 | size_0xf0 = acd->length[0xf0]; 667 | 668 | put_buffer = state.buffer; 669 | put_bits = state.bits; 670 | 671 | 672 | temp = temp2 = block[0] - last_dc_val; 673 | temp3 = temp >> (8 * sizeof(int) - 1); 674 | temp ^= temp3; 675 | temp -= temp3; 676 | 677 | temp2 += temp3; 678 | nbits = nbits_table[temp]; 679 | 680 | code = dcd->code[nbits]; 681 | size = dcd->length[nbits]; 
682 | EMIT_BITS(code, size) 683 | 684 | temp2 &= (((long) 1) << nbits) - 1; 685 | EMIT_BITS(temp2, nbits) 686 | 687 | /* run length encoding */ 688 | r = 0; 689 | 690 | /* Run length encoding macro */ 691 | #define kloop(k) { \ 692 | if ((temp = block[k]) == 0) { \ 693 | r++; \ 694 | } else { \ 695 | temp2 = temp; \ 696 | temp3 = temp >> (8 * sizeof(int) - 1); \ 697 | temp ^= temp3; \ 698 | temp -= temp3; \ 699 | temp2 += temp3; \ 700 | nbits = nbits_table[temp]; \ 701 | /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \ 702 | while (r > 15) { \ 703 | EMIT_BITS(code_0xf0, size_0xf0) \ 704 | r -= 16; \ 705 | } \ 706 | /* Emit Huffman symbol for run length / number of bits */ \ 707 | temp3 = (r << 4) + nbits; \ 708 | code = acd->code[temp3]; \ 709 | size = acd->length[temp3]; \ 710 | EMIT_CODE(code, size) \ 711 | r = 0; \ 712 | } \ 713 | } 714 | 715 | /* Do run length encoding in zig zag pattern */ 716 | kloop(1); kloop(8); kloop(16); kloop(9); kloop(2); kloop(3); 717 | kloop(10); kloop(17); kloop(24); kloop(32); kloop(25); kloop(18); 718 | kloop(11); kloop(4); kloop(5); kloop(12); kloop(19); kloop(26); 719 | kloop(33); kloop(40); kloop(48); kloop(41); kloop(34); kloop(27); 720 | kloop(20); kloop(13); kloop(6); kloop(7); kloop(14); kloop(21); 721 | kloop(28); kloop(35); kloop(42); kloop(49); kloop(56); kloop(57); 722 | kloop(50); kloop(43); kloop(36); kloop(29); kloop(22); kloop(15); 723 | kloop(23); kloop(30); kloop(37); kloop(44); kloop(51); kloop(58); 724 | kloop(59); kloop(52); kloop(45); kloop(38); kloop(31); kloop(39); 725 | kloop(46); kloop(53); kloop(60); kloop(61); kloop(54); kloop(47); 726 | kloop(55); kloop(62); kloop(63); 727 | 728 | if(r > 0) { 729 | code = acd->code[0]; 730 | size = acd->length[0]; 731 | EMIT_BITS(code, size); 732 | } 733 | 734 | /* Store the current state back in the global one */ 735 | state.bits = put_bits; 736 | state.buffer = put_buffer; 737 | } 738 | 739 | /** 740 | * Do a entropy encoding for a super block 
containing of four luminance blocks and 741 | * for the cb/cr one chrominance block each 742 | * 743 | * @param mcu_buffer pointer to the blocks 744 | * @param outputbuf the output buffer 745 | * @param state the entropy state 746 | */ 747 | void JPEGEncoder::encode_entropy(short *mcu_buffer[0x6], std::vector& outputbuf, entropy_state_t& state) 748 | { 749 | const static unsigned char mcu_membership[0x6] = {0x0, 0x0, 0x0, 0x0, 0x1, 0x2}; 750 | const static unsigned char table_index[0x6] = {0x0, 0x0, 0x0, 0x0, 0x1, 0x1}; 751 | size_t i, ci; 752 | 753 | /* Perform encoding on each block */ 754 | for(i = 0; i < 0x6; ++i) 755 | { 756 | ci = mcu_membership[i]; 757 | this->encode_entropy_single_block(mcu_buffer[i], table_index[i], state.last_dc_val[ci], outputbuf, state); 758 | state.last_dc_val[ci] = mcu_buffer[i][0]; 759 | } 760 | } 761 | 762 | 763 | 764 | 765 | 766 | // ========================================================================= 767 | // 768 | // Setup routine for the encoder 769 | // 770 | // ========================================================================= 771 | 772 | /** 773 | * Create the encoder 774 | * 775 | * @param quality the quality to use (clamped between 1 and 100) 776 | */ 777 | void JPEGEncoder::create_encoder(unsigned char quality) 778 | { 779 | this->set_quality_setting(quality); 780 | this->create_huffman_tables(); 781 | this->create_dct_division_tables(); 782 | this->create_derived_huffman_tables(); 783 | } 784 | 785 | /** 786 | * Set the quality in the quantification tables 787 | * 788 | * @param quality quality the quality to use (clamped between 1 and 100) 789 | */ 790 | void JPEGEncoder::set_quality_setting(unsigned char quality) 791 | { 792 | unsigned char q; 793 | if(quality <= 0) 794 | { 795 | q = 1; 796 | } 797 | else if(quality > 100) 798 | { 799 | q = 100; 800 | } 801 | else if(quality < 50) 802 | { 803 | q = 5000 / quality; 804 | } 805 | else 806 | { 807 | q = 200 - (quality << 0x1); 808 | } 809 | 810 | /* Create 
quantification tables for luminance and chrominance */ 811 | this->create_quant_table(0, q, std_luminance_quant_tbl); 812 | this->create_quant_table(1, q, std_chrominance_quant_tbl); 813 | } 814 | 815 | /** 816 | * Create the quantification tables for the encoder based on the given scale factor 817 | * 818 | * @param table_id the id of the quantification table in the encoder 819 | * @param scale the scale to use (quality setting) 820 | * @param base_table the base table to scale 821 | */ 822 | void JPEGEncoder::create_quant_table(int table_idx, unsigned char scale, const unsigned int *base_table) 823 | { 824 | quantification_table_t *tblptr; 825 | size_t i; 826 | long temp; 827 | 828 | tblptr = &this->m_quant_tbls[table_idx]; 829 | for(i = 0; i < 0x40; ++i) 830 | { 831 | temp = ((long)base_table[i] * scale + 50) / 100; 832 | if(temp <= 0) 833 | { 834 | temp = 1; 835 | } 836 | else if(temp > 0xFF) 837 | { 838 | temp = 0xFF; 839 | } 840 | tblptr->value[i] = (unsigned char)temp; 841 | } 842 | } 843 | 844 | /** 845 | * Create the huffman tables for the given encoder 846 | */ 847 | void JPEGEncoder::create_huffman_tables(void) 848 | { 849 | /* Luminance huffman table */ 850 | this->add_huffman_table(&this->m_dc_huff_tbls[0], bits_dc_luminance, value_dc_luminance); 851 | this->add_huffman_table(&this->m_ac_huff_tbls[0], bits_ac_luminance, value_ac_luminance); 852 | 853 | /* Chrominance huffman table */ 854 | this->add_huffman_table(&this->m_dc_huff_tbls[1], bits_dc_chrominance, value_dc_chrominance); 855 | this->add_huffman_table(&this->m_ac_huff_tbls[1], bits_ac_chrominance, value_ac_chrominance); 856 | } 857 | 858 | /** 859 | * Add the huffman table to the encoder, count the number of bits 860 | * and copy the number of values accordingly to the huffman table 861 | * 862 | * @param tblptr huffman table to use 863 | * @param bits length 864 | * @param values the values 865 | */ 866 | void JPEGEncoder::add_huffman_table(huffman_table_t *tblptr, const unsigned char *bits, 
const unsigned char *values) 867 | { 868 | size_t len; 869 | int n = 0; 870 | 871 | /* copy the bits */ 872 | memcpy(tblptr->bits, bits, sizeof(tblptr->bits)); 873 | 874 | /* count the length */ 875 | for(len = 0; len < 0x11; ++len) 876 | n += bits[len]; 877 | 878 | /* set table to zero */ 879 | memset(tblptr->value, 0, sizeof(tblptr->value)); 880 | 881 | /* copy length many values */ 882 | memcpy(tblptr->value, values, n * sizeof(unsigned char)); 883 | } 884 | 885 | /** 886 | * Create the dct divisor tables 887 | */ 888 | void JPEGEncoder::create_dct_division_tables(void) 889 | { 890 | size_t i; 891 | quantification_table_t *qtblptr; 892 | short *dtblptr; 893 | 894 | /* Y Channel */ 895 | qtblptr = &this->m_quant_tbls[0]; 896 | dtblptr = this->m_fdct_divisors[0]; 897 | for(i = 0; i < 0x40; ++i) 898 | { 899 | compute_reciprocal(qtblptr->value[i] << 0x3, &dtblptr[i]); 900 | } 901 | 902 | /* Cb/Cr channel, since they share the table the computation 903 | * needs to be done only once */ 904 | qtblptr = &this->m_quant_tbls[1]; 905 | dtblptr = this->m_fdct_divisors[1]; 906 | for(i = 0; i < 0x40; ++i) 907 | { 908 | compute_reciprocal(qtblptr->value[i] << 0x3, &dtblptr[i]); 909 | } 910 | } 911 | 912 | /** 913 | * Create the derived huffman tables 914 | */ 915 | void JPEGEncoder::create_derived_huffman_tables(void) 916 | { 917 | /* Y Channel */ 918 | this->derive_huffman_table(1, &this->m_dc_huff_tbls[0], &this->m_dc_derived_tbls[0]); 919 | this->derive_huffman_table(0, &this->m_ac_huff_tbls[0], &this->m_ac_derived_tbls[0]); 920 | 921 | /* Cb/Cr channel, since they share the table the computation 922 | * needs to be done only once */ 923 | this->derive_huffman_table(1, &this->m_dc_huff_tbls[1], &this->m_dc_derived_tbls[1]); 924 | this->derive_huffman_table(0, &this->m_ac_huff_tbls[1], &this->m_ac_derived_tbls[1]); 925 | } 926 | 927 | /** 928 | * Derive the huffman tables 929 | * 930 | * @param is_dc flag whether current table is dc or ac 931 | * @param table_idx table 
index to use 932 | * @param pointer to the derived huffman table to be filled 933 | */ 934 | void JPEGEncoder::derive_huffman_table(unsigned char is_dc, huffman_table_t *htblptr, derived_huffman_table_t *dhtblptr) 935 | { 936 | int p, i, l, lastp, si; 937 | char huffsize[0x101]; 938 | unsigned int huffcode[0x101]; 939 | unsigned int code; 940 | 941 | /* Figure C.1: make table of Huffman code length for each symbol */ 942 | p = 0; 943 | for (l = 1; l < 0x11; l++) 944 | { 945 | i = (int) htblptr->bits[l]; 946 | while (i--) 947 | { 948 | huffsize[p++] = (char) l; 949 | } 950 | } 951 | huffsize[p] = 0; 952 | lastp = p; 953 | 954 | /* Figure C.2: generate the codes themselves */ 955 | code = 0; 956 | si = huffsize[0]; 957 | p = 0; 958 | while (huffsize[p]) 959 | { 960 | while (((int) huffsize[p]) == si) 961 | { 962 | huffcode[p++] = code; 963 | code++; 964 | } 965 | code <<= 1; 966 | si++; 967 | } 968 | 969 | /* Figure C.3: generate encoding tables */ 970 | memset(dhtblptr->length, 0, sizeof(dhtblptr->length)); 971 | 972 | for (p = 0; p < lastp; ++p) 973 | { 974 | i = htblptr->value[p]; 975 | dhtblptr->code[i] = huffcode[p]; 976 | dhtblptr->length[i] = huffsize[p]; 977 | } 978 | } 979 | 980 | 981 | } /* end of namespace jpeg */ 982 | --------------------------------------------------------------------------------