├── test ├── vp8oclenc.exe └── CPU_kernels.cl ├── bin ├── vp8oclenc_linux32 ├── vp8oclenc_linux64 ├── vp8oclenc_win32.exe └── CPU_kernels.cl ├── src ├── mingw_build_example.bat ├── debug.h ├── encIO.h ├── loop_filter.h ├── entropy_host.h ├── vp8enc.h ├── inter_part.h └── vp8enc.cpp ├── makefile example ├── launch_example ├── README.md └── changelog.txt /test/vp8oclenc.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Aazmp/vp8oclenc/HEAD/test/vp8oclenc.exe -------------------------------------------------------------------------------- /bin/vp8oclenc_linux32: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Aazmp/vp8oclenc/HEAD/bin/vp8oclenc_linux32 -------------------------------------------------------------------------------- /bin/vp8oclenc_linux64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Aazmp/vp8oclenc/HEAD/bin/vp8oclenc_linux64 -------------------------------------------------------------------------------- /bin/vp8oclenc_win32.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Aazmp/vp8oclenc/HEAD/bin/vp8oclenc_win32.exe -------------------------------------------------------------------------------- /src/mingw_build_example.bat: -------------------------------------------------------------------------------- 1 | mkdir ../test & ^ 2 | g++ -m64 -I"C:/Program Files (x86)/AMD APP SDK/2.9/include" -L"C:/Program Files (x86)/AMD APP SDK/2.9/lib/x86_64" vp8enc.cpp entropy_host.cpp -lOpenCL -o ../test/vp8oclenc.exe & ^ 3 | cp CPU_kernels.cl ../test/ & ^ 4 | cp GPU_kernels.cl ../test/ -------------------------------------------------------------------------------- /makefile example: -------------------------------------------------------------------------------- 1 | # for amd app sdk on 
windows with default path 2 | g++ vp8enc.cpp entropy_host.cpp -I"" -L"" -lOpenCL -o vp8oclenc 3 | 4 | for AMD 5 | default AMD APP SDK path on windows: 6 | C:/Program Files (x86)/AMD APP 7 | on Linux: 8 | /opt/AMDAPP 9 | 10 | then it followed by /lib/x86 or /lib/x86_64 for libs 11 | and /include for headers 12 | 13 | 14 | Also renaming .cpp to .c is ok. 15 | Code is in C, .cpp is just for visual studio, that doesn't have C99 compiler. 16 | -------------------------------------------------------------------------------- /launch_example: -------------------------------------------------------------------------------- 1 | tested on AMD+AMD+Win7 only 2 | 3 | to encode YUV4MPEG2 file: 4 | 5 | this_encoder_binary -i input.y4m -o Quebec.ivf -qmin 0 -qmax 112 -g 450 -partitions 1 -threads 6 -SSIM-target 93 -altref-range 5 -print-info -gpu-preferred-platform-number 0 6 | 7 | to encode any file pipelining from FFMPEG 8 | 9 | ffmpeg -i input.any -f yuv4mpegpipe -pix_fmt yuv420p - | this_encoder_binary -i @ -o Quebec.ivf -qmin 0 -qmax 112 -g 450 -partitions 1 -threads 6 -SSIM-target 93 -altref-range 5 -print-info -gpu-preferred-platform-number 0 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | vp8oclenc 2 | ========= 3 | 4 | upd: 5 | now changelog in changelog.txt 6 | 7 | main: 8 | Don't know what to write here... 9 | This is a VP8 encoder. Simple and not effective. 10 | 11 | Used sources: 12 | http://www.webmproject.org/; http://multimedia.cx/eggs/category/vp8/; 13 | 14 | Uses OpenCL. CPU for coefficient partitions boolean coding, loop filter(if CPU is choosen for the task). 15 | GPU for motion vector search, transform for inter-frames, interpolation and loop filters(if GPU is chosen for the task). 16 | 17 | Launched only on AMD+AMD+Win7(x32). 
18 | And with some changes (\ to /, delte getch() or switch from conio.h to ncurses, delete io.h, delete setmode()) tested on AMD+AMD+Linux 32 and 64. 19 | Working binaries in "bin" with corresponding kernels. 20 | Strange part is: 21 | output files on linux 64 and 32 a little different (less then 2KB difference for 32.6 MB video). Maybe it's because of different precision with float SSIM (for x32 compiler can choose x87 command set and for x64 - SSE2). 22 | 23 | Intra coding is done in usual host code part. Has almost no error checking. 24 | 25 | -h gives a list of options 26 | 27 | If input_file is set as @ it will be set to stdin 28 | 29 | Features. 30 | All three reference frames: LAST(updated with each frame), ALTREF(updated with interval set in parameters), GOLDEN (only key). 31 | Motion estimation for 8x8 blocks, but with grouping them into 16x16 if they have equal vectors. 32 | MV search - hierarchical search with fullsearch in small area on downsampled areas (1/4, 1, 2, 4, 8, 16). 33 | Normal loop filter with loop_filter_level set according to quantizer value. Loop filter could be done on GPU and CPU (CPU is faster on almost all frame sizes, maybe 4K+ would benefit from GPU). 34 | Bicubic interpolation (in OpenCL 2D images but is software itself). 35 | Used probabilities are calculated and set in each frame. 36 | 37 | P.S. No benchmarks, because there is no need in them :) Quality of material can't compete with any good encoder. 
38 | 39 | 40 | -------------------------------------------------------------------------------- /src/debug.h: -------------------------------------------------------------------------------- 1 | static void open_dump_file() 2 | { 3 | dump_file.path = DUMPPATH; 4 | dump_file.handle = fopen(dump_file.path, "wb"); 5 | fwrite(frames.header, frames.header_sz, 1, dump_file.handle); 6 | } 7 | 8 | static void dump() 9 | { 10 | if (frames.frame_number > 1500) return; //disk space guard 11 | 12 | if (video.do_loop_filter_on_gpu) 13 | { 14 | device.state_gpu = clEnqueueReadBuffer(device.commandQueue1_gpu, device.reconstructed_frame_Y ,CL_TRUE, 0, video.wrk_frame_size_luma, frames.reconstructed_Y, 0, NULL, NULL); 15 | device.state_gpu = clEnqueueReadBuffer(device.commandQueue2_gpu, device.reconstructed_frame_U ,CL_TRUE, 0, video.wrk_frame_size_chroma, frames.reconstructed_U, 0, NULL, NULL); 16 | device.state_gpu = clEnqueueReadBuffer(device.commandQueue3_gpu, device.reconstructed_frame_V ,CL_TRUE, 0, video.wrk_frame_size_chroma, frames.reconstructed_V, 0, NULL, NULL); 17 | } 18 | else 19 | { 20 | device.state_cpu = clEnqueueReadBuffer(device.loopfilterY_commandQueue_cpu, device.cpu_frame_Y ,CL_TRUE, 0, video.wrk_frame_size_luma, frames.reconstructed_Y, 0, NULL, NULL); 21 | device.state_cpu = clEnqueueReadBuffer(device.loopfilterU_commandQueue_cpu, device.cpu_frame_U ,CL_TRUE, 0, video.wrk_frame_size_chroma, frames.reconstructed_U, 0, NULL, NULL); 22 | device.state_cpu = clEnqueueReadBuffer(device.loopfilterV_commandQueue_cpu, device.cpu_frame_V ,CL_TRUE, 0, video.wrk_frame_size_chroma, frames.reconstructed_V, 0, NULL, NULL); 23 | } 24 | char delimiter[] = "FRAME+"; 25 | delimiter[5] = 0x0A; 26 | fwrite(delimiter, 6, 1, dump_file.handle); 27 | 28 | int i; 29 | for (i = 0; i < video.src_height; ++i) 30 | fwrite(&frames.reconstructed_Y[i*video.src_width], video.src_width, 1, dump_file.handle); 31 | for (i = 0; i < video.src_height/2; ++i) 32 | 
fwrite(&frames.reconstructed_U[i*video.src_width/2], video.src_width/2, 1, dump_file.handle); 33 | for (i = 0; i < video.src_height/2; ++i) 34 | fwrite(&frames.reconstructed_V[i*video.src_width/2], video.src_width/2, 1, dump_file.handle); 35 | 36 | return; 37 | } -------------------------------------------------------------------------------- /changelog.txt: -------------------------------------------------------------------------------- 1 | ################## 2 | ### 30.11.2013 ### 3 | ################## 4 | 5 | 1) restructured code into directories 6 | 2) added linux x64 and x32 gcc builds (tested on the same AMD+AMD but + Mint15(x64)) 7 | 2') for linux builds code has been slightly modified (not in rep) like: change \ to /, delete conio.h::getch(), io.h(windows one), setmode(). 8 | 3) added win32 build from VS2008 9 | 4) kernels in bin directory are the kernels corresponding to binaries, not being developed ones. 10 | 5) "IO.h" renamed to "encIO.h" to move further from 11 | 12 | 13 | ################## 14 | ### 27.11.2013 ### 15 | ################## 16 | 17 | 1) Bugfix with local memory - now work group size is 256 instad of auto. 18 | 2) Bugfix with reseting vectors. 19 | Not all vector nets were set to zero which resulted in corrupted vectors and artifacts along bottom border. 20 | 3) Little change in vector dependencies that allow lower bitrates now. 21 | 22 | 23 | ################## 24 | ### 24.11.2013 ### 25 | ################## 26 | 27 | 1) Loop filter is done on CPU by default now. To filter on GPU use "-loop-filter-on-gpu" option. 28 | No point of using loop filter on GPU on video < 4K. On system FX6300+HD7850 loop filter on CPU gives +50% on 1920x1080 and +250% on 700x400. 29 | Loop filter can be overlapped with boolean encoding with options "-threads" > "-partitions" + 3. 30 | 31 | 2) All types like int32_t, uint8_t... replaced with cl_int, cl_uchar... 32 | 33 | 3) New options. "-partitions" is the old "-t" - number of partitions. 
"-threads" - thead number limit when doing loop filters and boolean coding at the same time. 34 | 35 | 4) Now GPU GPU kernels execute in parallel where possible (through 3 command queues) 36 | 37 | 38 | P.S. Tried to replace IF-ELSE parts of block tokenizing in boolean coder, got no speed boost 39 | (usually blocks have a lot of zero coeffs after inter prediction and this part takes noticeable time only in key frames, so it's not the thing that should be optimized) 40 | 41 | ################## 42 | ### 21.11.2013 ### 43 | ################## 44 | 45 | 1) Now codec uses image objects for reference frames. 46 | 47 | 2) Interpolation is made on the run but still in sofware (OpenCL doesn't offer bicubic interpolation) 48 | 49 | 3) GPU code divided in some smaller pieces (dct kernel, idct kernel, wht kernel...) no more long kernels => faster compilation and smaller memory usage. 50 | 51 | 4) all 3 reference buffers used: ALTREF, GOLDEN, LAST - for search 52 | 53 | Some results: 54 | 1) memory usage decreased by huge amount. now it fits almost every GPU device. 55 | 56 | 2) speed a little bit slower (but amount of search done is higher) 57 | 58 | 3) -cl-opt-disable slows down working with images by 13x times (maybe because there is no option to read 32 bit from one channel image in OpenCL language and only compiler could improve this, maybe not) 59 | 60 | 4) on E350 (HD6410) performance is veeeeery slow (again because of image usage) 61 | 62 | TODO: pure C optimization (LUT instead of IFs, etc...), asm, visual scene changeand effects detection... 63 | 64 | ################## 65 | ### sometime ### 66 | ################## 67 | 68 | 1) Now instead of one cl_kernel instance per kernel there are one for each argument set for each kernel. 69 | Init part of code become larger, but inter_transform more readable. 70 | Even if deleting clSetKernelArg(...) lower CPU usage, it can't be seen in CPU monitor. 71 | 72 | 2) Deleted bitrate check for key frame. It's not useful. 
73 | 74 | 3) Forgot about changelog. 75 | 76 | ################## 77 | ### 05.04.2014 ### 78 | ################## 79 | 80 | 1) Remembered that i have changelog. 81 | 82 | 2) Loop filter on GPU is broken. For now. 83 | Just haven't rewritten it's code to match changes. 84 | Loop filter on CPU is working. 85 | 86 | 3) Replaced complex macroblock buffer with multiple theme buffers (vectors, coefficients, non_zero counts, parts, segment ids...). 87 | Now unnecessary copies are avoided (example: OpenCL CPU device needs coefficient, but doesn't need vectors). 88 | Also GPU read/write has become a little bit more coalesced (coefficients are still in one uge chunk, so not everywhere). 89 | 90 | 4) Coefficients and reconstructed frames are copied from/to OpenCL GPU device, but mapped for CPU device. 91 | Less memory operations (not a major factor, other operations dominate in time consuption). 92 | 93 | 5) Little change in vector search.Now always check for (0;0) vector. 94 | Now hierarchical search does not trick blocks with no real movement. 95 | Note: only (0;0) is checked, no small areas around; 96 | As result: little-little bit better quality, little bit lesser size at the same time on some of test videos 97 | 98 | 6) CPU host part still not optimized even a bit. 
99 | -------------------------------------------------------------------------------- /src/encIO.h: -------------------------------------------------------------------------------- 1 | static void gather_frame() 2 | { 3 | // get info about partition sizes 4 | device.state_cpu = clEnqueueReadBuffer(device.boolcoder_commandQueue_cpu, device.partitions_sizes ,CL_TRUE, 0, 8*sizeof(cl_int), frames.partition_sizes, 0, NULL, NULL); 5 | // write partition size data 6 | // each size = 3 byte in little endian (LSB first) 7 | cl_int i; 8 | for (i = 0; i < (video.number_of_partitions-1); ++i) // size of last is not written 9 | { 10 | cl_uint psize = frames.partition_sizes[i]; 11 | cl_uchar psize_b = (cl_uchar)(psize & 0xff); 12 | frames.encoded_frame[frames.encoded_frame_size] = psize_b; 13 | ++frames.encoded_frame_size; 14 | psize_b = (cl_uchar)((psize >> 8) & 0xff); 15 | frames.encoded_frame[frames.encoded_frame_size] = psize_b; 16 | ++frames.encoded_frame_size; 17 | psize_b = (cl_uchar)((psize >> 16) & 0xff); 18 | frames.encoded_frame[frames.encoded_frame_size] = psize_b; 19 | ++frames.encoded_frame_size; 20 | } 21 | //copy coefficient-partitions 22 | for (i = 0; i < video.number_of_partitions; ++i) 23 | { 24 | device.state_gpu = clEnqueueReadBuffer(device.boolcoder_commandQueue_cpu, device.partitions, CL_TRUE, i*video.partition_step, frames.partition_sizes[i], 25 | &frames.encoded_frame[frames.encoded_frame_size], 0, NULL, NULL); 26 | frames.encoded_frame_size += frames.partition_sizes[i]; 27 | } 28 | // now we got encoded frame 29 | return; 30 | } 31 | 32 | static void write_output_file() 33 | { 34 | // clock start in gather frame 35 | // write ivf frame header (12 bytes) LITTLE ENDIAN 36 | cl_uchar byte; 37 | cl_ulong timestamp; 38 | // 0-3 frame(vp8 frame) size 39 | byte = (cl_uchar)(frames.encoded_frame_size & 0xff); 40 | fwrite(&byte, 1, 1, output_file.handle); 41 | byte = (cl_uchar)((frames.encoded_frame_size >> 8) & 0xff); 42 | fwrite(&byte, 1, 1, 
output_file.handle); 43 | byte = (cl_uchar)((frames.encoded_frame_size >> 16) & 0xff); 44 | fwrite(&byte, 1, 1, output_file.handle); 45 | byte = (cl_uchar)((frames.encoded_frame_size >> 24) & 0xff); 46 | fwrite(&byte, 1, 1, output_file.handle); 47 | // 64bit timestamp 48 | timestamp = ((cl_ulong)(frames.frame_number))*((cl_ulong)video.timestep); 49 | byte = (cl_uchar)(timestamp & 0xff); 50 | fwrite(&byte, 1, 1, output_file.handle); 51 | byte = (cl_uchar)((timestamp >> 8) & 0xff); 52 | fwrite(&byte, 1, 1, output_file.handle); 53 | byte = (cl_uchar)((timestamp >> 16) & 0xff); 54 | fwrite(&byte, 1, 1, output_file.handle); 55 | byte = (cl_uchar)((timestamp >> 24) & 0xff); 56 | fwrite(&byte, 1, 1, output_file.handle); 57 | byte = (cl_uchar)((timestamp >> 32) & 0xff); 58 | fwrite(&byte, 1, 1, output_file.handle); 59 | byte = (cl_uchar)((timestamp >> 40) & 0xff); 60 | fwrite(&byte, 1, 1, output_file.handle); 61 | byte = (cl_uchar)((timestamp >> 48) & 0xff); 62 | fwrite(&byte, 1, 1, output_file.handle); 63 | byte = (cl_uchar)((timestamp >> 56) & 0xff); 64 | fwrite(&byte, 1, 1, output_file.handle); 65 | // now print frame 66 | fwrite(frames.encoded_frame, 1, frames.encoded_frame_size, output_file.handle); 67 | 68 | return; 69 | } 70 | 71 | static void write_output_header() 72 | { 73 | fseek (output_file.handle, 0, SEEK_SET); 74 | // header size 32bytes LITTLE ENDIAN 75 | cl_uchar byte; 76 | // 0-3 "DKIF" 77 | byte = 'D'; fwrite(&byte, 1, 1, output_file.handle); 78 | byte = 'K'; fwrite(&byte, 1, 1, output_file.handle); 79 | byte = 'I'; fwrite(&byte, 1, 1, output_file.handle); 80 | byte = 'F'; fwrite(&byte, 1, 1, output_file.handle); 81 | // 4-5 version (only 0 allowed) 82 | byte = 0; fwrite(&byte, 1, 1, output_file.handle); 83 | fwrite(&byte, 1, 1, output_file.handle); 84 | // 6-7 header length in bytes 85 | byte = 32; fwrite(&byte, 1, 1, output_file.handle); 86 | byte = 0; fwrite(&byte, 1, 1, output_file.handle); 87 | // 9-11 "VP80" 88 | byte = 'V'; fwrite(&byte, 1, 1, 
output_file.handle); 89 | byte = 'P'; fwrite(&byte, 1, 1, output_file.handle); 90 | byte = '8'; fwrite(&byte, 1, 1, output_file.handle); 91 | byte = '0'; fwrite(&byte, 1, 1, output_file.handle); 92 | // 12-13 width 93 | byte = (cl_uchar)(video.dst_width & 0xff); 94 | fwrite(&byte, 1, 1, output_file.handle); 95 | byte = (cl_uchar)((video.dst_width >> 8) & 0xff); 96 | fwrite(&byte, 1, 1, output_file.handle); 97 | // 14-15 height 98 | byte = (cl_uchar)(video.dst_height & 0xff); 99 | fwrite(&byte, 1, 1, output_file.handle); 100 | byte = (cl_uchar)((video.dst_height >> 8) & 0xff); 101 | fwrite(&byte, 1, 1, output_file.handle); 102 | // 16-19 framerate 103 | cl_uint fr = video.framerate; 104 | byte = (cl_uchar)(fr & 0xff); 105 | fwrite(&byte, 1, 1, output_file.handle); 106 | byte = (cl_uchar)((fr >> 8) & 0xff); 107 | fwrite(&byte, 1, 1, output_file.handle); 108 | byte = (cl_uchar)((fr >> 16) & 0xff); 109 | fwrite(&byte, 1, 1, output_file.handle); 110 | byte = (cl_uchar)((fr >> 24) & 0xff); 111 | fwrite(&byte, 1, 1, output_file.handle); 112 | // 20-23 timescale 113 | byte = (cl_uchar)(video.timescale & 0xff); 114 | fwrite(&byte, 1, 1, output_file.handle); 115 | byte = (cl_uchar)((video.timescale >> 8) & 0xff); 116 | fwrite(&byte, 1, 1, output_file.handle); 117 | byte = (cl_uchar)((video.timescale >> 16) & 0xff); 118 | fwrite(&byte, 1, 1, output_file.handle); 119 | byte = (cl_uchar)((video.timescale >> 24) & 0xff); 120 | fwrite(&byte, 1, 1, output_file.handle); 121 | // 24-27 frame count 122 | ++frames.frame_number; 123 | byte = (cl_uchar)(frames.frame_number & 0xff); 124 | fwrite(&byte, 1, 1, output_file.handle); 125 | byte = (cl_uchar)((frames.frame_number >> 8) & 0xff); 126 | fwrite(&byte, 1, 1, output_file.handle); 127 | byte = (cl_uchar)((frames.frame_number >> 16) & 0xff); 128 | fwrite(&byte, 1, 1, output_file.handle); 129 | byte = (cl_uchar)((frames.frame_number >> 24) & 0xff); 130 | fwrite(&byte, 1, 1, output_file.handle); 131 | --frames.frame_number; 132 | // 
28-32 not using 133 | byte = 0; 134 | fwrite(&byte, 1, 1, output_file.handle); 135 | fwrite(&byte, 1, 1, output_file.handle); 136 | fwrite(&byte, 1, 1, output_file.handle); 137 | fwrite(&byte, 1, 1, output_file.handle); 138 | return; 139 | } 140 | 141 | // Copies src YUV420 planes into the padded working buffers: each row is extended to the
// right with its last pixel, then the last row pair is replicated downward to wrk_height.
// Assumes dst_width/height == src_width/height (no resize path). Returns 1 on success.
static int copy_with_padding() 142 | { 143 | int i, j; 144 | cl_uchar *srcY, *srcU, *srcV, *dstY, *dstU, *dstV; 145 | cl_uchar ext_pixelY, ext_pixelU, ext_pixelV; 146 | //first line copy 147 | srcY = frames.tmp_Y; srcU = frames.tmp_U; srcV = frames.tmp_V; 148 | dstY = frames.current_Y; dstU = frames.current_U; dstV = frames.current_V; 149 | int wrk_width_chroma = video.wrk_width>>1; 150 | int src_width_chroma = video.src_width>>1; 151 | 152 | for (i = 0; i < video.src_height; i+=2) 153 | { 154 | // two luma lines, one chroma and one chroma line at step 155 | memcpy(dstY, srcY, video.src_width); 156 | ext_pixelY = srcY[video.src_width-1]; 157 | for (j = video.src_width; j < video.wrk_width; ++j) // extend to the right 158 | dstY[j] = ext_pixelY; 159 | srcY += video.src_width; // dst_width/height == src_width/height if this function called 160 | dstY += video.wrk_width; 161 | 162 | memcpy(dstY, srcY, video.src_width); 163 | ext_pixelY = srcY[video.src_width-1]; 164 | for (j = video.src_width; j < video.wrk_width; ++j) // extend to the right 165 | dstY[j] = ext_pixelY; 166 | srcY += video.src_width; 167 | dstY += video.wrk_width; 168 | 169 | memcpy(dstU, srcU, src_width_chroma); 170 | ext_pixelU = srcU[src_width_chroma-1]; 171 | for (j = src_width_chroma; j < wrk_width_chroma; ++j) // extend to the right 172 | dstU[j] = ext_pixelU; 173 | srcU += (src_width_chroma); 174 | dstU += (wrk_width_chroma); 175 | 176 | memcpy(dstV, srcV, src_width_chroma); 177 | ext_pixelV = srcV[src_width_chroma-1]; // fix: was srcU — read edge pixel from the V plane 178 | for (j = src_width_chroma; j < wrk_width_chroma; ++j) // extend to the right 179 | dstV[j] = ext_pixelV; // fix: was dstU — pad the V row, not the (already-advanced) U row 180 | srcV += (src_width_chroma); 181 | dstV += (wrk_width_chroma); 182 | } 183 | // now copy last line to all lower
lines, so increment only for dst 184 | srcY = dstY - video.wrk_width; 185 | srcU = dstU - wrk_width_chroma; 186 | srcV = dstV - wrk_width_chroma; 187 | 188 | for (i = video.src_height; i < video.wrk_height; i+=2) 189 | { 190 | memcpy(dstY, srcY, video.wrk_width); 191 | dstY += video.wrk_width; 192 | memcpy(dstY, srcY, video.wrk_width); 193 | dstY += video.wrk_width; 194 | memcpy(dstU, srcU, wrk_width_chroma); 195 | dstU += (wrk_width_chroma); 196 | memcpy(dstV, srcV, wrk_width_chroma); 197 | dstV += (wrk_width_chroma); 198 | } 199 | 200 | return 1; 201 | } 202 | 203 | 204 | static int get_yuv420_frame() 205 | { 206 | // if there is padding, could be just pointer switch 207 | if (frames.frame_number > 0) { 208 | memcpy(frames.last_U, frames.current_U, video.wrk_frame_size_chroma); 209 | memcpy(frames.last_V, frames.current_V, video.wrk_frame_size_chroma); 210 | } 211 | 212 | cl_int src_frame_size_full = video.src_frame_size_luma + (video.src_frame_size_chroma << 1); 213 | cl_int i, j, fragment_size = src_frame_size_full; 214 | 215 | i = (cl_int)fread(frames.input_pack, sizeof(cl_uchar), (src_frame_size_full % fragment_size), input_file.handle); 216 | while (i < src_frame_size_full) 217 | { 218 | j = (cl_int)fread(frames.input_pack + i, sizeof(cl_uchar), fragment_size, input_file.handle); 219 | if (j < fragment_size) 220 | return 0; 221 | i += j; 222 | } 223 | 224 | frames.tmp_Y = frames.input_pack; 225 | frames.tmp_U = frames.tmp_Y + video.src_frame_size_luma; 226 | frames.tmp_V = frames.tmp_U + video.src_frame_size_chroma; 227 | 228 | if ((video.src_height == video.dst_height) && (video.src_width == video.dst_width)) //== no resize 229 | { 230 | if ((video.wrk_height == video.dst_height) && (video.wrk_width == video.dst_width)) //== no padding 231 | { 232 | // then our buffers already continious, no need in paddings, just assign raw data to current 233 | frames.current_Y = frames.tmp_Y; 234 | frames.current_U = frames.tmp_U; 235 | frames.current_V = frames.tmp_V;
236 | } 237 | else 238 | { 239 | // malloc for current_YUV was done at runtime 240 | copy_with_padding(); //from tmp_yuv to current_yuv 241 | } 242 | } 243 | char buf[6]; 244 | i = fread(buf, sizeof(cl_uchar), 6, input_file.handle); 245 | if ((i > 0) && ((buf[0] != 'F') || (buf[4] != 'E'))) { 246 | printf("broken stream!\n"); 247 | return -1; 248 | } 249 | 250 | //memset(frames.current_Y, 128, video.wrk_frame_size_luma); 251 | //memset(frames.current_U, 128, video.wrk_frame_size_chroma); //make black and white 252 | //memset(frames.current_V, 128, video.wrk_frame_size_chroma); 253 | return 1; 254 | } -------------------------------------------------------------------------------- /src/loop_filter.h: -------------------------------------------------------------------------------- 1 | static void prepare_on_gpu() 2 | { 3 | int mb_num; 4 | frames.skip_prob = 0; 5 | 6 | // we need to grab transformed data from host memory 7 | device.state_gpu = clEnqueueWriteBuffer(device.commandQueue1_gpu, device.macroblock_coeffs_gpu, CL_FALSE, 0, video.mb_count*sizeof(macroblock_coeffs_t), frames.MB, 0, NULL, NULL); 8 | device.gpu_work_items_per_dim[0] = video.mb_count; 9 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.prepare_filter_mask, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 10 | // need to return info about non_zero coeffs 11 | device.state_gpu = clEnqueueReadBuffer(device.commandQueue1_gpu, device.macroblock_non_zero_coeffs_gpu ,CL_TRUE, 0, video.mb_count*sizeof(cl_int), frames.MB_non_zero_coeffs, 0, NULL, NULL); 12 | 13 | for(mb_num = 0; mb_num < video.mb_count; ++mb_num) 14 | if (frames.MB_non_zero_coeffs[mb_num] > 0) 15 | ++frames.skip_prob; 16 | 17 | frames.skip_prob *= 256; 18 | frames.skip_prob /= video.mb_count; 19 | frames.skip_prob = (frames.skip_prob > 254) ? 254 : frames.skip_prob; 20 | frames.skip_prob = (frames.skip_prob < 2) ? 
2 : frames.skip_prob; 21 | //don't do this => frames.skip_prob = 255 - frames.skip_prob; incorrect desription of prob_skip_false 22 | return; 23 | } 24 | 25 | static void prepare_on_cpu() 26 | { 27 | int mb_num; 28 | frames.skip_prob = 0; 29 | 30 | device.gpu_work_items_per_dim[0] = 4; 31 | device.gpu_work_group_size_per_dim[0] = 1; 32 | device.state_cpu = clEnqueueNDRangeKernel(device.loopfilterY_commandQueue_cpu, device.prepare_filter_mask, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 33 | device.state_cpu = clFinish(device.loopfilterY_commandQueue_cpu); 34 | // need to return info about non_zero coeffs 35 | device.state_cpu = clEnqueueReadBuffer(device.boolcoder_commandQueue_cpu, device.macroblock_non_zero_coeffs_cpu ,CL_TRUE, 0, video.mb_count*sizeof(cl_int), frames.MB_non_zero_coeffs, 0, NULL, NULL); 36 | 37 | for(mb_num = 0; mb_num < video.mb_count; ++mb_num) 38 | if (frames.MB_non_zero_coeffs[mb_num] > 0) 39 | ++frames.skip_prob; 40 | 41 | frames.skip_prob *= 256; 42 | frames.skip_prob /= video.mb_count; 43 | frames.skip_prob = (frames.skip_prob > 254) ? 254 : frames.skip_prob; 44 | frames.skip_prob = (frames.skip_prob < 2) ? 
2 : frames.skip_prob; 45 | return; 46 | } 47 | 48 | static void prepare_filter_mask_and_non_zero_coeffs() 49 | { 50 | if (video.do_loop_filter_on_gpu) 51 | prepare_on_gpu(); 52 | else 53 | prepare_on_cpu(); 54 | return; 55 | } 56 | 57 | static void do_loop_filter_on_gpu() 58 | { 59 | if (video.GOP_size < 2) return; 60 | 61 | if (frames.replaced > 0) 62 | { 63 | device.state_gpu = clEnqueueWriteBuffer(device.commandQueue1_gpu, device.reconstructed_frame_Y, CL_FALSE, 0, video.wrk_frame_size_luma, frames.reconstructed_Y, 0, NULL, NULL); 64 | device.state_gpu = clEnqueueWriteBuffer(device.commandQueue2_gpu, device.reconstructed_frame_U, CL_FALSE, 0, video.wrk_frame_size_chroma, frames.reconstructed_U, 0, NULL, NULL); 65 | device.state_gpu = clEnqueueWriteBuffer(device.commandQueue3_gpu, device.reconstructed_frame_V, CL_FALSE, 0, video.wrk_frame_size_chroma, frames.reconstructed_V, 0, NULL, NULL); 66 | } 67 | 68 | cl_int stage, mb_size, plane_width; 69 | for (stage = 0; stage < (video.mb_width + (video.mb_height-1)*2); ++stage) 70 | { 71 | device.state_gpu = clSetKernelArg(device.normal_loop_filter_MBH, 5, sizeof(cl_int), &stage); 72 | device.state_gpu = clSetKernelArg(device.normal_loop_filter_MBV, 5, sizeof(cl_int), &stage); 73 | device.gpu_work_items_per_dim[0] = video.mb_height*16; 74 | 75 | mb_size = 16; 76 | plane_width = video.wrk_width; 77 | device.state_gpu = clSetKernelArg(device.normal_loop_filter_MBH, 4, sizeof(cl_int), &mb_size); 78 | device.state_gpu = clSetKernelArg(device.normal_loop_filter_MBH, 1, sizeof(cl_int), &plane_width); 79 | device.state_gpu = clSetKernelArg(device.normal_loop_filter_MBH, 0, sizeof(cl_mem), &device.reconstructed_frame_Y); 80 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.normal_loop_filter_MBH, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 81 | if (device.state_gpu != 0) printf(">error while deblocking : %d", device.state_gpu); 82 | device.state_gpu = 
ifFlush(device.commandQueue1_gpu); 83 | if (device.state_gpu != 0) printf(">error while deblocking : %d", device.state_gpu); 84 | device.gpu_work_items_per_dim[0] = video.mb_height*8; 85 | mb_size = 8; 86 | plane_width = video.wrk_width/2; 87 | device.state_gpu = clSetKernelArg(device.normal_loop_filter_MBH, 4, sizeof(cl_int), &mb_size); 88 | device.state_gpu = clSetKernelArg(device.normal_loop_filter_MBH, 1, sizeof(cl_int), &plane_width); 89 | device.state_gpu = clSetKernelArg(device.normal_loop_filter_MBH, 0, sizeof(cl_mem), &device.reconstructed_frame_U); 90 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.normal_loop_filter_MBH, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 91 | if (device.state_gpu != 0) printf(">error while deblocking : %d", device.state_gpu); 92 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 93 | if (device.state_gpu != 0) printf(">error while deblocking : %d", device.state_gpu); 94 | device.state_gpu = clSetKernelArg(device.normal_loop_filter_MBH, 0, sizeof(cl_mem), &device.reconstructed_frame_V); 95 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue3_gpu, device.normal_loop_filter_MBH, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 96 | if (device.state_gpu != 0) printf(">error while deblocking : %d", device.state_gpu); 97 | device.state_gpu = ifFlush(device.commandQueue3_gpu); 98 | if (device.state_gpu != 0) printf(">error while deblocking : %d", device.state_gpu); 99 | 100 | device.gpu_work_items_per_dim[0] = video.mb_height*4; 101 | mb_size = 16; 102 | plane_width = video.wrk_width; 103 | device.state_gpu = clSetKernelArg(device.normal_loop_filter_MBV, 4, sizeof(cl_int), &mb_size); 104 | device.state_gpu = clSetKernelArg(device.normal_loop_filter_MBV, 1, sizeof(cl_int), &plane_width); 105 | device.state_gpu = clSetKernelArg(device.normal_loop_filter_MBV, 0, sizeof(cl_mem), &device.reconstructed_frame_Y); 106 | device.state_gpu = 
clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.normal_loop_filter_MBV, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 107 | if (device.state_gpu != 0) printf(">error while deblocking : %d", device.state_gpu); 108 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 109 | if (device.state_gpu != 0) printf(">error while deblocking : %d", device.state_gpu); 110 | device.gpu_work_items_per_dim[0] = video.mb_height*2; 111 | mb_size = 8; 112 | plane_width = video.wrk_width/2; 113 | device.state_gpu = clSetKernelArg(device.normal_loop_filter_MBV, 4, sizeof(cl_int), &mb_size); 114 | device.state_gpu = clSetKernelArg(device.normal_loop_filter_MBV, 1, sizeof(cl_int), &plane_width); 115 | device.state_gpu = clSetKernelArg(device.normal_loop_filter_MBV, 0, sizeof(cl_mem), &device.reconstructed_frame_U); 116 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.normal_loop_filter_MBV, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 117 | if (device.state_gpu != 0) printf(">error while deblocking : %d", device.state_gpu); 118 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 119 | if (device.state_gpu != 0) printf(">error while deblocking : %d", device.state_gpu); 120 | device.state_gpu = clSetKernelArg(device.normal_loop_filter_MBV, 0, sizeof(cl_mem), &device.reconstructed_frame_V); 121 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue3_gpu, device.normal_loop_filter_MBV, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 122 | if (device.state_gpu != 0) printf(">error while deblocking : %d", device.state_gpu); 123 | device.state_gpu = ifFlush(device.commandQueue3_gpu); 124 | if (device.state_gpu != 0) printf(">error while deblocking : %d", device.state_gpu); 125 | 126 | if (((stage + 1) % 16) == 0) 127 | { 128 | device.state_gpu = finalFlush(device.commandQueue1_gpu); 129 | device.state_gpu = finalFlush(device.commandQueue2_gpu); 130 | device.state_gpu = 
finalFlush(device.commandQueue3_gpu); 131 | device.state_gpu = clFinish(device.commandQueue1_gpu); 132 | device.state_gpu = clFinish(device.commandQueue2_gpu); 133 | device.state_gpu = clFinish(device.commandQueue3_gpu); 134 | } 135 | } 136 | 137 | return; 138 | } 139 | 140 | static void do_loop_filter_on_cpu() 141 | { 142 | // Y 143 | device.cpu_work_items_per_dim[0] = 1; 144 | device.state_cpu = clEnqueueNDRangeKernel(device.loopfilterY_commandQueue_cpu, device.loop_filter_frame_luma, 1, NULL, device.cpu_work_items_per_dim, NULL, 0, NULL, NULL); 145 | if (frames.threads_free > 1) { 146 | --frames.threads_free; 147 | device.state_cpu = ifFlush(device.loopfilterY_commandQueue_cpu); 148 | } 149 | else { 150 | device.state_cpu = clFinish(device.loopfilterY_commandQueue_cpu); 151 | frames.threads_free =video.thread_limit; 152 | } 153 | if (device.state_cpu != 0) 154 | printf(">error while deblocking : %d", device.state_cpu); 155 | // U 156 | device.state_cpu = clEnqueueNDRangeKernel(device.loopfilterU_commandQueue_cpu, device.loop_filter_frame_chroma_U, 1, NULL, device.cpu_work_items_per_dim, NULL, 0, NULL, NULL); 157 | if (frames.threads_free > 1) { 158 | --frames.threads_free; 159 | device.state_cpu = ifFlush(device.loopfilterU_commandQueue_cpu); 160 | } 161 | else { 162 | device.state_cpu = ifFlush(device.loopfilterY_commandQueue_cpu); 163 | device.state_cpu = clFinish(device.loopfilterU_commandQueue_cpu); 164 | frames.threads_free =video.thread_limit; 165 | } 166 | if (device.state_cpu != 0) 167 | printf(">error while deblocking : %d", device.state_cpu); 168 | // V 169 | device.state_cpu = clEnqueueNDRangeKernel(device.loopfilterV_commandQueue_cpu, device.loop_filter_frame_chroma_V, 1, NULL, device.cpu_work_items_per_dim, NULL, 0, NULL, NULL); 170 | if (frames.threads_free > 1) { 171 | --frames.threads_free; 172 | device.state_cpu = ifFlush(device.loopfilterV_commandQueue_cpu); 173 | } 174 | else { 175 | device.state_cpu = 
ifFlush(device.loopfilterY_commandQueue_cpu); 176 | device.state_cpu = ifFlush(device.loopfilterU_commandQueue_cpu); 177 | device.state_cpu = clFinish(device.loopfilterV_commandQueue_cpu); 178 | frames.threads_free = video.thread_limit; 179 | } 180 | if (device.state_cpu != 0) printf(">error while deblocking : %d", device.state_cpu); 181 | 182 | return; 183 | } 184 | 185 | static void do_loop_filter() 186 | { 187 | if (video.do_loop_filter_on_gpu) do_loop_filter_on_gpu(); 188 | else do_loop_filter_on_cpu(); 189 | return; 190 | } -------------------------------------------------------------------------------- /src/entropy_host.h: -------------------------------------------------------------------------------- 1 | typedef cl_uchar Prob; 2 | typedef cl_char tree_index; 3 | typedef const tree_index Tree[]; 4 | 5 | typedef struct { 6 | int bits; 7 | int size; 8 | } encoding_symbol; 9 | 10 | //-------------------------------------------------------------------------------------------------------------------- 11 | 12 | typedef enum 13 | { 14 | DC_PRED, /* predict DC using row above and column to the left */ 15 | V_PRED, /* predict rows using row above */ 16 | H_PRED, /* predict columns using column to the left */ 17 | TM_PRED, /* propagate second differences a la "True Motion" */ 18 | B_PRED, /* each Y subblock is independently predicted */ 19 | num_uv_modes = B_PRED, /* first four modes apply to chroma */ 20 | num_ymodes /* all modes apply to luma */ 21 | } intra_mbmode; 22 | typedef enum 23 | { 24 | B_DC_PRED, /* predict DC using row above and column to the left */ 25 | B_TM_PRED, /* propagate second differences a la "True Motion" */ 26 | B_VE_PRED, /* predict rows using row above */ 27 | B_HE_PRED, /* predict columns using column to the left */ 28 | B_LD_PRED, /* southwest (left and down) 45 degree diagonal prediction */ 29 | B_RD_PRED, /* southeast (right and down) "" */ 30 | B_VR_PRED, /* SSE (vertical right) diagonal prediction */ 31 | B_VL_PRED, /* SSW (vertical 
left) "" */ 32 | B_HD_PRED, /* ESE (horizontal down) "" */ 33 | B_HU_PRED, /* ENE (horizontal up) "" */ 34 | num_intra_bmodes 35 | } 36 | intra_bmode; 37 | const tree_index mb_segment_tree [2 * (4-1)] = { 2, 4, /* root: "0", "1" subtrees */ 38 | -0, -1, /* "00" = 0th value, "01" = 1st value */ 39 | -2, -3 /* "10" = 2nd value, "11" = 3rd value */ 40 | }; 41 | //const tree_index ymode_tree [2 * (num_ymodes - 1)] = { -DC_PRED, 2, /* root: DC_PRED = "0", "1" subtree */ 42 | // 4, 6, /* "1" subtree has 2 descendant subtrees */ 43 | // -V_PRED, -H_PRED, /* "10" subtree: V_PRED = "100", H_PRED = "101" */ 44 | // -TM_PRED, -B_PRED /* "11" subtree: TM_PRED = "110", B_PRED = "111" */ 45 | // }; 46 | const tree_index kf_ymode_tree[2*(num_ymodes-1)] = {-B_PRED, 2, /* root: B_PRED = "0", "1" subtree */ 47 | 4, 6, /* "1" subtree has 2 descendant subtrees */ 48 | -DC_PRED, -V_PRED, /* "10" subtree: DC_PRED = "100", V_PRED = "101" */ 49 | -H_PRED, -TM_PRED /* "11" subtree: H_PRED = "110", TM_PRED = "111" */ 50 | }; 51 | const tree_index ymode_tree [2 * (num_ymodes - 1)] = { -DC_PRED, 2, /* root: DC_PRED = "0", "1" subtree */ 52 | 4, 6, /* "1" subtree has 2 descendant subtrees */ 53 | -V_PRED, -H_PRED, /* "10" subtree: V_PRED = "100",H_PRED = "101" */ 54 | -TM_PRED, -B_PRED /* "11" subtree: TM_PRED = "110",B_PRED = "111" */ 55 | }; 56 | const tree_index uv_mode_tree [2 * (num_uv_modes - 1)] = { -DC_PRED, 2, /* root: DC_PRED = "0", "1" subtree */ 57 | -V_PRED, 4, /* "1" subtree: V_PRED = "10", "11" subtree */ 58 | -H_PRED, -TM_PRED /* "11" subtree: H_PRED = "110", TM_PRED = "111" */ 59 | }; 60 | const tree_index bmode_tree [2 * (num_intra_bmodes - 1)] = {-B_DC_PRED, 2, /* B_DC_PRED = "0" */ 61 | -B_TM_PRED, 4, /* B_TM_PRED = "10" */ 62 | -B_VE_PRED, 6, /* B_VE_PRED = "110" */ 63 | 8, 12, 64 | -B_HE_PRED, 10, /* B_HE_PRED = "11100" */ 65 | -B_RD_PRED, -B_VR_PRED, /* B_RD_PRED = "111010", B_VR_PRED = "111011" */ 66 | -B_LD_PRED, 14, /* B_LD_PRED = "111110" */ 67 | -B_VL_PRED, 16, /* 
B_VL_PRED = "1111110" */ 68 | -B_HD_PRED, -B_HU_PRED /* HD = "11111110", HU = "11111111" */ 69 | }; 70 | Prob new_segment_prob[4] = { 128, 128, 128, 128 }; 71 | const Prob kf_ymode_prob [num_ymodes - 1] = { 145, 156, 163, 128}; 72 | const Prob ymode_prob [num_ymodes - 1] = { 112, 86, 140, 37}; //default 73 | const Prob B_ymode_prob [num_ymodes - 1] = { 0, 0, 0, 0}; //adapted fo B_PRED = "111" 74 | const Prob kf_uv_mode_prob [num_uv_modes - 1] = { 142, 114, 183}; 75 | const Prob uv_mode_prob [num_uv_modes - 1] = { 162, 101, 204}; // default 76 | const Prob TM_uv_mode_prob [num_uv_modes - 1] = { 0, 0, 0}; // adapted for TM_PRED = "111" 77 | const Prob kf_bmode_prob [num_intra_bmodes][num_intra_bmodes][num_intra_bmodes-1] = 78 | { 79 | { 80 | { 231, 120, 48, 89, 115, 113, 120, 152, 112}, 81 | { 152, 179, 64, 126, 170, 118, 46, 70, 95}, 82 | { 175, 69, 143, 80, 85, 82, 72, 155, 103}, 83 | { 56, 58, 10, 171, 218, 189, 17, 13, 152}, 84 | { 144, 71, 10, 38, 171, 213, 144, 34, 26}, 85 | { 114, 26, 17, 163, 44, 195, 21, 10, 173}, 86 | { 121, 24, 80, 195, 26, 62, 44, 64, 85}, 87 | { 170, 46, 55, 19, 136, 160, 33, 206, 71}, 88 | { 63, 20, 8, 114, 114, 208, 12, 9, 226}, 89 | { 81, 40, 11, 96, 182, 84, 29, 16, 36} 90 | }, 91 | { 92 | { 134, 183, 89, 137, 98, 101, 106, 165, 148}, 93 | { 72, 187, 100, 130, 157, 111, 32, 75, 80}, 94 | { 66, 102, 167, 99, 74, 62, 40, 234, 128}, 95 | { 41, 53, 9, 178, 241, 141, 26, 8, 107}, 96 | { 104, 79, 12, 27, 217, 255, 87, 17, 7}, 97 | { 74, 43, 26, 146, 73, 166, 49, 23, 157}, 98 | { 65, 38, 105, 160, 51, 52, 31, 115, 128}, 99 | { 87, 68, 71, 44, 114, 51, 15, 186, 23}, 100 | { 47, 41, 14, 110, 182, 183, 21, 17, 194}, 101 | { 66, 45, 25, 102, 197, 189, 23, 18, 22} 102 | }, 103 | { 104 | { 88, 88, 147, 150, 42, 46, 45, 196, 205}, 105 | { 43, 97, 183, 117, 85, 38, 35, 179, 61}, 106 | { 39, 53, 200, 87, 26, 21, 43, 232, 171}, 107 | { 56, 34, 51, 104, 114, 102, 29, 93, 77}, 108 | { 107, 54, 32, 26, 51, 1, 81, 43, 31}, 109 | { 39, 28, 85, 171, 58, 
165, 90, 98, 64}, 110 | { 34, 22, 116, 206, 23, 34, 43, 166, 73}, 111 | { 68, 25, 106, 22, 64, 171, 36, 225, 114}, 112 | { 34, 19, 21, 102, 132, 188, 16, 76, 124}, 113 | { 62, 18, 78, 95, 85, 57, 50, 48, 51} 114 | }, 115 | { 116 | { 193, 101, 35, 159, 215, 111, 89, 46, 111}, 117 | { 60, 148, 31, 172, 219, 228, 21, 18, 111}, 118 | { 112, 113, 77, 85, 179, 255, 38, 120, 114}, 119 | { 40, 42, 1, 196, 245, 209, 10, 25, 109}, 120 | { 100, 80, 8, 43, 154, 1, 51, 26, 71}, 121 | { 88, 43, 29, 140, 166, 213, 37, 43, 154}, 122 | { 61, 63, 30, 155, 67, 45, 68, 1, 209}, 123 | { 142, 78, 78, 16, 255, 128, 34, 197, 171}, 124 | { 41, 40, 5, 102, 211, 183, 4, 1, 221}, 125 | { 51, 50, 17, 168, 209, 192, 23, 25, 82} 126 | }, 127 | { 128 | { 125, 98, 42, 88, 104, 85, 117, 175, 82}, 129 | { 95, 84, 53, 89, 128, 100, 113, 101, 45}, 130 | { 75, 79, 123, 47, 51, 128, 81, 171, 1}, 131 | { 57, 17, 5, 71, 102, 57, 53, 41, 49}, 132 | { 115, 21, 2, 10, 102, 255, 166, 23, 6}, 133 | { 38, 33, 13, 121, 57, 73, 26, 1, 85}, 134 | { 41, 10, 67, 138, 77, 110, 90, 47, 114}, 135 | { 101, 29, 16, 10, 85, 128, 101, 196, 26}, 136 | { 57, 18, 10, 102, 102, 213, 34, 20, 43}, 137 | { 117, 20, 15, 36, 163, 128, 68, 1, 26} 138 | }, 139 | { 140 | { 138, 31, 36, 171, 27, 166, 38, 44, 229}, 141 | { 67, 87, 58, 169, 82, 115, 26, 59, 179}, 142 | { 63, 59, 90, 180, 59, 166, 93, 73, 154}, 143 | { 40, 40, 21, 116, 143, 209, 34, 39, 175}, 144 | { 57, 46, 22, 24, 128, 1, 54, 17, 37}, 145 | { 47, 15, 16, 183, 34, 223, 49, 45, 183}, 146 | { 46, 17, 33, 183, 6, 98, 15, 32, 183}, 147 | { 65, 32, 73, 115, 28, 128, 23, 128, 205}, 148 | { 40, 3, 9, 115, 51, 192, 18, 6, 223}, 149 | { 87, 37, 9, 115, 59, 77, 64, 21, 47} 150 | }, 151 | { 152 | { 104, 55, 44, 218, 9, 54, 53, 130, 226}, 153 | { 64, 90, 70, 205, 40, 41, 23, 26, 57}, 154 | { 54, 57, 112, 184, 5, 41, 38, 166, 213}, 155 | { 30, 34, 26, 133, 152, 116, 10, 32, 134}, 156 | { 75, 32, 12, 51, 192, 255, 160, 43, 51}, 157 | { 39, 19, 53, 221, 26, 114, 32, 73, 255}, 158 | { 
31, 9, 65, 234, 2, 15, 1, 118, 73}, 159 | { 88, 31, 35, 67, 102, 85, 55, 186, 85}, 160 | { 56, 21, 23, 111, 59, 205, 45, 37, 192}, 161 | { 55, 38, 70, 124, 73, 102, 1, 34, 98} 162 | }, 163 | { 164 | { 102, 61, 71, 37, 34, 53, 31, 243, 192}, 165 | { 69, 60, 71, 38, 73, 119, 28, 222, 37}, 166 | { 68, 45, 128, 34, 1, 47, 11, 245, 171}, 167 | { 62, 17, 19, 70, 146, 85, 55, 62, 70}, 168 | { 75, 15, 9, 9, 64, 255, 184, 119, 16}, 169 | { 37, 43, 37, 154, 100, 163, 85, 160, 1}, 170 | { 63, 9, 92, 136, 28, 64, 32, 201, 85}, 171 | { 86, 6, 28, 5, 64, 255, 25, 248, 1}, 172 | { 56, 8, 17, 132, 137, 255, 55, 116, 128}, 173 | { 58, 15, 20, 82, 135, 57, 26, 121, 40} 174 | }, 175 | { 176 | { 164, 50, 31, 137, 154, 133, 25, 35, 218}, 177 | { 51, 103, 44, 131, 131, 123, 31, 6, 158}, 178 | { 86, 40, 64, 135, 148, 224, 45, 183, 128}, 179 | { 22, 26, 17, 131, 240, 154, 14, 1, 209}, 180 | { 83, 12, 13, 54, 192, 255, 68, 47, 28}, 181 | { 45, 16, 21, 91, 64, 222, 7, 1, 197}, 182 | { 56, 21, 39, 155, 60, 138, 23, 102, 213}, 183 | { 85, 26, 85, 85, 128, 128, 32, 146, 171}, 184 | { 18, 11, 7, 63, 144, 171, 4, 4, 246}, 185 | { 35, 27, 10, 146, 174, 171, 12, 26, 128} 186 | }, 187 | { 188 | { 190, 80, 35, 99, 180, 80, 126, 54, 45}, 189 | { 85, 126, 47, 87, 176, 51, 41, 20, 32}, 190 | { 101, 75, 128, 139, 118, 146, 116, 128, 85}, 191 | { 56, 41, 15, 176, 236, 85, 37, 9, 62}, 192 | { 146, 36, 19, 30, 171, 255, 97, 27, 20}, 193 | { 71, 30, 17, 119, 118, 255, 17, 18, 138}, 194 | { 101, 38, 60, 138, 55, 70, 43, 26, 142}, 195 | { 138, 45, 61, 62, 219, 1, 81, 188, 64}, 196 | { 32, 41, 20, 117, 151, 142, 20, 21, 163}, 197 | { 112, 19, 12, 61, 195, 128, 48, 4, 24} 198 | } 199 | }; 200 | const Prob bmode_prob [num_intra_bmodes - 1] = { 120, 90, 79, 133, 87, 85, 80, 111, 151 }; 201 | 202 | 203 | 204 | typedef enum // MV modes for whole macroblock vectors 205 | { 206 | NEARESTMV = num_ymodes, /* use "nearest" motion vector for entire MB */ 207 | NEARMV, /* use "next nearest" "" */ 208 | ZEROMV, /* use zero 
"" */ 209 | NEWMV, /* use explicit offset from implicit "" */ 210 | SPLITMV, /* use multiple motion vectors */ 211 | num_mv_refs = SPLITMV + 1 - NEARESTMV 212 | } mv_ref; 213 | const tree_index mv_ref_tree [2 * (num_mv_refs - 1)] = 214 | { 215 | -ZEROMV, 2, /* zero = "0" */ 216 | -NEARESTMV, 4, /* nearest = "10" */ 217 | -NEARMV, 6, /* near = "110" */ 218 | -NEWMV, -SPLITMV /* new = "1110", split = "1111" */ 219 | }; 220 | const int vp8_mode_contexts[6][4] = { { 7, 1, 1, 143, }, 221 | { 14, 18, 14, 107, }, 222 | { 135, 64, 57, 68, }, 223 | { 60, 56, 128, 65, }, 224 | { 159, 134, 128, 34, }, 225 | { 234, 188, 128, 28, }, 226 | }; 227 | 228 | 229 | 230 | typedef enum 231 | { 232 | MV_TOP_BOTTOM, /* two pieces {0...7} and {8...15} */ 233 | MV_LEFT_RIGHT, /* {0,1,4,5,8,9,12,13} and {2,3,6,7,10,11,14,15} */ 234 | MV_QUARTERS, /* {0,1,4,5}, {2,3,6,7}, {8,9,12,13}, {10,11,14,15} */ 235 | MV_16, /* every subblock gets its own vector {0} ... {15} */ //we have only this one 236 | mv_num_partitions 237 | } MVpartition; 238 | const tree_index split_mv_tree[2 * (mv_num_partitions - 1)] = 239 | { 240 | -MV_16, 2, /* MV_16 = "0" */ 241 | -MV_QUARTERS, 4, /* mv_quarters = "10" */ 242 | -MV_TOP_BOTTOM, -MV_LEFT_RIGHT /* top_bottom = "110", left_right = "111" */ 243 | }; 244 | static const unsigned char split_mv_probs[3] = { 110, 111, 150}; 245 | 246 | typedef enum 247 | { 248 | LEFT4x4 = num_intra_bmodes, /* use already-coded MV to my left */ 249 | ABOVE4x4, /* use already-coded MV above me */ 250 | ZERO4x4, /* use zero MV */ 251 | NEW4x4, /* explicit offset from "best" */ 252 | num_sub_mv_ref 253 | } sub_mv_ref; 254 | const tree_index submv_ref_tree [2 * (num_sub_mv_ref - 1)] = { -LEFT4x4, 2, /* LEFT = "0" */ 255 | -ABOVE4x4, 4, /* ABOVE = "10" */ 256 | -ZERO4x4, -NEW4x4 /* ZERO = "110", NEW = "111" */ 257 | }; 258 | static const unsigned char submv_ref_probs2[5][3] = { { 147, 136, 18 }, 259 | { 106, 145, 1 }, 260 | { 179, 121, 1 }, 261 | { 223, 1, 34 }, 262 | { 208, 1, 1 } 263 | 
}; 264 | 265 | 266 | 267 | typedef enum 268 | { 269 | mvpis_short, /* short (<= 7) vs long (>= 8) */ 270 | MVPsign, /* sign for non-zero */ 271 | MVPshort, /* 8 short values = 7-position tree */ 272 | MVPbits = MVPshort + 7, /* 8 long value bits w/independent probs */ 273 | MVPcount = MVPbits + 10 /* 19 probabilities in total */ 274 | } MVPindices; 275 | const Prob default_mv_context[2][MVPcount] ={{ // row 276 | 162, // is short 277 | 128, // sign 278 | 225, 146, 172, 147, 214, 39, 156, // short tree 279 | 128, 129, 132, 75, 145, 178, 206, 239, 254, 254 // long bits 280 | }, 281 | { // same for column 282 | 164, // is short 283 | 128, 284 | 204, 170, 119, 235, 140, 230, 228, 285 | 128, 130, 130, 74, 148, 180, 203, 236, 254, 254 // long bits 286 | }}; 287 | Prob new_mv_context[2][MVPcount]; 288 | cl_uint num_mv_context[2][MVPcount]; 289 | cl_uint denom_mv_context[2][MVPcount]; 290 | const tree_index small_mvtree [2 * (8 - 1)] = { 2, 8, /* "0" subtree, "1" subtree */ 291 | 4, 6, /* "00" subtree, "01" subtree */ 292 | -0, -1, /* 0 = "000", 1 = "001" */ 293 | -2, -3, /* 2 = "010", 3 = "011" */ 294 | 10, 12, /* "10" subtree, "11" subtree */ 295 | -4, -5, /* 4 = "100", 5 = "101" */ 296 | -6, -7 /* 6 = "110", 7 = "111" */ 297 | }; 298 | 299 | 300 | 301 | const Prob coeff_update_probs [4] [8] [3] [11] = 302 | { 303 | { 304 | { 305 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 306 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 307 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 308 | }, 309 | { 310 | { 176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 311 | { 223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255}, 312 | { 249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255} 313 | }, 314 | { 315 | { 255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255}, 316 | { 234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 317 | { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 318 | }, 319 | { 320 | { 255, 246, 254, 255, 255, 
255, 255, 255, 255, 255, 255}, 321 | { 239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 322 | { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255} 323 | }, 324 | { 325 | { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 326 | { 251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 327 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 328 | }, 329 | { 330 | { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 331 | { 251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 332 | { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255} 333 | }, 334 | { 335 | { 255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255}, 336 | { 250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255}, 337 | { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 338 | }, 339 | { 340 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 341 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 342 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 343 | } 344 | }, 345 | { 346 | { 347 | { 217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 348 | { 225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255}, 349 | { 234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255} 350 | }, 351 | { 352 | { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 353 | { 223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 354 | { 238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255} 355 | }, 356 | { 357 | { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 358 | { 249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 359 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 360 | }, 361 | { 362 | { 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 363 | { 247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 364 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 365 | }, 366 | { 367 | { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 368 | { 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 369 | { 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 370 | }, 371 | { 372 | { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 373 | { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 374 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 375 | }, 376 | { 377 | { 255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255}, 378 | { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 379 | { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 380 | }, 381 | { 382 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 383 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 384 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 385 | } 386 | }, 387 | { 388 | { 389 | { 186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255}, 390 | { 234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255}, 391 | { 251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255} 392 | }, 393 | { 394 | { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 395 | { 236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 396 | { 251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255} 397 | }, 398 | { 399 | { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 400 | { 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 401 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 402 | }, 403 | { 404 | { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 405 | { 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 406 | { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 407 | }, 408 | { 409 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 410 | { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 411 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 412 | }, 413 | { 414 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 415 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 416 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 417 | }, 418 | { 419 | { 255, 255, 255, 255, 255, 255, 255, 
255, 255, 255, 255}, 420 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 421 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 422 | }, 423 | { 424 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 425 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 426 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 427 | } 428 | }, 429 | { 430 | { 431 | { 248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 432 | { 250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255}, 433 | { 248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255} 434 | }, 435 | { 436 | { 255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255}, 437 | { 246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255}, 438 | { 252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255} 439 | }, 440 | { 441 | { 255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255}, 442 | { 248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255}, 443 | { 253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255} 444 | }, 445 | { 446 | { 255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 447 | { 245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 448 | { 253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255} 449 | }, 450 | { 451 | { 255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255}, 452 | { 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 453 | { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255} 454 | }, 455 | { 456 | { 255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 457 | { 249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255}, 458 | { 255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255} 459 | }, 460 | { 461 | { 255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255}, 462 | { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 463 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255} 464 | }, 465 | { 466 | { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 467 | { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}, 468 | { 255, 255, 255, 
255, 255, 255, 255, 255, 255, 255, 255} 469 | } 470 | } 471 | }; 472 | 473 | const Prob vp8_mv_update_probs[2][19] = 474 | { 475 | { 476 | 237, 477 | 246, 478 | 253, 253, 254, 254, 254, 254, 254, 479 | 254, 254, 254, 254, 254, 250, 250, 252, 254, 254 480 | }, 481 | { 482 | 231, 483 | 243, 484 | 245, 253, 254, 254, 254, 254, 254, 485 | 254, 254, 254, 254, 254, 251, 251, 254, 254, 254 486 | } 487 | }; 488 | -------------------------------------------------------------------------------- /src/vp8enc.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #ifdef _WIN32 9 | #include 10 | #pragma warning(disable: 4996) 11 | #endif 12 | 13 | #define QUANT_TO_FILTER_LEVEL 3 14 | #define DEFAULT_ALTREF_RANGE 5 15 | //#define ALLWAYS_FLUSH 16 | 17 | static const cl_uchar vp8_dc_qlookup[128] = 18 | { 19 | 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 17, 20 | 18, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 25, 25, 26, 27, 28, 21 | 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 22 | 44, 45, 46, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 23 | 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 24 | 75, 76, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 25 | 91, 93, 95, 96, 98, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118, 26 | 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 143, 145, 148, 151, 154, 157, 27 | }; 28 | 29 | static const cl_short vp8_ac_qlookup[128] = 30 | { 31 | 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 32 | 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 33 | 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 34 | 52, 53, 54, 55, 56, 57, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 35 | 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 36 | 110, 112, 114, 116, 119, 122, 125, 128, 131, 134, 137, 140, 143, 146, 149, 152, 37 | 155, 
158, 161, 164, 167, 170, 173, 177, 181, 185, 189, 193, 197, 201, 205, 209, 38 | 213, 217, 221, 225, 229, 234, 239, 245, 249, 254, 259, 264, 269, 274, 279, 284, 39 | }; 40 | 41 | //static const cl_int zigzag[16] = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 }; 42 | //static const cl_int inv_zigzag[16] = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 }; 43 | // not only inv zigzag is inverse for zigzag, but 44 | // A[i] = B[zigzag[i]] === A[inv_zigzag[i]] = B[i] 45 | 46 | 47 | #define ERRORPATH "clErrors.txt" 48 | #define DUMPPATH "dump.y4m" 49 | #define CPUPATH "CPU_kernels.cl" 50 | #define GPUPATH "GPU_kernels.cl" 51 | 52 | union mv { 53 | cl_uint raw; 54 | struct { 55 | cl_short x, y; 56 | } d; 57 | }; 58 | 59 | typedef enum { 60 | are16x16 = 0, 61 | are8x8 = 1, 62 | are4x4 = 2 63 | } partition_mode; 64 | 65 | typedef enum { 66 | LAST = 0, 67 | GOLDEN = 1, 68 | ALTREF = 2 69 | } reference_frame_t; 70 | 71 | typedef enum { 72 | intra_segment = 0, 73 | UQ_segment = 0, 74 | HQ_segment = 1, 75 | AQ_segment = 2, 76 | LQ_segment = 3, 77 | SEGMENT_COUNT = 4 78 | } segment_id_t; 79 | 80 | typedef struct { 81 | cl_int y_ac_i; 82 | cl_int y_dc_idelta; 83 | cl_int y2_dc_idelta; 84 | cl_int y2_ac_idelta; 85 | cl_int uv_dc_idelta; 86 | cl_int uv_ac_idelta; 87 | cl_int loop_filter_level; 88 | cl_int mbedge_limit; 89 | cl_int sub_bedge_limit; 90 | cl_int interior_limit; 91 | cl_int hev_threshold; 92 | } segment_data; 93 | 94 | /*typedef struct { //in future resize to short or chars!!! 
95 | cl_short coeffs[25][16]; 96 | cl_int vector_x[4]; 97 | cl_int vector_y[4]; 98 | float SSIM; 99 | cl_int non_zero_coeffs; 100 | cl_int parts; //16x16 == 0; 8x8 == 1; 101 | cl_int reference_frame; 102 | cl_int segment_id; 103 | } macroblock;*/ 104 | 105 | typedef struct { 106 | cl_short coeff[16]; 107 | } block_t; 108 | 109 | typedef struct { 110 | block_t block[25]; 111 | } macroblock_coeffs_t; 112 | 113 | typedef struct { 114 | cl_short x; 115 | cl_short y; 116 | } vector_t; 117 | 118 | typedef struct { 119 | vector_t vector[4]; 120 | } macroblock_vectors_t; 121 | 122 | typedef struct { 123 | cl_int vector_x; 124 | cl_int vector_y; 125 | } vector_net; 126 | typedef struct 127 | { 128 | union mv base_mv; 129 | cl_int is_inter_mb; 130 | cl_int parts; 131 | cl_int mode[16]; 132 | } macroblock_extra_data; 133 | 134 | struct deviceContext 135 | { 136 | cl_context context_gpu; 137 | cl_context context_cpu; 138 | cl_platform_id *platforms; 139 | cl_device_id *device_cpu; 140 | cl_device_id *device_gpu; 141 | cl_device_type gpu_device_type; 142 | cl_program program_cpu; 143 | cl_program program_gpu; 144 | cl_command_queue boolcoder_commandQueue_cpu; 145 | cl_command_queue loopfilterY_commandQueue_cpu; 146 | cl_command_queue loopfilterU_commandQueue_cpu; 147 | cl_command_queue loopfilterV_commandQueue_cpu; 148 | cl_command_queue commandQueue1_gpu; 149 | cl_command_queue commandQueue2_gpu; 150 | cl_command_queue commandQueue3_gpu; 151 | cl_command_queue dataCopy_gpu; 152 | cl_int state_cpu; 153 | cl_int state_gpu; 154 | cl_kernel reset_vectors; 155 | cl_kernel luma_search_last_16x; 156 | cl_kernel luma_search_last_8x; 157 | cl_kernel luma_search_last_4x; 158 | cl_kernel luma_search_last_2x; 159 | cl_kernel luma_search_last_1x; 160 | cl_kernel luma_search_altref_16x; 161 | cl_kernel luma_search_altref_8x; 162 | cl_kernel luma_search_altref_4x; 163 | cl_kernel luma_search_altref_2x; 164 | cl_kernel luma_search_altref_1x; 165 | cl_kernel luma_search_golden_16x; 166 | 
cl_kernel luma_search_golden_8x; 167 | cl_kernel luma_search_golden_4x; 168 | cl_kernel luma_search_golden_2x; 169 | cl_kernel luma_search_golden_1x; 170 | cl_kernel luma_search_last_d4x; 171 | cl_kernel luma_search_golden_d4x; 172 | cl_kernel luma_search_altref_d4x; 173 | cl_kernel downsample_current_1x_to_2x; 174 | cl_kernel downsample_current_2x_to_4x; 175 | cl_kernel downsample_current_4x_to_8x; 176 | cl_kernel downsample_current_8x_to_16x; 177 | cl_kernel downsample_last_1x_to_2x; 178 | cl_kernel downsample_last_2x_to_4x; 179 | cl_kernel downsample_last_4x_to_8x; 180 | cl_kernel downsample_last_8x_to_16x; 181 | cl_kernel select_reference; 182 | cl_kernel prepare_predictors_and_residual_last_Y; 183 | cl_kernel prepare_predictors_and_residual_last_U; 184 | cl_kernel prepare_predictors_and_residual_last_V; 185 | cl_kernel prepare_predictors_and_residual_golden_Y; 186 | cl_kernel prepare_predictors_and_residual_golden_U; 187 | cl_kernel prepare_predictors_and_residual_golden_V; 188 | cl_kernel prepare_predictors_and_residual_altref_Y; 189 | cl_kernel prepare_predictors_and_residual_altref_U; 190 | cl_kernel prepare_predictors_and_residual_altref_V; 191 | cl_kernel pack_8x8_into_16x16; 192 | cl_kernel dct4x4_Y[4]; 193 | cl_kernel dct4x4_U[4]; 194 | cl_kernel dct4x4_V[4]; 195 | cl_kernel wht4x4_iwht4x4[4]; 196 | cl_kernel idct4x4_Y[4]; 197 | cl_kernel idct4x4_U[4]; 198 | cl_kernel idct4x4_V[4]; 199 | cl_kernel chroma_transform; 200 | cl_kernel encode_coefficients; 201 | cl_kernel count_probs; 202 | cl_kernel num_div_denom; 203 | cl_kernel normal_loop_filter_MBH; 204 | cl_kernel normal_loop_filter_MBV; 205 | cl_kernel loop_filter_frame_luma; 206 | cl_kernel loop_filter_frame_chroma_U; 207 | cl_kernel loop_filter_frame_chroma_V; 208 | cl_kernel count_SSIM_luma[4]; 209 | cl_kernel count_SSIM_chroma_U[4]; 210 | cl_kernel count_SSIM_chroma_V[4]; 211 | cl_kernel gather_SSIM; 212 | cl_kernel prepare_filter_mask; 213 | /* add kernels */ 214 | 215 | // these are frame data 
padded to be devisible by 16 and converted to normalized int16 216 | cl_mem current_frame_Y; 217 | cl_mem current_frame_Y_downsampled_by2; 218 | cl_mem current_frame_Y_downsampled_by4; 219 | cl_mem current_frame_Y_downsampled_by8; 220 | cl_mem current_frame_Y_downsampled_by16; 221 | cl_mem current_frame_U; 222 | cl_mem current_frame_V; 223 | //images 224 | cl_image_format image_format; 225 | cl_mem last_frame_Y_image; 226 | cl_mem last_frame_U_image; 227 | cl_mem last_frame_V_image; 228 | cl_mem altref_frame_Y_image; 229 | cl_mem altref_frame_U_image; 230 | cl_mem altref_frame_V_image; 231 | cl_mem golden_frame_Y_image; 232 | cl_mem golden_frame_U_image; 233 | cl_mem golden_frame_V_image; 234 | //instead of original size we use reconstructed frame 235 | cl_mem last_frame_Y_downsampled_by2; 236 | cl_mem last_frame_Y_downsampled_by4; 237 | cl_mem last_frame_Y_downsampled_by8; 238 | cl_mem last_frame_Y_downsampled_by16; 239 | cl_mem golden_frame_Y_downsampled_by2; 240 | cl_mem golden_frame_Y_downsampled_by4; 241 | cl_mem golden_frame_Y_downsampled_by8; 242 | cl_mem golden_frame_Y_downsampled_by16; 243 | cl_mem altref_frame_Y_downsampled_by2; 244 | cl_mem altref_frame_Y_downsampled_by4; 245 | cl_mem altref_frame_Y_downsampled_by8; 246 | cl_mem altref_frame_Y_downsampled_by16; 247 | cl_mem reconstructed_frame_Y; 248 | cl_mem reconstructed_frame_U; 249 | cl_mem reconstructed_frame_V; 250 | cl_mem predictors_Y; 251 | cl_mem predictors_U; 252 | cl_mem predictors_V; 253 | cl_mem residual_Y; 254 | cl_mem residual_U; 255 | cl_mem residual_V; 256 | cl_mem golden_frame_Y; 257 | cl_mem altref_frame_Y; 258 | cl_mem cpu_frame_Y; //for a filter 259 | cl_mem cpu_frame_U; 260 | cl_mem cpu_frame_V; 261 | cl_mem last_vnet1; 262 | cl_mem golden_vnet1; 263 | cl_mem altref_vnet1; 264 | cl_mem last_vnet2; 265 | cl_mem golden_vnet2; 266 | cl_mem altref_vnet2; 267 | cl_mem metrics1; 268 | cl_mem metrics2; 269 | cl_mem metrics3; 270 | cl_mem mb_mask; 271 | cl_mem segments_data_gpu; 272 | 
cl_mem segments_data_cpu; 273 | cl_mem third_context; 274 | cl_mem coeff_probs; 275 | cl_mem coeff_probs_denom; 276 | 277 | cl_mem macroblock_coeffs_gpu; 278 | cl_mem macroblock_coeffs_cpu; 279 | cl_mem macroblock_vectors_gpu; 280 | cl_mem macroblock_vectors_cpu; 281 | cl_mem macroblock_reference_frame_gpu; 282 | //cl_mem macroblock_reference_frame_cpu; 283 | cl_mem macroblock_parts_gpu; 284 | cl_mem macroblock_parts_cpu; 285 | cl_mem macroblock_SSIM_gpu; 286 | cl_mem macroblock_segment_id_gpu; 287 | cl_mem macroblock_segment_id_cpu; 288 | cl_mem macroblock_non_zero_coeffs_gpu; 289 | cl_mem macroblock_non_zero_coeffs_cpu; 290 | 291 | cl_mem transformed_blocks_cpu; 292 | cl_mem transformed_blocks_gpu; 293 | cl_mem partitions; 294 | cl_mem partitions_sizes; 295 | 296 | size_t gpu_work_items_per_dim[1]; 297 | size_t gpu_work_group_size_per_dim[1]; 298 | size_t cpu_work_items_per_dim[1]; 299 | size_t cpu_work_group_size_per_dim[1]; 300 | 301 | cl_uint gpu_preferred_platform_number; 302 | 303 | }; 304 | 305 | struct videoContext 306 | { 307 | // size of input frame 308 | cl_int src_width; 309 | cl_int src_height; 310 | cl_int src_frame_size_luma; 311 | cl_int src_frame_size_chroma; 312 | // size of output frame 313 | cl_int dst_width; 314 | cl_int dst_height; 315 | cl_int dst_frame_size_luma; 316 | cl_int dst_frame_size_chroma; 317 | // size of padded frame 318 | cl_int wrk_width; 319 | cl_int wrk_height; 320 | cl_int wrk_frame_size_luma; 321 | cl_int wrk_frame_size_chroma; 322 | 323 | cl_int mb_width; 324 | cl_int mb_height; 325 | cl_int mb_count; 326 | cl_int GOP_size; 327 | cl_int altref_range; 328 | 329 | cl_int qi_min; 330 | cl_int qi_max; 331 | cl_int altrefqi[4]; 332 | cl_int lastqi[4]; 333 | 334 | cl_int loop_filter_type; 335 | cl_int loop_filter_sharpness; 336 | 337 | cl_int number_of_partitions; 338 | cl_int number_of_partitions_ind; 339 | cl_int partition_step; 340 | 341 | cl_uint timestep; 342 | cl_uint timescale; 343 | cl_uint framerate; 344 | 345 | int 
do_loop_filter_on_gpu; 346 | int thread_limit; 347 | int print_info; 348 | 349 | float SSIM_target; 350 | 351 | }; 352 | 353 | struct hostFrameBuffers 354 | { 355 | cl_int frame_number; 356 | cl_int altref_frame_number; 357 | cl_int golden_frame_number; 358 | cl_int last_key_detect; 359 | cl_int frames_until_key; 360 | cl_int frames_until_altref; 361 | cl_int replaced; 362 | cl_int input_pack_size; 363 | cl_uchar *input_pack; // allbytes for YUV in one 364 | cl_uchar *current_Y; 365 | cl_uchar *current_U; 366 | cl_uchar *current_V; 367 | cl_uchar *tmp_Y; 368 | cl_uchar *tmp_U; 369 | cl_uchar *tmp_V; 370 | cl_uchar *reconstructed_Y; 371 | cl_uchar *reconstructed_U; 372 | cl_uchar *reconstructed_V; 373 | cl_uchar *last_U; 374 | cl_uchar *last_V; 375 | macroblock_extra_data *e_data; 376 | macroblock_coeffs_t *MB; 377 | macroblock_vectors_t *MB_vectors; 378 | float *MB_SSIM; 379 | cl_int *MB_segment_id; 380 | cl_int *MB_parts; 381 | cl_int *MB_non_zero_coeffs; 382 | cl_int *MB_reference_frame; 383 | 384 | segment_data segments_data[4]; 385 | cl_uchar *encoded_frame; 386 | cl_uint encoded_frame_size; 387 | cl_uint prev_frame_size; 388 | cl_ulong video_size; 389 | cl_uchar *current_frame_pos_in_pack; 390 | cl_int current_is_key_frame; 391 | cl_int current_is_altref_frame; 392 | cl_int current_is_golden_frame; 393 | cl_int prev_is_key_frame; 394 | cl_int prev_is_altref_frame; 395 | cl_int prev_is_golden_frame; 396 | cl_int skip_prob; 397 | float new_SSIM; 398 | 399 | cl_int partition_sizes[8]; 400 | cl_uchar *partitions; 401 | cl_uchar *partition_0; 402 | cl_int partition_0_size; 403 | 404 | cl_uint new_probs[4][8][3][11]; 405 | cl_uint new_probs_denom[4][8][3][11]; 406 | 407 | cl_int y_dc_q[4], y_ac_q[4], y2_dc_q[4], y2_ac_q[4], uv_dc_q[4], uv_ac_q[4]; 408 | 409 | cl_uchar header[128]; 410 | int header_sz; 411 | 412 | int threads_free; 413 | }; 414 | 415 | struct fileContext 416 | { 417 | FILE * handle; 418 | char * path; 419 | int cur_pos; 420 | }; 421 | 422 | struct 
encoderStatistics 423 | { 424 | int scene_changes_by_color; 425 | int scene_changes_by_ssim; 426 | int scene_changes_by_bitrate; 427 | int scene_changes_by_replaced; 428 | }; 429 | 430 | static const unsigned char k_default_coeff_probs [4][8][3][11] = 431 | { 432 | { /* block type 0 */ 433 | { /* coeff band 0 */ 434 | { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, 435 | { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, 436 | { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} 437 | }, 438 | { /* coeff band 1 */ 439 | { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128}, 440 | { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128}, 441 | { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128} 442 | }, 443 | { /* coeff band 2 */ 444 | { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128}, 445 | { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128}, 446 | { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128} 447 | }, 448 | { /* coeff band 3 */ 449 | { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128}, 450 | { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128}, 451 | { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128} 452 | }, 453 | { /* coeff band 4 */ 454 | { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128}, 455 | { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128}, 456 | { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128} 457 | }, 458 | { /* coeff band 5 */ 459 | { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128}, 460 | { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128}, 461 | { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128} 462 | }, 463 | { /* coeff band 6 */ 464 | { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128}, 465 | { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128}, 466 | { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128} 467 | }, 468 | { /* coeff band 7 */ 469 | { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128}, 470 | { 246, 1, 255, 128, 128, 128, 128, 128, 128, 
128, 128}, 471 | { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} 472 | } 473 | }, 474 | { /* block type 1 */ 475 | { /* coeff band 0 */ 476 | { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62}, 477 | { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1}, 478 | { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128} 479 | }, 480 | { /* coeff band 1 */ 481 | { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128}, 482 | { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128}, 483 | { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128} 484 | }, 485 | { /* coeff band 2 */ 486 | { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128}, 487 | { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128}, 488 | { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128} 489 | }, 490 | { /* coeff band 3 */ 491 | { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128}, 492 | { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128}, 493 | { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128} 494 | }, 495 | { /* coeff band 4 */ 496 | { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128}, 497 | { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128}, 498 | { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128} 499 | }, 500 | { /* coeff band 5 */ 501 | { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128}, 502 | { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128}, 503 | { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128} 504 | }, 505 | { /* coeff band 6 */ 506 | { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128}, 507 | { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128}, 508 | { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128} 509 | }, 510 | { /* coeff band 7 */ 511 | { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128}, 512 | { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128}, 513 | { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128} 514 | } 515 | }, 516 | { /* block type 2 */ 517 | { /* coeff band 0 */ 518 | { 253, 9, 248, 251, 207, 208, 255, 
192, 128, 128, 128}, 519 | { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128}, 520 | { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128} 521 | }, 522 | { /* coeff band 1 */ 523 | { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128}, 524 | { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128}, 525 | { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128} 526 | }, 527 | { /* coeff band 2 */ 528 | { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128}, 529 | { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128}, 530 | { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128} 531 | }, 532 | { /* coeff band 3 */ 533 | { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128}, 534 | { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128}, 535 | { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128} 536 | }, 537 | { /* coeff band 4 */ 538 | { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128}, 539 | { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128}, 540 | { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128} 541 | }, 542 | { /* coeff band 5 */ 543 | { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128}, 544 | { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128}, 545 | { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128} 546 | }, 547 | { /* coeff band 6 */ 548 | { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128}, 549 | { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128}, 550 | { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128} 551 | }, 552 | { /* coeff band 7 */ 553 | { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, 554 | { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, 555 | { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} 556 | } 557 | }, 558 | { /* block type 3 */ 559 | { /* coeff band 0 */ 560 | { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255}, 561 | { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128}, 562 | { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128} 563 | }, 564 | { /* coeff band 1 
*/ 565 | { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128}, 566 | { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128}, 567 | { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128} 568 | }, 569 | { /* coeff band 2 */ 570 | { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128}, 571 | { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128}, 572 | { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128} 573 | }, 574 | { /* coeff band 3 */ 575 | { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128}, 576 | { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128}, 577 | { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128} 578 | }, 579 | { /* coeff band 4 */ 580 | { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128}, 581 | { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128}, 582 | { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128} 583 | }, 584 | { /* coeff band 5 */ 585 | { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128}, 586 | { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128}, 587 | { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128} 588 | }, 589 | { /* coeff band 6 */ 590 | { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128}, 591 | { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128}, 592 | { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128} 593 | }, 594 | { /* coeff band 7 */ 595 | { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128}, 596 | { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128}, 597 | { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128} 598 | } 599 | } 600 | }; 601 | -------------------------------------------------------------------------------- /src/inter_part.h: -------------------------------------------------------------------------------- 1 | static void prepare_GPU_buffers() 2 | { 3 | frames.threads_free = video.thread_limit; // by this time boolcoder definetly already finished 4 | // first reset vector nets to zeros 5 | device.gpu_work_items_per_dim[0] = video.mb_count*4; 6 | device.state_gpu = 
clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.reset_vectors, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 7 | device.state_gpu = clFinish(device.commandQueue1_gpu); 8 | 9 | // now prepare downsampled LAST buffers 10 | //prepare downsampled by 2 11 | device.gpu_work_items_per_dim[0] = video.wrk_width*video.wrk_height/4; 12 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.downsample_last_1x_to_2x, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 13 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 14 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.downsample_current_1x_to_2x, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 15 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 16 | //prepare downsampled by 4 17 | device.gpu_work_items_per_dim[0] = video.wrk_width/2*video.wrk_height/2/4; 18 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.downsample_last_2x_to_4x, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 19 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 20 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.downsample_current_2x_to_4x, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 21 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 22 | //prepare downsampled by 8 23 | device.gpu_work_items_per_dim[0] = video.wrk_width/4*video.wrk_height/4/4; 24 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.downsample_last_4x_to_8x, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 25 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 26 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.downsample_current_4x_to_8x, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 27 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 28 | //prepare downsampled by 16 29 | 
device.gpu_work_items_per_dim[0] = video.wrk_width/8*video.wrk_height/8/4; 30 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.downsample_last_8x_to_16x, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 31 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 32 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.downsample_current_8x_to_16x, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 33 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 34 | 35 | if (frames.prev_is_golden_frame) 36 | { 37 | device.state_gpu = clEnqueueCopyBuffer(device.commandQueue1_gpu, device.reconstructed_frame_Y, device.golden_frame_Y, 0, 0, video.wrk_frame_size_luma, 0, NULL, NULL); 38 | device.state_gpu = clEnqueueCopyBuffer(device.commandQueue1_gpu, device.last_frame_Y_downsampled_by2, device.golden_frame_Y_downsampled_by2, 0, 0, video.wrk_frame_size_luma/4, 0, NULL, NULL); 39 | device.state_gpu = clEnqueueCopyBuffer(device.commandQueue1_gpu, device.last_frame_Y_downsampled_by4, device.golden_frame_Y_downsampled_by4, 0, 0, video.wrk_frame_size_luma/16, 0, NULL, NULL); 40 | device.state_gpu = clEnqueueCopyBuffer(device.commandQueue1_gpu, device.last_frame_Y_downsampled_by8, device.golden_frame_Y_downsampled_by8, 0, 0, video.wrk_frame_size_luma/64, 0, NULL, NULL); 41 | device.state_gpu = clEnqueueCopyBuffer(device.commandQueue1_gpu, device.last_frame_Y_downsampled_by16, device.golden_frame_Y_downsampled_by16, 0, 0, video.wrk_frame_size_luma/256, 0, NULL, NULL); 42 | } 43 | if (frames.prev_is_altref_frame) 44 | { 45 | device.state_gpu = clEnqueueCopyBuffer(device.commandQueue1_gpu, device.reconstructed_frame_Y, device.altref_frame_Y, 0, 0, video.wrk_frame_size_luma, 0, NULL, NULL); 46 | device.state_gpu = clEnqueueCopyBuffer(device.commandQueue1_gpu, device.last_frame_Y_downsampled_by2, device.altref_frame_Y_downsampled_by2, 0, 0, video.wrk_frame_size_luma/4, 0, NULL, NULL); 47 | device.state_gpu = 
clEnqueueCopyBuffer(device.commandQueue1_gpu, device.last_frame_Y_downsampled_by4, device.altref_frame_Y_downsampled_by4, 0, 0, video.wrk_frame_size_luma/16, 0, NULL, NULL); 48 | device.state_gpu = clEnqueueCopyBuffer(device.commandQueue1_gpu, device.last_frame_Y_downsampled_by8, device.altref_frame_Y_downsampled_by8, 0, 0, video.wrk_frame_size_luma/64, 0, NULL, NULL); 49 | device.state_gpu = clEnqueueCopyBuffer(device.commandQueue1_gpu, device.last_frame_Y_downsampled_by16, device.altref_frame_Y_downsampled_by16, 0, 0, video.wrk_frame_size_luma/256, 0, NULL, NULL); 50 | } 51 | 52 | // prepare images (if they need to be renewed) 53 | const size_t origin[3] = {0, 0, 0}; 54 | const size_t region_y[3] = {video.wrk_width, video.wrk_height, 1}; 55 | const size_t region_uv[3] = {video.wrk_width/2, video.wrk_height/2, 1}; 56 | 57 | device.state_gpu = finalFlush(device.commandQueue1_gpu); 58 | device.state_gpu = finalFlush(device.commandQueue2_gpu); 59 | clFinish(device.commandQueue3_gpu); 60 | 61 | if (video.do_loop_filter_on_gpu) 62 | { 63 | device.state_gpu = clEnqueueReadBuffer(device.commandQueue1_gpu, device.reconstructed_frame_Y ,CL_TRUE, 0, video.wrk_frame_size_luma, frames.reconstructed_Y, 0, NULL, NULL); 64 | device.state_gpu = clEnqueueReadBuffer(device.commandQueue2_gpu, device.reconstructed_frame_U ,CL_TRUE, 0, video.wrk_frame_size_chroma, frames.reconstructed_U, 0, NULL, NULL); 65 | device.state_gpu = clEnqueueReadBuffer(device.commandQueue3_gpu, device.reconstructed_frame_V ,CL_TRUE, 0, video.wrk_frame_size_chroma, frames.reconstructed_V, 0, NULL, NULL); 66 | 67 | device.state_gpu = clEnqueueWriteImage(device.commandQueue1_gpu, device.last_frame_Y_image, CL_FALSE, origin, region_y, 0, 0, frames.reconstructed_Y, 0, NULL, NULL); 68 | device.state_gpu = clEnqueueWriteImage(device.commandQueue2_gpu, device.last_frame_U_image, CL_FALSE, origin, region_uv, 0, 0, frames.reconstructed_U, 0, NULL, NULL); 69 | device.state_gpu = 
clEnqueueWriteImage(device.commandQueue3_gpu, device.last_frame_V_image, CL_FALSE, origin, region_uv, 0, 0, frames.reconstructed_V, 0, NULL, NULL); 70 | } 71 | 72 | if (frames.prev_is_golden_frame) 73 | { 74 | device.state_gpu = clEnqueueCopyImage(device.commandQueue1_gpu, device.last_frame_Y_image, device.golden_frame_Y_image, origin, origin, region_y, 0, NULL, NULL); 75 | device.state_gpu = clEnqueueCopyImage(device.commandQueue2_gpu, device.last_frame_U_image, device.golden_frame_U_image, origin, origin, region_uv, 0, NULL, NULL); 76 | device.state_gpu = clEnqueueCopyImage(device.commandQueue3_gpu, device.last_frame_V_image, device.golden_frame_V_image, origin, origin, region_uv, 0, NULL, NULL); 77 | } 78 | if (frames.prev_is_altref_frame) 79 | { 80 | device.state_gpu = clEnqueueCopyImage(device.commandQueue1_gpu, device.last_frame_Y_image, device.altref_frame_Y_image, origin, origin, region_y, 0, NULL, NULL); 81 | device.state_gpu = clEnqueueCopyImage(device.commandQueue2_gpu, device.last_frame_U_image, device.altref_frame_U_image, origin, origin, region_uv, 0, NULL, NULL); 82 | device.state_gpu = clEnqueueCopyImage(device.commandQueue3_gpu, device.last_frame_V_image, device.altref_frame_V_image, origin, origin, region_uv, 0, NULL, NULL); 83 | } 84 | 85 | device.state_gpu = finalFlush(device.commandQueue1_gpu); 86 | device.state_gpu = finalFlush(device.commandQueue2_gpu); 87 | device.state_gpu = finalFlush(device.commandQueue3_gpu); 88 | 89 | //device.state_gpu = clFinish(device.commandQueue1_gpu); 90 | device.state_gpu = clFinish(device.commandQueue2_gpu); 91 | //device.state_gpu = clFinish(device.commandQueue3_gpu); 92 | 93 | return; 94 | } 95 | 96 | static void inter_transform() 97 | { 98 | const int width = video.wrk_width; 99 | const int height = video.wrk_height; 100 | 101 | // if golden and altref buffers represent different from last buffer frame 102 | // and altref is not the same as altref 103 | const cl_int use_golden = !frames.prev_is_golden_frame; 
104 | const cl_int use_altref = (!frames.prev_is_altref_frame) && (frames.altref_frame_number != frames.golden_frame_number); 105 | //prepare downsampled frames and image objects 106 | prepare_GPU_buffers(); 107 | 108 | 109 | //now search in downsampled by 16 110 | device.gpu_work_items_per_dim[0] = ((video.wrk_width/16)/8)*((video.wrk_height/16)/8); 111 | device.gpu_work_items_per_dim[0] += (device.gpu_work_items_per_dim[0] % 256) > 0 ? 112 | (256 - (device.gpu_work_items_per_dim[0]%256)) : 113 | 0; 114 | // LAST 115 | // we use local memory in kernel, so we have to explicitly set work group size 116 | //max work group size for this kernel is 256! (each work-group use 16kb (defined in kernel code) and each kernel-thread needs 64b => 16kb/64b == 256 117 | if (device.gpu_device_type == CL_DEVICE_TYPE_GPU) 118 | device.gpu_work_group_size_per_dim[0] = 256; 119 | else 120 | // just for tests on cpu (useful to control memory). Some CPU won't work with 256 kernels in one hardware thread 121 | device.gpu_work_group_size_per_dim[0] = 8; 122 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.luma_search_last_16x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 123 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 124 | // GOLDEN 125 | if (use_golden) 126 | { 127 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.luma_search_golden_16x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 128 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 129 | } 130 | // ALTREF 131 | if (use_altref) 132 | { 133 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue3_gpu, device.luma_search_altref_16x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 134 | device.state_gpu = ifFlush(device.commandQueue3_gpu); 135 | } 136 | 137 | //now search in downsampled by 8 138 | 
device.gpu_work_items_per_dim[0] = ((video.wrk_width/8)/8)*((video.wrk_height/8)/8); 139 | device.gpu_work_items_per_dim[0] += (device.gpu_work_items_per_dim[0] % 256) > 0 ? 140 | (256 - (device.gpu_work_items_per_dim[0]%256)) : 141 | 0; 142 | // LAST 143 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.luma_search_last_8x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 144 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 145 | // GOLDEN 146 | if (use_golden) 147 | { 148 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.luma_search_golden_8x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 149 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 150 | } 151 | // ALTREF 152 | if (use_altref) 153 | { 154 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue3_gpu, device.luma_search_altref_8x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 155 | device.state_gpu = ifFlush(device.commandQueue3_gpu); 156 | } 157 | 158 | //now search in downsampled by 4 159 | device.gpu_work_items_per_dim[0] = ((video.wrk_width/4)/8)*((video.wrk_height/4)/8); 160 | device.gpu_work_items_per_dim[0] += (device.gpu_work_items_per_dim[0] % 256) > 0 ? 
161 | (256 - (device.gpu_work_items_per_dim[0]%256)) : 162 | 0; 163 | // LAST 164 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.luma_search_last_4x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 165 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 166 | // GOLDEN 167 | if (use_golden) 168 | { 169 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.luma_search_golden_4x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 170 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 171 | } 172 | // ALTREF 173 | if (use_altref) 174 | { 175 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue3_gpu, device.luma_search_altref_4x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 176 | device.state_gpu = ifFlush(device.commandQueue3_gpu); 177 | } 178 | 179 | //now search in downsampled by 2 180 | device.gpu_work_items_per_dim[0] = ((video.wrk_width/2)/8)*((video.wrk_height/2)/8); 181 | device.gpu_work_items_per_dim[0] += (device.gpu_work_items_per_dim[0] % 256) > 0 ? 
182 | (256 - (device.gpu_work_items_per_dim[0]%256)) : 183 | 0; 184 | // LAST 185 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.luma_search_last_2x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 186 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 187 | // GOLDEN 188 | if (use_golden) 189 | { 190 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.luma_search_golden_2x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 191 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 192 | } 193 | // ALTREF 194 | if (use_altref) 195 | { 196 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue3_gpu, device.luma_search_altref_2x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 197 | device.state_gpu = ifFlush(device.commandQueue3_gpu); 198 | } 199 | 200 | //now search in original size 201 | device.gpu_work_items_per_dim[0] = video.mb_count*4; 202 | device.gpu_work_items_per_dim[0] += (device.gpu_work_items_per_dim[0] % 256) > 0 ? 
203 | (256 - (device.gpu_work_items_per_dim[0]%256)) : 204 | 0; 205 | // LAST 206 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.luma_search_last_1x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 207 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 208 | // GOLDEN 209 | if (use_golden) 210 | { 211 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.luma_search_golden_1x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 212 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 213 | } 214 | // ALTREF 215 | if (use_altref) 216 | { 217 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue3_gpu, device.luma_search_altref_1x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 218 | device.state_gpu = ifFlush(device.commandQueue3_gpu); 219 | } 220 | 221 | // search in image with interpolation on the run 222 | // LAST 223 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.luma_search_last_d4x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 224 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 225 | // GOLDEN 226 | if (use_golden) 227 | { 228 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.luma_search_golden_d4x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 229 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 230 | } 231 | // ALTREF 232 | if (use_altref) 233 | { 234 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue3_gpu, device.luma_search_altref_d4x, 1, NULL, device.gpu_work_items_per_dim, device.gpu_work_group_size_per_dim, 0, NULL, NULL); 235 | device.state_gpu = ifFlush(device.commandQueue3_gpu); 236 | } 237 | 238 | //device.state_gpu = clFinish(device.commandQueue1_gpu); 239 | if 
(use_golden||use_altref) 240 | device.state_gpu = finalFlush(device.commandQueue1_gpu); 241 | if (use_golden) 242 | device.state_gpu = finalFlush(device.commandQueue2_gpu); 243 | if (use_altref) 244 | device.state_gpu = finalFlush(device.commandQueue3_gpu); 245 | if (use_golden) 246 | device.state_gpu = clFinish(device.commandQueue2_gpu); 247 | if (use_altref) 248 | device.state_gpu = clFinish(device.commandQueue3_gpu); 249 | 250 | // now set each MB with the best reference 251 | device.gpu_work_items_per_dim[0] = video.mb_count; 252 | device.state_gpu = clSetKernelArg(device.select_reference, 9, sizeof(cl_int), &use_golden); 253 | device.state_gpu = clSetKernelArg(device.select_reference, 10, sizeof(cl_int), &use_altref); 254 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.select_reference, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 255 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 256 | // set 16x16 mode for macroblocks, whose blocks have identical vectors 257 | device.gpu_work_items_per_dim[0] = video.mb_count; 258 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.pack_8x8_into_16x16, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 259 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 260 | 261 | device.state_gpu = clFinish(device.commandQueue1_gpu); //we need to finish packing before start preparing predictors for golden or altref and before reading buffers 262 | 263 | device.state_gpu = clEnqueueReadBuffer(device.dataCopy_gpu, device.macroblock_parts_gpu, CL_FALSE, 0, video.mb_count*sizeof(cl_int), frames.MB_parts, 0, NULL, NULL); 264 | device.state_gpu = clEnqueueReadBuffer(device.dataCopy_gpu, device.macroblock_reference_frame_gpu, CL_FALSE, 0, video.mb_count*sizeof(cl_int), frames.MB_reference_frame, 0, NULL, NULL); 265 | device.state_gpu = clEnqueueReadBuffer(device.dataCopy_gpu, device.macroblock_vectors_gpu, CL_FALSE, 0, 
video.mb_count*sizeof(macroblock_vectors_t), frames.MB_vectors, 0, NULL, NULL); 266 | device.state_gpu = clFlush(device.dataCopy_gpu); 267 | 268 | // now for each plane and reference frame fill predictors and residual buffers 269 | cl_int ref; 270 | cl_int cwidth = video.wrk_width/2; 271 | // Y 272 | device.gpu_work_items_per_dim[0] = video.mb_count*16; 273 | // LAST 274 | ref = LAST; 275 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.prepare_predictors_and_residual_last_Y, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 276 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 277 | // GOLDEN 278 | if (use_golden) 279 | { 280 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.prepare_predictors_and_residual_golden_Y, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 281 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 282 | } 283 | // ALTREF 284 | if (use_altref) 285 | { 286 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue3_gpu, device.prepare_predictors_and_residual_altref_Y, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 287 | device.state_gpu = ifFlush(device.commandQueue3_gpu); 288 | } 289 | // U 290 | device.gpu_work_items_per_dim[0] = video.mb_count*4; 291 | // LAST 292 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.prepare_predictors_and_residual_last_U, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 293 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 294 | // GOLDEN 295 | if (use_golden) 296 | { 297 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.prepare_predictors_and_residual_golden_U, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 298 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 299 | } 300 | // ALTREF 301 | if (use_altref) 302 | { 303 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue3_gpu, 
device.prepare_predictors_and_residual_altref_U, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 304 | device.state_gpu = ifFlush(device.commandQueue3_gpu); 305 | } 306 | // V 307 | // LAST 308 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.prepare_predictors_and_residual_last_V, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 309 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 310 | // GOLDEN 311 | if (use_golden) 312 | { 313 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.prepare_predictors_and_residual_golden_V, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 314 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 315 | } 316 | // ALTREF 317 | if (use_altref) 318 | { 319 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue3_gpu, device.prepare_predictors_and_residual_altref_V, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 320 | device.state_gpu = ifFlush(device.commandQueue3_gpu); 321 | } 322 | 323 | device.state_gpu = finalFlush(device.commandQueue1_gpu); 324 | device.state_gpu = finalFlush(device.commandQueue3_gpu); 325 | device.state_gpu = clFinish(device.commandQueue2_gpu); 326 | device.state_gpu = clFinish(device.commandQueue3_gpu); 327 | 328 | // now for each segment (begin with highest quantizer (last index)) 329 | for (cl_int seg_id = LQ_segment; seg_id >= UQ_segment; --seg_id) 330 | { 331 | device.state_gpu = clFinish(device.commandQueue1_gpu); 332 | 333 | //dct Y 334 | device.gpu_work_items_per_dim[0] = video.mb_count*16; 335 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.dct4x4_Y[seg_id], 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 336 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 337 | //dct U 338 | device.gpu_work_items_per_dim[0] = video.mb_count*4; 339 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.dct4x4_U[seg_id], 1, 
NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 340 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 341 | //dct V 342 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue3_gpu, device.dct4x4_V[seg_id], 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 343 | device.state_gpu = ifFlush(device.commandQueue3_gpu); 344 | //wht and iwht 345 | device.gpu_work_items_per_dim[0] = video.mb_count; 346 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.wht4x4_iwht4x4[seg_id], 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 347 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 348 | //idct Y 349 | device.gpu_work_items_per_dim[0] = video.mb_count*16; 350 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.idct4x4_Y[seg_id], 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 351 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 352 | //idct U 353 | device.gpu_work_items_per_dim[0] = video.mb_count*4; 354 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.idct4x4_U[seg_id], 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 355 | device.state_gpu = ifFlush(device.commandQueue2_gpu); 356 | //idct V 357 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue3_gpu, device.idct4x4_V[seg_id], 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 358 | device.state_gpu = ifFlush(device.commandQueue3_gpu); 359 | 360 | //count SSIM 361 | device.gpu_work_items_per_dim[0] = video.mb_count; 362 | //Y 363 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.count_SSIM_luma[seg_id], 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 364 | device.state_gpu = ifFlush(device.commandQueue1_gpu); 365 | //U 366 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue2_gpu, device.count_SSIM_chroma_U[seg_id], 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 367 | 
device.state_gpu = ifFlush(device.commandQueue2_gpu); 368 | //V 369 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue3_gpu, device.count_SSIM_chroma_V[seg_id], 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 370 | device.state_gpu = ifFlush(device.commandQueue3_gpu); 371 | 372 | device.state_gpu = finalFlush(device.commandQueue1_gpu); 373 | device.state_gpu = finalFlush(device.commandQueue3_gpu); 374 | device.state_gpu = clFinish(device.commandQueue2_gpu); 375 | device.state_gpu = clFinish(device.commandQueue3_gpu); 376 | 377 | device.state_gpu = clEnqueueNDRangeKernel(device.commandQueue1_gpu, device.gather_SSIM, 1, NULL, device.gpu_work_items_per_dim, NULL, 0, NULL, NULL); 378 | } 379 | 380 | if (device.state_gpu != 0) 381 | printf("bad kernel %d",device.state_gpu); 382 | 383 | return; 384 | } -------------------------------------------------------------------------------- /src/vp8enc.cpp: -------------------------------------------------------------------------------- 1 | // all global structure definitions (fileContext, videoContext, deviceContext...) 2 | #include "vp8enc.h" 3 | 4 | //these are global variables used all over the encoder 5 | struct fileContext input_file, //YUV4MPEG2 6 | output_file, //IVF 7 | error_file, //TXT for OpenCl compiler errors 8 | dump_file; //YUV4MPEG2 dump of reconstructed frames 9 | struct deviceContext device; //both GPU and CPU OpenCL-devices (different handles, memory-objects, commlines...) 10 | struct videoContext video; //properties of the video (sizes, indicies, vector limits...) 11 | struct hostFrameBuffers frames; // host buffers, frame number, current/previous frame flags... 
12 | struct encoderStatistics encStat; 13 | 14 | static cl_int ifFlush(cl_command_queue comm) 15 | { 16 | #ifdef ALLWAYS_FLUSH 17 | const cl_int ret = clFlush(comm); 18 | //const cl_int ret = clFinish(comm); 19 | if (ret < 0) 20 | printf("flush fail\n"); 21 | return ret; 22 | #endif //ALLWAYS_FLUSH 23 | return 0; 24 | } 25 | 26 | static cl_int finalFlush(cl_command_queue comm) 27 | { 28 | #ifndef ALLWAYS_FLUSH 29 | const cl_int ret = clFlush(comm); 30 | if (ret < 0) 31 | printf("flush fail\n"); 32 | return ret; 33 | #endif //ALLWAYS_FLUSH 34 | return 0; 35 | } 36 | 37 | #include "encIO.h" 38 | #include "init.h" 39 | #include "intra_part.h" 40 | #include "inter_part.h" 41 | #include "loop_filter.h" 42 | #include "debug.h" 43 | 44 | ////////////////// transforms are taken from multimedia mike's encoder version 45 | 46 | extern void encode_header(cl_uchar *const partition); 47 | 48 | static void entropy_encode() 49 | { 50 | if (frames.threads_free < video.number_of_partitions) { 51 | device.state_gpu = finalFlush(device.loopfilterU_commandQueue_cpu); 52 | device.state_gpu = finalFlush(device.loopfilterV_commandQueue_cpu); 53 | device.state_cpu = clFinish(device.loopfilterY_commandQueue_cpu); 54 | device.state_cpu = clFinish(device.loopfilterU_commandQueue_cpu); 55 | device.state_cpu = clFinish(device.loopfilterV_commandQueue_cpu); 56 | frames.threads_free = video.thread_limit; 57 | } 58 | 59 | // here we start preparing DCT coefficient probabilities for frame 60 | // by calculating average for all situations 61 | // count_probs - accumulate numerators(num) and denominators(denom) 62 | // for each context 63 | // num[i][j][k][l] - amount of ZEROs which must be coded in i,j,k,l context 64 | // denom[i][j][k][l] - amount of bits(both 0 and 1) in i,j,k,l context to be coded 65 | device.cpu_work_items_per_dim[0] = video.number_of_partitions; 66 | device.cpu_work_group_size_per_dim[0] = 1; 67 | clEnqueueNDRangeKernel(device.boolcoder_commandQueue_cpu, device.count_probs, 1, 
NULL, device.cpu_work_items_per_dim, device.cpu_work_group_size_per_dim, 0, NULL, NULL); 68 | frames.threads_free -= video.number_of_partitions; 69 | 70 | // just dividing nums by denoms and getting probability of bit being ZERO 71 | clEnqueueNDRangeKernel(device.boolcoder_commandQueue_cpu, device.num_div_denom, 1, NULL, device.cpu_work_items_per_dim, device.cpu_work_group_size_per_dim, 0, NULL, NULL); 72 | 73 | // read calculated values 74 | clEnqueueReadBuffer(device.boolcoder_commandQueue_cpu, device.coeff_probs ,CL_TRUE, 0, 11*3*8*4*sizeof(cl_uint), frames.new_probs, 0, NULL, NULL); 75 | clEnqueueReadBuffer(device.boolcoder_commandQueue_cpu, device.coeff_probs_denom ,CL_TRUE, 0, 11*3*8*4*sizeof(cl_uint), frames.new_probs_denom, 0, NULL, NULL); 76 | { int i,j,k,l; 77 | for (i = 0; i < 4; ++i) 78 | for (j = 0; j < 8; ++j) 79 | for (k = 0; k < 3; ++k) 80 | for (l = 0; l < 11; ++l) 81 | if (frames.new_probs_denom[i][j][k][l] < 2) // this situation never happened (no bit encoded with this context) 82 | frames.new_probs[i][j][k][l] = k_default_coeff_probs[i][j][k][l]; 83 | } 84 | device.state_gpu = clEnqueueWriteBuffer(device.boolcoder_commandQueue_cpu, device.coeff_probs, CL_FALSE, 0, 11*3*8*4*sizeof(cl_uint), frames.new_probs, 0, NULL, NULL); 85 | 86 | // start of encoding coefficients 87 | clEnqueueNDRangeKernel(device.boolcoder_commandQueue_cpu, device.encode_coefficients, 1, NULL, device.cpu_work_items_per_dim, device.cpu_work_group_size_per_dim, 0, NULL, NULL); 88 | ifFlush(device.boolcoder_commandQueue_cpu); // we don't need result until gather_frame(), so no block now 89 | 90 | // encoding header is done as a part of HOST code placed in entropy_host.c[pp]|entropy_host.h 91 | encode_header(frames.encoded_frame); 92 | 93 | return; 94 | } 95 | 96 | static void get_loopfilter_strength(int *const __restrict red, cl_int *const __restrict sh) 97 | { 98 | int i,j, avg = 0, div = 0; 99 | for(i = 0; i < video.wrk_frame_size_luma; ++i) 100 | avg += frames.current_Y[i]; 
101 | avg += video.wrk_frame_size_luma/2; 102 | avg /= video.wrk_frame_size_luma; 103 | *red = (avg*5/255) + 3; 104 | 105 | for(i = 1; i < video.wrk_height - 1; ++i) 106 | for(j = 1; j < video.wrk_width - 1; ++j) 107 | { 108 | const int p = i*video.wrk_width + j; 109 | avg = frames.current_Y[p - video.wrk_width - 1] + 110 | frames.current_Y[p - video.wrk_width] + 111 | frames.current_Y[p - video.wrk_width + 1] + 112 | frames.current_Y[p - 1] + 113 | frames.current_Y[p + 1] + 114 | frames.current_Y[p + video.wrk_width - 1] + 115 | frames.current_Y[p + video.wrk_width] + 116 | frames.current_Y[p + video.wrk_width + 1]; 117 | avg /= 8; 118 | div += (frames.current_Y[p] - avg)*(frames.current_Y[p] - avg); 119 | } 120 | div += (video.wrk_height - 1)*(video.wrk_width - 1)/2; 121 | div /= (video.wrk_height - 1)*(video.wrk_width - 1); 122 | 123 | *sh = div/8; 124 | *sh = (*sh > 7) ? 7 : *sh; 125 | 126 | return; 127 | } 128 | 129 | static void prepare_segments_data(const int update_filter = 0, const int shrpnss = 0) 130 | { 131 | int i,qi; 132 | int *refqi; 133 | if (frames.current_is_key_frame) 134 | { 135 | frames.segments_data[0].y_dc_idelta = 15; //these ones are equal for all segments 136 | frames.segments_data[0].y2_dc_idelta = 0; // but i am lazy to create new buffer for them 137 | frames.segments_data[0].y2_ac_idelta = 0; // because it's also should be copied to GPU 138 | frames.segments_data[0].uv_dc_idelta = 0; 139 | frames.segments_data[0].uv_ac_idelta = 0; 140 | } 141 | else 142 | { 143 | frames.segments_data[0].y_dc_idelta = 15; 144 | frames.segments_data[0].y2_dc_idelta = 0; 145 | frames.segments_data[0].y2_ac_idelta = 0; 146 | frames.segments_data[0].uv_dc_idelta = -15; 147 | frames.segments_data[0].uv_ac_idelta = -15; 148 | } 149 | if (frames.current_is_altref_frame) 150 | refqi = video.altrefqi; 151 | else refqi = video.lastqi; 152 | 153 | int reductor; 154 | get_loopfilter_strength(&reductor, &video.loop_filter_sharpness); 155 | if (update_filter) 156 | { 
157 | reductor *= 2; 158 | video.loop_filter_sharpness = shrpnss; 159 | } 160 | 161 | 162 | for (i = 0; i < 4; ++i) 163 | { 164 | frames.segments_data[i].y_ac_i = (frames.current_is_key_frame) ? video.qi_min : refqi[i]; 165 | frames.y_ac_q[i] = vp8_ac_qlookup[frames.segments_data[i].y_ac_i]; 166 | qi = frames.segments_data[i].y_ac_i + frames.segments_data[0].y_dc_idelta; 167 | qi = (qi > 127) ? 127 : ((qi < 0) ? 0 : qi); 168 | frames.y_dc_q[i] = vp8_dc_qlookup[qi]; 169 | qi = frames.segments_data[i].y_ac_i + frames.segments_data[0].y2_dc_idelta; 170 | qi = (qi > 127) ? 127 : ((qi < 0) ? 0 : qi); 171 | frames.y2_dc_q[i] = (vp8_dc_qlookup[qi]) << 1; // *2 172 | qi = frames.segments_data[i].y_ac_i + frames.segments_data[0].y2_ac_idelta; 173 | qi = (qi > 127) ? 127 : ((qi < 0) ? 0 : qi); 174 | frames.y2_ac_q[i] = 31 * (vp8_ac_qlookup[qi]) / 20; // *155/100 175 | qi = frames.segments_data[i].y_ac_i + frames.segments_data[0].uv_dc_idelta; 176 | qi = (qi > 127) ? 127 : ((qi < 0) ? 0 : qi); 177 | frames.uv_dc_q[i] = vp8_dc_qlookup[qi]; 178 | qi = frames.segments_data[i].y_ac_i + frames.segments_data[0].uv_ac_idelta; 179 | qi = (qi > 127) ? 127 : ((qi < 0) ? 0 : qi); 180 | frames.uv_ac_q[i] = vp8_ac_qlookup[qi]; 181 | 182 | if (frames.y2_ac_q[i] < 8) 183 | frames.y2_ac_q[i] = 8; 184 | if (frames.uv_dc_q[i] > 132) 185 | frames.uv_dc_q[i] = 132; 186 | 187 | frames.segments_data[i].loop_filter_level = frames.y_dc_q[i]/reductor; 188 | frames.segments_data[i].loop_filter_level = (frames.segments_data[i].loop_filter_level > 63) ? 63 : frames.segments_data[i].loop_filter_level; 189 | frames.segments_data[i].loop_filter_level = (frames.segments_data[i].loop_filter_level < 0) ? 0 : frames.segments_data[i].loop_filter_level; 190 | 191 | 192 | frames.segments_data[i].interior_limit = frames.segments_data[i].loop_filter_level; 193 | if (video.loop_filter_sharpness) { 194 | frames.segments_data[i].interior_limit >>= video.loop_filter_sharpness > 4 ? 
2 : 1; 195 | if (frames.segments_data[i].interior_limit > 9 - video.loop_filter_sharpness) 196 | frames.segments_data[i].interior_limit = 9 - video.loop_filter_sharpness; 197 | } 198 | if (!frames.segments_data[i].interior_limit) 199 | frames.segments_data[i].interior_limit = 1; 200 | 201 | frames.segments_data[i].mbedge_limit = ((frames.segments_data[i].loop_filter_level + 2) * 2) + frames.segments_data[i].interior_limit; 202 | frames.segments_data[i].sub_bedge_limit = (frames.segments_data[i].loop_filter_level * 2) + frames.segments_data[i].interior_limit; 203 | 204 | frames.segments_data[i].hev_threshold = 0; 205 | if (frames.current_is_key_frame) 206 | { 207 | if (frames.segments_data[i].loop_filter_level >= 40) 208 | frames.segments_data[i].hev_threshold = 2; 209 | else if (frames.segments_data[i].loop_filter_level >= 15) 210 | frames.segments_data[i].hev_threshold = 1; 211 | } 212 | else /* current frame is an interframe */ 213 | { 214 | if (frames.segments_data[i].loop_filter_level >= 40) 215 | frames.segments_data[i].hev_threshold = 3; 216 | else if (frames.segments_data[i].loop_filter_level >= 20) 217 | frames.segments_data[i].hev_threshold = 2; 218 | else if (frames.segments_data[i].loop_filter_level >= 15) 219 | frames.segments_data[i].hev_threshold = 1; 220 | } 221 | } 222 | if (video.GOP_size < 2) return; 223 | //always to gpu 224 | device.state_gpu = clEnqueueWriteBuffer(device.commandQueue1_gpu, device.segments_data_gpu, CL_FALSE, 0, sizeof(segment_data)*4, frames.segments_data, 0, NULL, NULL); 225 | // and for loop filter on cpu 226 | if (!video.do_loop_filter_on_gpu) 227 | device.state_cpu = clEnqueueWriteBuffer(device.loopfilterY_commandQueue_cpu, device.segments_data_cpu, CL_FALSE, 0, sizeof(segment_data)*4, frames.segments_data, 0, NULL, NULL); 228 | return; 229 | } 230 | 231 | static void check_SSIM() 232 | { 233 | //device.state_gpu = clEnqueueReadBuffer(device.commandQueue1_gpu, device.reconstructed_frame_Y ,CL_TRUE, 0, 
video.wrk_frame_size_luma, frames.reconstructed_Y, 0, NULL, NULL); 234 | //device.state_gpu = clEnqueueReadBuffer(device.commandQueue2_gpu, device.reconstructed_frame_U ,CL_TRUE, 0, video.wrk_frame_size_chroma, frames.reconstructed_U, 0, NULL, NULL); 235 | //device.state_gpu = clEnqueueReadBuffer(device.commandQueue3_gpu, device.reconstructed_frame_V ,CL_TRUE, 0, video.wrk_frame_size_chroma, frames.reconstructed_V, 0, NULL, NULL); 236 | 237 | frames.new_SSIM = 0; 238 | float min1 = 2.0f, min2 = 2.0f; 239 | int mb_num; 240 | frames.replaced = 0; 241 | 242 | for (mb_num = 0; mb_num < video.mb_count; ++mb_num) 243 | { 244 | min2 = (frames.MB_SSIM[mb_num] < min2) ? frames.MB_SSIM[mb_num] : min2; 245 | if (frames.MB_SSIM[mb_num] < video.SSIM_target) 246 | frames.e_data[mb_num].is_inter_mb = (test_inter_on_intra(mb_num, AQ_segment) == 0) ? 0 : frames.e_data[mb_num].is_inter_mb; 247 | if (frames.MB_SSIM[mb_num] < video.SSIM_target) 248 | frames.e_data[mb_num].is_inter_mb = (test_inter_on_intra(mb_num, HQ_segment) == 0) ? 0 : frames.e_data[mb_num].is_inter_mb; 249 | if (frames.MB_SSIM[mb_num] < video.SSIM_target) 250 | frames.e_data[mb_num].is_inter_mb = (test_inter_on_intra(mb_num, UQ_segment) == 0) ? 0 : frames.e_data[mb_num].is_inter_mb; 251 | frames.replaced+=(frames.e_data[mb_num].is_inter_mb==0); 252 | 253 | frames.new_SSIM += frames.MB_SSIM[mb_num]; 254 | min1 = (frames.MB_SSIM[mb_num] < min1) ? 
frames.MB_SSIM[mb_num] : min1; 255 | } 256 | 257 | frames.new_SSIM /= (float)video.mb_count; 258 | if (video.print_info) 259 | printf("%d>AvgSSIM=%f; MinSSIM=%f(%f); repl:%d ", frames.frame_number,frames.new_SSIM,min1,min2,frames.replaced); 260 | if (min1 > 0.95) 261 | prepare_segments_data(1, 7); 262 | return; 263 | } 264 | 265 | static int scene_change() 266 | { 267 | static int holdover = 0; 268 | int detect; 269 | // something more sophisticated is desirable 270 | int Udiff = 0, Vdiff = 0, diff, pix; 271 | for (pix = 0; pix < video.wrk_frame_size_chroma; ++pix) 272 | { 273 | diff = (int)frames.last_U[pix] - (int)frames.current_U[pix]; 274 | diff = (diff < 0) ? -diff : diff; 275 | Udiff += diff; 276 | } 277 | Udiff /= video.wrk_frame_size_chroma; 278 | for (pix = 0; pix < video.wrk_frame_size_chroma; ++pix) 279 | { 280 | diff = (int)frames.last_V[pix] - (int)frames.current_V[pix]; 281 | diff = (diff < 0) ? -diff : diff; 282 | Vdiff += diff; 283 | } 284 | Vdiff /= video.wrk_frame_size_chroma; 285 | detect = ((Udiff > 7) || (Vdiff > 7) || (Udiff+Vdiff > 10)); 286 | //workaround to exclude serial intra_frames 287 | // could shrink V 288 | if ((detect) && ((frames.frame_number - frames.last_key_detect) < 4)) 289 | { 290 | frames.last_key_detect = frames.frame_number; 291 | holdover = 1; 292 | return 0; 293 | } 294 | if ((detect) && ((frames.frame_number - frames.last_key_detect) >= 4)) 295 | { 296 | //frames.last_key_detect will be set in intra_transform() 297 | return 1; 298 | } 299 | // then detect == 0 300 | if ((holdover) && ((frames.frame_number - frames.last_key_detect) < 4)) 301 | { 302 | return 0; 303 | } 304 | if ((holdover) && ((frames.frame_number - frames.last_key_detect) >= 4)) 305 | { 306 | holdover = 0; 307 | return 1; 308 | } 309 | 310 | return 0; //no detection and no hold over from previous detections 311 | } 312 | 313 | static void finalize(); 314 | 315 | int main(int argc, char *argv[]) 316 | { 317 | cl_int mb_num; 318 | printf("\n"); 319 | 320 | 
error_file.path = ERRORPATH; 321 | if (ParseArgs(argc, argv) < 0) 322 | { 323 | return -1; 324 | } 325 | 326 | OpenYUV420FileAndParseHeader(); 327 | 328 | printf("initialization started;\n"); 329 | if ((init_all() < 0) || (device.state_cpu != 0) || (device.state_gpu != 0)) 330 | { 331 | return -1; 332 | } 333 | printf("initialization complete;\n"); 334 | 335 | encStat.scene_changes_by_color = 0; 336 | encStat.scene_changes_by_ssim = 0; 337 | encStat.scene_changes_by_replaced = 0; 338 | encStat.scene_changes_by_bitrate = 0; 339 | 340 | frames.frames_until_key = 1; 341 | frames.frames_until_altref = 2; 342 | frames.frame_number = 0; 343 | frames.golden_frame_number = -1; 344 | frames.altref_frame_number = -1; 345 | 346 | write_output_header(); 347 | //open_dump_file(); 348 | 349 | frames.video_size = 0; 350 | frames.encoded_frame_size = 0; 351 | while (get_yuv420_frame() > 0) 352 | { 353 | //grab buffer for host (without copy, we will write it from scratch) 354 | clFinish(device.boolcoder_commandQueue_cpu); 355 | frames.MB = (macroblock_coeffs_t*)clEnqueueMapBuffer(device.loopfilterY_commandQueue_cpu, device.macroblock_coeffs_cpu, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, sizeof(macroblock_coeffs_t)*video.mb_count, 0, NULL, NULL, &device.state_cpu); 356 | if (!video.do_loop_filter_on_gpu) 357 | { 358 | // we can use invalidate for key frames, but read/write will have the same speed (zero-copy) and so we can start memory transfer to GPU a little bit earlier (in case of inter frame) 359 | frames.reconstructed_Y = (unsigned char*)clEnqueueMapBuffer(device.loopfilterY_commandQueue_cpu, device.cpu_frame_Y, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, sizeof(unsigned char)*video.wrk_frame_size_luma, 0, NULL, NULL, &device.state_cpu); 360 | frames.reconstructed_U = (unsigned char*)clEnqueueMapBuffer(device.loopfilterU_commandQueue_cpu, device.cpu_frame_U, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, sizeof(unsigned char)*video.wrk_frame_size_chroma, 0, NULL, NULL, &device.state_cpu); 
361 | frames.reconstructed_V = (unsigned char*)clEnqueueMapBuffer(device.loopfilterV_commandQueue_cpu, device.cpu_frame_V, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, sizeof(unsigned char)*video.wrk_frame_size_chroma, 0, NULL, NULL, &device.state_cpu); 362 | } 363 | 364 | frames.prev_is_key_frame = frames.current_is_key_frame; 365 | frames.prev_is_golden_frame = frames.current_is_golden_frame; 366 | frames.prev_is_altref_frame = frames.current_is_altref_frame; 367 | --frames.frames_until_key; 368 | --frames.frames_until_altref; 369 | frames.current_is_key_frame = (frames.frames_until_key < 1); 370 | frames.current_is_golden_frame = frames.current_is_key_frame; 371 | frames.current_is_altref_frame = (frames.frames_until_altref < 1) || frames.current_is_key_frame; 372 | frames.frames_until_altref = ((frames.frames_until_altref < 1) || frames.current_is_key_frame) ? video.altref_range :frames.frames_until_altref; 373 | frames.golden_frame_number = (frames.current_is_golden_frame) ? frames.frame_number : frames.golden_frame_number; 374 | frames.altref_frame_number = (frames.current_is_altref_frame) ? 
frames.frame_number : frames.altref_frame_number; 375 | 376 | frames.prev_frame_size = frames.encoded_frame_size; 377 | frames.video_size += frames.encoded_frame_size; 378 | 379 | if (frames.current_is_key_frame) 380 | { 381 | prepare_segments_data(); 382 | intra_transform(); 383 | } 384 | else 385 | { 386 | device.state_gpu = clEnqueueWriteBuffer(device.commandQueue2_gpu, device.current_frame_Y, CL_FALSE, 0, video.wrk_frame_size_luma, frames.current_Y, 0, NULL, NULL); 387 | device.state_gpu = clEnqueueWriteBuffer(device.commandQueue3_gpu, device.current_frame_U, CL_FALSE, 0, video.wrk_frame_size_chroma, frames.current_U, 0, NULL, NULL); 388 | device.state_gpu = clEnqueueWriteBuffer(device.commandQueue3_gpu, device.current_frame_V, CL_FALSE, 0, video.wrk_frame_size_chroma, frames.current_V, 0, NULL, NULL); 389 | if (!video.do_loop_filter_on_gpu) 390 | { 391 | const size_t origin[3] = {0, 0, 0}; 392 | const size_t region_y[3] = {video.wrk_width, video.wrk_height, 1}; 393 | const size_t region_uv[3] = {video.wrk_width/2, video.wrk_height/2, 1}; 394 | 395 | device.state_gpu = clEnqueueWriteBuffer(device.commandQueue1_gpu, device.reconstructed_frame_Y, CL_FALSE, 0, video.wrk_frame_size_luma, frames.reconstructed_Y, 0, NULL, NULL); 396 | device.state_gpu = clEnqueueWriteBuffer(device.commandQueue3_gpu, device.reconstructed_frame_U, CL_FALSE, 0, video.wrk_frame_size_chroma, frames.reconstructed_U, 0, NULL, NULL); 397 | device.state_gpu = clEnqueueWriteBuffer(device.commandQueue3_gpu, device.reconstructed_frame_V, CL_FALSE, 0, video.wrk_frame_size_chroma, frames.reconstructed_V, 0, NULL, NULL); 398 | 399 | device.state_gpu = clEnqueueWriteImage(device.commandQueue3_gpu, device.last_frame_Y_image, CL_FALSE, origin, region_y, 0, 0, frames.reconstructed_Y, 0, NULL, NULL); 400 | device.state_gpu = clEnqueueWriteImage(device.commandQueue3_gpu, device.last_frame_U_image, CL_FALSE, origin, region_uv, 0, 0, frames.reconstructed_U, 0, NULL, NULL); 401 | device.state_gpu = 
clEnqueueWriteImage(device.commandQueue3_gpu, device.last_frame_V_image, CL_FALSE, origin, region_uv, 0, 0, frames.reconstructed_V, 0, NULL, NULL); 402 | 403 | clFlush(device.commandQueue2_gpu); 404 | } 405 | clFlush(device.commandQueue1_gpu); 406 | clFlush(device.commandQueue3_gpu); 407 | 408 | const int new_scene = scene_change(); 409 | if (new_scene) 410 | { 411 | ++encStat.scene_changes_by_color; 412 | frames.current_is_key_frame = 1; 413 | prepare_segments_data(); // redo because loop filtering differs 414 | intra_transform(); 415 | printf("key frame FORCED by chroma color difference!\n"); 416 | } 417 | else 418 | { 419 | prepare_segments_data(); 420 | inter_transform(); 421 | // copy transformed_blocks to host 422 | device.state_gpu = clEnqueueReadBuffer(device.commandQueue1_gpu, device.macroblock_coeffs_gpu, CL_FALSE, 0, video.mb_count*sizeof(macroblock_coeffs_t), frames.MB, 0, NULL, NULL); 423 | device.state_gpu = clEnqueueReadBuffer(device.commandQueue1_gpu, device.macroblock_segment_id_gpu, CL_FALSE, 0, video.mb_count*sizeof(cl_int), frames.MB_segment_id, 0, NULL, NULL); 424 | device.state_gpu = clEnqueueReadBuffer(device.commandQueue1_gpu, device.macroblock_SSIM_gpu, CL_FALSE, 0, video.mb_count*sizeof(cl_float), frames.MB_SSIM, 0, NULL, NULL); 425 | 426 | // these buffers are copied inside inter_transform() as soon as they are ready 427 | //device.state_gpu = clEnqueueReadBuffer(device.commandQueue1_gpu, device.macroblock_parts_gpu, CL_FALSE, 0, video.mb_count*sizeof(cl_int), frames.MB_parts, 0, NULL, NULL); 428 | //device.state_gpu = clEnqueueReadBuffer(device.commandQueue1_gpu, device.macroblock_reference_frame_gpu, CL_FALSE, 0, video.mb_count*sizeof(cl_int), frames.MB_reference_frame, 0, NULL, NULL); 429 | //device.state_gpu = clEnqueueReadBuffer(device.commandQueue1_gpu, device.macroblock_vectors_gpu, CL_FALSE, 0, video.mb_count*sizeof(macroblock_vectors_t), frames.MB_vectors, 0, NULL, NULL); 430 | 431 | device.state_gpu = 
clEnqueueReadBuffer(device.commandQueue1_gpu, device.reconstructed_frame_Y ,CL_FALSE, 0, video.wrk_frame_size_luma, frames.reconstructed_Y, 0, NULL, NULL); 432 | device.state_gpu = clEnqueueReadBuffer(device.commandQueue1_gpu, device.reconstructed_frame_U ,CL_FALSE, 0, video.wrk_frame_size_chroma, frames.reconstructed_U, 0, NULL, NULL); 433 | device.state_gpu = clEnqueueReadBuffer(device.commandQueue1_gpu, device.reconstructed_frame_V ,CL_FALSE, 0, video.wrk_frame_size_chroma, frames.reconstructed_V, 0, NULL, NULL); 434 | clFlush(device.commandQueue1_gpu); 435 | 436 | for(mb_num = 0; mb_num < video.mb_count; ++mb_num) 437 | frames.e_data[mb_num].is_inter_mb = 1; 438 | 439 | clFinish(device.dataCopy_gpu); 440 | clFinish(device.commandQueue1_gpu); 441 | 442 | check_SSIM(); 443 | if ((frames.replaced > (video.mb_count/6)) || (frames.new_SSIM < video.SSIM_target)) 444 | { 445 | if (frames.new_SSIM < video.SSIM_target) ++encStat.scene_changes_by_ssim; 446 | else ++encStat.scene_changes_by_replaced; 447 | // redo as intra 448 | frames.current_is_key_frame = 1; 449 | prepare_segments_data(); 450 | intra_transform(); 451 | if (video.print_info) 452 | printf("\nkey frame FORCED by bad inter-result: replaced(%d) and SSIM(%f)!\n",frames.replaced,frames.new_SSIM); 453 | } 454 | 455 | } 456 | } 457 | // searching for MBs to be skiped 458 | // copy (unmap) coefficients back to pinned cpu device memory 459 | // we will need it for loop filter(if on cpu) and for entropy encoding (always) 460 | clEnqueueUnmapMemObject(device.loopfilterY_commandQueue_cpu, device.macroblock_coeffs_cpu, frames.MB, 0, NULL, NULL); 461 | if (!video.do_loop_filter_on_gpu) 462 | { 463 | clEnqueueUnmapMemObject(device.loopfilterY_commandQueue_cpu, device.cpu_frame_Y, frames.reconstructed_Y, 0, NULL, NULL); 464 | clEnqueueUnmapMemObject(device.loopfilterY_commandQueue_cpu, device.cpu_frame_U, frames.reconstructed_U, 0, NULL, NULL); 465 | clEnqueueUnmapMemObject(device.loopfilterY_commandQueue_cpu, 
device.cpu_frame_V, frames.reconstructed_V, 0, NULL, NULL); 466 | } 467 | // we need not to block here, even if two queues are using this object (prepare process will always wait for results) 468 | device.state_gpu = clEnqueueWriteBuffer(device.loopfilterY_commandQueue_cpu, device.macroblock_parts_cpu, CL_FALSE, 0, video.mb_count*sizeof(cl_int), frames.MB_parts, 0, NULL, NULL); 469 | device.state_gpu = clEnqueueWriteBuffer(device.loopfilterY_commandQueue_cpu, device.macroblock_segment_id_cpu, CL_FALSE, 0, video.mb_count*sizeof(cl_int), frames.MB_segment_id, 0, NULL, NULL); 470 | clFinish(device.loopfilterY_commandQueue_cpu); 471 | 472 | prepare_filter_mask_and_non_zero_coeffs(); 473 | do_loop_filter(); 474 | 475 | //TODO if (video.do_loop_filter_on_gpu) 476 | //TODO device.state_cpu = clEnqueueWriteBuffer(device.boolcoder_commandQueue_cpu, device.transformed_blocks_cpu, CL_FALSE, 0, video.mb_count*sizeof(macroblock), frames.transformed_blocks, 0, NULL, NULL); 477 | // else already there because were uploaded before loop filter 478 | entropy_encode(); 479 | 480 | //if ((frames.frame_number % video.framerate) == 0) printf("second %d encoded\n", frames.frame_number/video.framerate); 481 | gather_frame(); 482 | if (video.print_info) 483 | printf("br=%dk, frame~%dk\n", (int)(frames.video_size*video.framerate*8/(frames.frame_number+1)/1024), (frames.encoded_frame_size+512)/1024); 484 | 485 | //dump(); 486 | write_output_file(); 487 | ++frames.frame_number; 488 | } 489 | write_output_header(); 490 | //fclose(dump_file.handle); 491 | finalize(); 492 | 493 | printf("%d scene changes detected by color change\n", encStat.scene_changes_by_color); 494 | printf("%d scene changes detected by low ssim value\n", encStat.scene_changes_by_ssim); 495 | printf("%d scene changes detected by high amount of replaced blocks\n", encStat.scene_changes_by_replaced); 496 | printf("%d scene changes detected by bitrate raise\n", encStat.scene_changes_by_bitrate); 497 | //getch(); 498 | return 
777; 499 | } 500 | 501 | void finalize() 502 | { 503 | fclose(input_file.handle); 504 | fclose(output_file.handle); 505 | 506 | clReleaseMemObject(device.coeff_probs); 507 | clReleaseMemObject(device.coeff_probs_denom); 508 | clReleaseMemObject(device.partitions); 509 | clReleaseMemObject(device.partitions_sizes); 510 | clReleaseMemObject(device.third_context); 511 | clReleaseMemObject(device.macroblock_coeffs_cpu); 512 | clReleaseMemObject(device.macroblock_non_zero_coeffs_cpu); 513 | clReleaseMemObject(device.macroblock_parts_cpu); 514 | clReleaseMemObject(device.macroblock_segment_id_cpu); 515 | clReleaseMemObject(device.segments_data_cpu); 516 | if (video.GOP_size > 1) 517 | { 518 | clReleaseMemObject(device.current_frame_U); 519 | clReleaseMemObject(device.current_frame_V); 520 | clReleaseMemObject(device.current_frame_Y); 521 | clReleaseMemObject(device.current_frame_Y_downsampled_by2); 522 | clReleaseMemObject(device.current_frame_Y_downsampled_by4); 523 | clReleaseMemObject(device.current_frame_Y_downsampled_by8); 524 | clReleaseMemObject(device.current_frame_Y_downsampled_by16); 525 | clReleaseMemObject(device.reconstructed_frame_U); 526 | clReleaseMemObject(device.reconstructed_frame_V); 527 | clReleaseMemObject(device.reconstructed_frame_Y); 528 | clReleaseMemObject(device.last_frame_Y_image); 529 | clReleaseMemObject(device.last_frame_U_image); 530 | clReleaseMemObject(device.last_frame_V_image); 531 | clReleaseMemObject(device.last_frame_Y_downsampled_by2); 532 | clReleaseMemObject(device.last_frame_Y_downsampled_by4); 533 | clReleaseMemObject(device.last_frame_Y_downsampled_by8); 534 | clReleaseMemObject(device.last_frame_Y_downsampled_by16); 535 | clReleaseMemObject(device.golden_frame_Y); 536 | clReleaseMemObject(device.golden_frame_Y_image); 537 | clReleaseMemObject(device.golden_frame_U_image); 538 | clReleaseMemObject(device.golden_frame_V_image); 539 | clReleaseMemObject(device.golden_frame_Y_downsampled_by2); 540 | 
clReleaseMemObject(device.golden_frame_Y_downsampled_by4); 541 | clReleaseMemObject(device.golden_frame_Y_downsampled_by8); 542 | clReleaseMemObject(device.golden_frame_Y_downsampled_by16); 543 | clReleaseMemObject(device.altref_frame_Y); 544 | clReleaseMemObject(device.altref_frame_Y_image); 545 | clReleaseMemObject(device.altref_frame_U_image); 546 | clReleaseMemObject(device.altref_frame_V_image); 547 | clReleaseMemObject(device.altref_frame_Y_downsampled_by2); 548 | clReleaseMemObject(device.altref_frame_Y_downsampled_by4); 549 | clReleaseMemObject(device.altref_frame_Y_downsampled_by8); 550 | clReleaseMemObject(device.altref_frame_Y_downsampled_by16); 551 | clReleaseMemObject(device.predictors_Y); 552 | clReleaseMemObject(device.predictors_U); 553 | clReleaseMemObject(device.predictors_V); 554 | clReleaseMemObject(device.residual_Y); 555 | clReleaseMemObject(device.residual_U); 556 | clReleaseMemObject(device.residual_V); 557 | clReleaseMemObject(device.cpu_frame_Y); 558 | clReleaseMemObject(device.cpu_frame_U); 559 | clReleaseMemObject(device.cpu_frame_V); 560 | clReleaseMemObject(device.macroblock_coeffs_gpu); 561 | clReleaseMemObject(device.macroblock_non_zero_coeffs_gpu); 562 | clReleaseMemObject(device.macroblock_parts_gpu); 563 | clReleaseMemObject(device.macroblock_reference_frame_gpu); 564 | clReleaseMemObject(device.macroblock_segment_id_gpu); 565 | clReleaseMemObject(device.macroblock_SSIM_gpu); 566 | clReleaseMemObject(device.macroblock_vectors_gpu); 567 | clReleaseMemObject(device.segments_data_gpu); 568 | clReleaseMemObject(device.last_vnet1); 569 | clReleaseMemObject(device.last_vnet2); 570 | clReleaseMemObject(device.golden_vnet1); 571 | clReleaseMemObject(device.golden_vnet2); 572 | clReleaseMemObject(device.altref_vnet1); 573 | clReleaseMemObject(device.altref_vnet2); 574 | clReleaseMemObject(device.mb_mask); 575 | clReleaseMemObject(device.metrics1); 576 | clReleaseMemObject(device.metrics2); 577 | clReleaseMemObject(device.metrics3); 578 | 
clReleaseKernel(device.reset_vectors); 579 | clReleaseKernel(device.downsample_last_1x_to_2x); 580 | clReleaseKernel(device.downsample_last_2x_to_4x); 581 | clReleaseKernel(device.downsample_last_4x_to_8x); 582 | clReleaseKernel(device.downsample_last_8x_to_16x); 583 | clReleaseKernel(device.downsample_current_1x_to_2x); 584 | clReleaseKernel(device.downsample_current_2x_to_4x); 585 | clReleaseKernel(device.downsample_current_4x_to_8x); 586 | clReleaseKernel(device.downsample_current_8x_to_16x); 587 | clReleaseKernel(device.luma_search_last_16x); 588 | clReleaseKernel(device.luma_search_golden_16x); 589 | clReleaseKernel(device.luma_search_altref_16x); 590 | clReleaseKernel(device.luma_search_last_8x); 591 | clReleaseKernel(device.luma_search_golden_8x); 592 | clReleaseKernel(device.luma_search_altref_8x); 593 | clReleaseKernel(device.luma_search_last_4x); 594 | clReleaseKernel(device.luma_search_golden_4x); 595 | clReleaseKernel(device.luma_search_altref_4x); 596 | clReleaseKernel(device.luma_search_last_2x); 597 | clReleaseKernel(device.luma_search_golden_2x); 598 | clReleaseKernel(device.luma_search_altref_2x); 599 | clReleaseKernel(device.luma_search_last_1x); 600 | clReleaseKernel(device.luma_search_golden_1x); 601 | clReleaseKernel(device.luma_search_altref_1x); 602 | clReleaseKernel(device.luma_search_last_d4x); 603 | clReleaseKernel(device.luma_search_golden_d4x); 604 | clReleaseKernel(device.luma_search_altref_d4x); 605 | clReleaseKernel(device.select_reference); 606 | clReleaseKernel(device.prepare_predictors_and_residual_last_Y); 607 | clReleaseKernel(device.prepare_predictors_and_residual_last_U); 608 | clReleaseKernel(device.prepare_predictors_and_residual_last_V); 609 | clReleaseKernel(device.prepare_predictors_and_residual_golden_Y); 610 | clReleaseKernel(device.prepare_predictors_and_residual_golden_U); 611 | clReleaseKernel(device.prepare_predictors_and_residual_golden_V); 612 | clReleaseKernel(device.prepare_predictors_and_residual_altref_Y); 613 
| clReleaseKernel(device.prepare_predictors_and_residual_altref_U); 614 | clReleaseKernel(device.prepare_predictors_and_residual_altref_V); 615 | clReleaseKernel(device.pack_8x8_into_16x16); 616 | clReleaseKernel(device.dct4x4_Y[UQ_segment]); 617 | clReleaseKernel(device.dct4x4_U[UQ_segment]); 618 | clReleaseKernel(device.dct4x4_V[UQ_segment]); 619 | clReleaseKernel(device.dct4x4_Y[HQ_segment]); 620 | clReleaseKernel(device.dct4x4_U[HQ_segment]); 621 | clReleaseKernel(device.dct4x4_V[HQ_segment]); 622 | clReleaseKernel(device.dct4x4_Y[AQ_segment]); 623 | clReleaseKernel(device.dct4x4_U[AQ_segment]); 624 | clReleaseKernel(device.dct4x4_V[AQ_segment]); 625 | clReleaseKernel(device.dct4x4_Y[LQ_segment]); 626 | clReleaseKernel(device.dct4x4_U[LQ_segment]); 627 | clReleaseKernel(device.dct4x4_V[LQ_segment]); 628 | clReleaseKernel(device.wht4x4_iwht4x4[UQ_segment]); 629 | clReleaseKernel(device.wht4x4_iwht4x4[HQ_segment]); 630 | clReleaseKernel(device.wht4x4_iwht4x4[AQ_segment]); 631 | clReleaseKernel(device.wht4x4_iwht4x4[LQ_segment]); 632 | clReleaseKernel(device.idct4x4_Y[UQ_segment]); 633 | clReleaseKernel(device.idct4x4_U[UQ_segment]); 634 | clReleaseKernel(device.idct4x4_V[UQ_segment]); 635 | clReleaseKernel(device.idct4x4_Y[HQ_segment]); 636 | clReleaseKernel(device.idct4x4_U[HQ_segment]); 637 | clReleaseKernel(device.idct4x4_V[HQ_segment]); 638 | clReleaseKernel(device.idct4x4_Y[AQ_segment]); 639 | clReleaseKernel(device.idct4x4_U[AQ_segment]); 640 | clReleaseKernel(device.idct4x4_V[AQ_segment]); 641 | clReleaseKernel(device.idct4x4_Y[LQ_segment]); 642 | clReleaseKernel(device.idct4x4_U[LQ_segment]); 643 | clReleaseKernel(device.idct4x4_V[LQ_segment]); 644 | clReleaseKernel(device.count_SSIM_luma[UQ_segment]); 645 | clReleaseKernel(device.count_SSIM_luma[HQ_segment]); 646 | clReleaseKernel(device.count_SSIM_luma[AQ_segment]); 647 | clReleaseKernel(device.count_SSIM_luma[LQ_segment]); 648 | clReleaseKernel(device.count_SSIM_chroma_U[UQ_segment]); 649 | 
clReleaseKernel(device.count_SSIM_chroma_U[HQ_segment]); 650 | clReleaseKernel(device.count_SSIM_chroma_U[AQ_segment]); 651 | clReleaseKernel(device.count_SSIM_chroma_U[LQ_segment]); 652 | clReleaseKernel(device.count_SSIM_chroma_V[UQ_segment]); 653 | clReleaseKernel(device.count_SSIM_chroma_V[HQ_segment]); 654 | clReleaseKernel(device.count_SSIM_chroma_V[AQ_segment]); 655 | clReleaseKernel(device.count_SSIM_chroma_V[LQ_segment]); 656 | clReleaseKernel(device.gather_SSIM); 657 | clReleaseKernel(device.prepare_filter_mask); 658 | clReleaseKernel(device.normal_loop_filter_MBH); 659 | clReleaseKernel(device.normal_loop_filter_MBV); 660 | clReleaseKernel(device.loop_filter_frame_luma); 661 | clReleaseKernel(device.loop_filter_frame_chroma_U); 662 | clReleaseKernel(device.loop_filter_frame_chroma_V); 663 | clReleaseCommandQueue(device.commandQueue1_gpu); 664 | clReleaseCommandQueue(device.commandQueue2_gpu); 665 | clReleaseCommandQueue(device.commandQueue3_gpu); 666 | clReleaseCommandQueue(device.dataCopy_gpu); 667 | clReleaseProgram(device.program_gpu); 668 | clReleaseContext(device.context_gpu); 669 | free(device.device_gpu); 670 | } 671 | 672 | clReleaseKernel(device.count_probs); 673 | clReleaseKernel(device.encode_coefficients); 674 | clReleaseKernel(device.num_div_denom); 675 | clReleaseCommandQueue(device.loopfilterY_commandQueue_cpu); 676 | clReleaseCommandQueue(device.loopfilterU_commandQueue_cpu); 677 | clReleaseCommandQueue(device.loopfilterV_commandQueue_cpu); 678 | clReleaseCommandQueue(device.boolcoder_commandQueue_cpu); 679 | clReleaseProgram(device.program_cpu); 680 | clReleaseContext(device.context_cpu); 681 | 682 | free(frames.input_pack); 683 | free(frames.last_U); 684 | free(frames.last_V); 685 | free(frames.MB_parts); 686 | free(frames.MB_non_zero_coeffs); 687 | free(frames.MB_reference_frame); 688 | free(frames.MB_segment_id); 689 | free(frames.MB_SSIM); 690 | free(frames.MB_vectors); 691 | free(frames.e_data); 692 | free(frames.encoded_frame); 693 
| free(frames.partition_0); 694 | free(frames.partitions); 695 | 696 | if (((video.src_height != video.dst_height) || (video.src_width != video.dst_width)) || 697 | ((video.wrk_height != video.dst_height) || (video.wrk_width != video.dst_width))) 698 | { 699 | free(frames.current_Y); 700 | free(frames.current_U); 701 | free(frames.current_V); 702 | } 703 | 704 | free(device.platforms); 705 | free(device.device_cpu); 706 | 707 | return; 708 | } -------------------------------------------------------------------------------- /bin/CPU_kernels.cl: -------------------------------------------------------------------------------- 1 | #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable 2 | //#pragma OPENCL EXTENSION cl_amd_printf : enable 3 | 4 | typedef short int16_t; 5 | typedef int int32_t; 6 | typedef unsigned int uint32_t; 7 | typedef unsigned char uint8_t; 8 | typedef signed char int8_t; 9 | typedef unsigned short uint16_t; 10 | typedef uint8_t Prob; 11 | typedef int8_t tree_index; 12 | typedef const tree_index Tree[]; 13 | 14 | typedef enum { 15 | are16x16 = 0, 16 | are8x8 = 1, 17 | are4x4 = 2 18 | } partition_mode; 19 | 20 | typedef enum { 21 | intra_segment = 0, 22 | UQ_segment = 0, 23 | HQ_segment = 1, 24 | AQ_segment = 2, 25 | LQ_segment = 3 26 | } segment_ids; 27 | 28 | typedef enum { 29 | LAST = 0, 30 | GOLDEN = 1, 31 | ALTREF = 2 32 | } ref_frame; 33 | 34 | typedef struct { 35 | int16_t coeffs[25][16]; 36 | int32_t vector_x[4]; 37 | int32_t vector_y[4]; 38 | float SSIM; 39 | int non_zero_coeffs; 40 | int parts; 41 | int reference_frame; 42 | segment_ids segment_id; 43 | } macroblock; 44 | 45 | typedef struct { 46 | int y_ac_i; 47 | int y_dc_idelta; 48 | int y2_dc_idelta; 49 | int y2_ac_idelta; 50 | int uv_dc_idelta; 51 | int uv_ac_idelta; 52 | int loop_filter_level; 53 | int mbedge_limit; 54 | int sub_bedge_limit; 55 | int interior_limit; 56 | int hev_threshold; 57 | } segment_data; 58 | 59 | typedef struct { 60 | __global uint8_t *output; /* ptr 
to next byte to be written */ 61 | uint32_t range; /* 128 <= range <= 255 */ 62 | uint32_t bottom; /* minimum value of remaining output */ 63 | int32_t bit_count; /* # of shifts before an output byte is available */ 64 | uint32_t count; 65 | } vp8_bool_encoder; 66 | 67 | void init_bool_encoder(vp8_bool_encoder *e, __global uint8_t *start_partition) 68 | { 69 | e->output = start_partition; 70 | e->range = 255; 71 | e->bottom = 0; 72 | e->bit_count = 24; 73 | e->count = 0; 74 | } 75 | 76 | void add_one_to_output(__global uint8_t *q) 77 | { 78 | while( *--q == 255) 79 | *q = 0; 80 | ++*q; 81 | } 82 | 83 | void write_bool(vp8_bool_encoder *e, int prob, int bool_value) 84 | { 85 | /* split is approximately (range * prob) / 256 and, crucially, 86 | is strictly bigger than zero and strictly smaller than range */ 87 | uint32_t split = 1 + ( ((e->range - 1) * prob) >> 8); 88 | if( bool_value) { 89 | e->bottom += split; /* move up bottom of interval */ 90 | e->range -= split; /* with corresponding decrease in range */ 91 | } else 92 | e->range = split; 93 | while( e->range < 128) 94 | { 95 | e->range <<= 1; 96 | if( e->bottom & ((uint32_t)1 << 31)) {/* detect carry */ 97 | add_one_to_output(e->output); 98 | } 99 | e->bottom <<= 1; 100 | if( !--e->bit_count) { 101 | *e->output++ = (uint8_t) (e->bottom >> 24); 102 | e->count++; 103 | e->bottom &= (1 << 24) - 1; 104 | e->bit_count = 8; 105 | } 106 | } 107 | } 108 | 109 | void write_flag(vp8_bool_encoder *e, int b) 110 | { 111 | write_bool(e, 128, (b)?1:0); 112 | } 113 | 114 | void write_literal(vp8_bool_encoder *e, int i, int size) 115 | { 116 | int mask = 1 << (size - 1); 117 | while (mask) 118 | { 119 | write_flag(e, !((i & mask) == 0)); 120 | mask >>= 1; 121 | } 122 | } 123 | 124 | void flush_bool_encoder(vp8_bool_encoder *e) 125 | { 126 | int c = e->bit_count; 127 | uint32_t v = e->bottom; 128 | if( v & (1 << (32 - c))) 129 | add_one_to_output(e->output); 130 | v <<= c & 7; 131 | c >>= 3; 132 | while( --c >= 0) 133 | v <<= 
8; 134 | c = 4; 135 | while( --c >= 0) { 136 | /* write remaining data, possibly padded */ 137 | *e->output++ = (uint8_t) (v >> 24); 138 | e->count++; 139 | v <<= 8; 140 | } 141 | } 142 | 143 | typedef enum 144 | { DCT_0, /* value 0 */ 145 | DCT_1, /* 1 */ 146 | DCT_2, /* 2 */ 147 | DCT_3, /* 3 */ 148 | DCT_4, /* 4 */ 149 | dct_cat1, /* range 5 - 6 (size 2) */ 150 | dct_cat2, /* 7 - 10 (4) */ 151 | dct_cat3, /* 11 - 18 (8) */ 152 | dct_cat4, /* 19 - 34 (16) */ 153 | dct_cat5, /* 35 - 66 (32) */ 154 | dct_cat6, /* 67 - 2048 (1982) */ 155 | dct_eob, /* end of block */ 156 | num_dct_tokens /* 12 */ 157 | } dct_token; 158 | typedef struct { 159 | int sign; 160 | int bits; 161 | int size; 162 | int extra_bits; 163 | int extra_size; 164 | __constant Prob* pcat; 165 | } token; 166 | 167 | __constant tree_index coeff_tree [2 * (num_dct_tokens - 1)] = { -dct_eob, 2, /* eob = "0" */ 168 | -DCT_0, 4, /* 0 = "10" */ 169 | -DCT_1, 6, /* 1 = "110" */ 170 | 8, 12, 171 | -DCT_2, 10, /* 2 = "11100" */ 172 | -DCT_3, -DCT_4, /* 3 = "111010", 4 = "111011" */ 173 | 14, 16, 174 | -dct_cat1, -dct_cat2, /* cat1 = "111100", cat2 = "111101" */ 175 | 18, 20, 176 | -dct_cat3, -dct_cat4, /* cat3 = "1111100", cat4 = "1111101" */ 177 | -dct_cat5, -dct_cat6 /* cat5 = "1111110", cat6 = "1111111" */ 178 | }; 179 | __constant Prob Pcat1[] = { 159, 0}; 180 | __constant Prob Pcat2[] = { 165, 145, 0}; 181 | __constant Prob Pcat3[] = { 173, 148, 140, 0}; 182 | __constant Prob Pcat4[] = { 176, 155, 140, 135, 0}; 183 | __constant Prob Pcat5[] = { 180, 157, 141, 134, 130, 0}; 184 | __constant Prob Pcat6[] = { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0}; 185 | __constant int coeff_bands[16] = { 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7}; 186 | 187 | void encode_block(vp8_bool_encoder *vbe, __global uint *coeff_probs, int mb_num, int b_num, token *tokens, int ctx1, uchar ctx3) 188 | { 189 | // ctx1 = 0 for Y beggining at coefficient 1 (when y2 exists) 190 | // = 1 for Y2 191 | // = 2 for U 
or V 192 | // = 3 for Y beggining at coefficient 0 (when Y2 is absent) 193 | // ctx2 = coefficient position in block {(0), 1, 2, 3, ... 15} 194 | // chooses value from coeff_bands[16] = { 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7}; 195 | // ctx3 = for the first(second when ctx1 = 0) coefficient it is equal to number of nearby(above and left only counts) blocks 196 | // with non-zero coefficients 197 | // = for next coefficients it equals: to 0, when previous is zero: to 1, when previous is +1 or -1; to 2 in other cases 198 | // ctx4 = token tree position 199 | int ctx2; 200 | tree_index ctx4; 201 | 202 | int i = ((ctx1 == 0) ? 1 : 0); // maybe (!ctx1) 203 | int prev_is_zero = 0; 204 | 205 | for (; i < 16; ++i) 206 | { 207 | ctx2=coeff_bands[i]; 208 | 209 | // if previous coefficient was DCT_0, then current can't be EOB (inneficient to have 0,0,0,eob) 210 | // since EOB the only token, that has ZERO as highest bit 211 | // then ONE in first bit becomes implicit and doesn't require encoding 212 | 213 | // to handle this we must lower encoding bits size by 1 214 | // and tree_index at 2, instead of 0 (the route is tree[0+1]==2 when we encode "1") 215 | if (prev_is_zero) { 216 | ctx4 = 2; 217 | --(tokens[i].size); 218 | } else ctx4 = 0; 219 | 220 | do { 221 | const int b = (tokens[i].bits >> (--(tokens[i].size))) & 1; 222 | write_bool(vbe, (uchar)coeff_probs[(((ctx1<<3) + ctx2)*3 + ctx3)*11 + (ctx4>>1)], b); 223 | ctx4 = coeff_tree[ctx4+b]; 224 | } while (tokens[i].size); 225 | 226 | if (tokens[i].bits == 0) return; // EOB == "0" 227 | 228 | //now we maybe we have extra bits to encode (if previously dct_catx was encoded) 229 | if (tokens[i].extra_size > 0) 230 | { 231 | int mask = 1 << (tokens[i].extra_size-1); 232 | int j = 0; 233 | while (tokens[i].pcat[j]) 234 | { 235 | write_bool(vbe, tokens[i].pcat[j], (tokens[i].extra_bits & mask) ? 
1 : 0); 236 | ++j; 237 | mask >>= 1; 238 | } 239 | } 240 | 241 | ctx3 = 2; 242 | if (tokens[i].bits == 6) ctx3 = 1; /* DCT__1 = "110" */ 243 | if (tokens[i].bits == 2) { //DCT__0 == "10" 244 | prev_is_zero = 1; 245 | ctx3 = 0; 246 | } 247 | else { 248 | write_bool(vbe, 128, tokens[i].sign); //sign 249 | prev_is_zero = 0; 250 | } 251 | 252 | } 253 | 254 | return; 255 | } 256 | 257 | void tokenize_block(__global macroblock *MBs, int mb_num, int b_num, token tokens[16]) //IF-ELSE 258 | { 259 | int next = 0; // imaginary 17th element 260 | int i; 261 | for (i = 15; i >= 0; --i) // tokenize block 262 | { 263 | int coeff = (int)MBs[mb_num].coeffs[b_num][i]; 264 | tokens[i].sign = (coeff < 0) ? 1 : 0; 265 | coeff = (coeff < 0) ? -coeff : coeff; 266 | tokens[i].extra_bits = 0; 267 | tokens[i].extra_size = 0; 268 | tokens[i].pcat = Pcat1; 269 | if (coeff == 0) { 270 | if (next == 0) { 271 | tokens[i].bits = 0; //dct_eob = "0" 272 | tokens[i].size = 1; 273 | } else { 274 | tokens[i].bits = 2; /* 0 = "10" */ 275 | tokens[i].size = 2; 276 | } 277 | } 278 | else if (coeff == 1) { 279 | tokens[i].bits = 6; /* 1 = "110" */ 280 | tokens[i].size = 3; 281 | } 282 | else if (coeff == 2) { 283 | tokens[i].bits = 28; /* 2 = "11100" */ 284 | tokens[i].size = 5; 285 | } 286 | else if (coeff == 3) { 287 | tokens[i].bits = 58; /* 3 = "111010" */ 288 | tokens[i].size = 6; 289 | } 290 | else if (coeff == 4) { 291 | tokens[i].bits = 59; /* 4 = "111011" */ 292 | tokens[i].size = 6; 293 | } 294 | else if (coeff <= 6) { 295 | tokens[i].bits = 60; /* cat1 = "111100" */ 296 | tokens[i].size = 6; /* range 5 - 6 (size 2) */ 297 | tokens[i].extra_bits = coeff - 5; 298 | tokens[i].extra_size = 1; 299 | //Pcat1 already assigned 300 | } 301 | else if (coeff <= 10) { 302 | tokens[i].bits = 61; /* cat2 = "111101" */ 303 | tokens[i].size = 6; /* 7 - 10 (4) */ 304 | tokens[i].extra_bits = coeff - 7; 305 | tokens[i].extra_size = 2; 306 | tokens[i].pcat = Pcat2; 307 | } 308 | else if (coeff <= 18) { 309 | 
tokens[i].bits = 124; /* cat3 = "1111100" */ 310 | tokens[i].size = 7; /* 11 - 18 (8) */ 311 | tokens[i].extra_bits = coeff - 11; 312 | tokens[i].extra_size = 3; 313 | tokens[i].pcat = Pcat3; 314 | } 315 | else if (coeff <= 34) { 316 | tokens[i].bits = 125; /* cat4 = "1111101" */ 317 | tokens[i].size = 7; /* 19 - 34 (16) */ 318 | tokens[i].extra_bits = coeff - 19; 319 | tokens[i].extra_size = 4; 320 | tokens[i].pcat = Pcat4; 321 | } 322 | else if (coeff <= 66) { 323 | tokens[i].bits = 126; /* cat5 = "1111110" */ 324 | tokens[i].size = 7; /* 35 - 66 (32) */ 325 | tokens[i].extra_bits = coeff - 35; 326 | tokens[i].extra_size = 5; 327 | tokens[i].pcat = Pcat5; 328 | } 329 | else { 330 | tokens[i].bits = 127; /* cat6 = "1111111" */ 331 | tokens[i].size = 7; /* 67 - 2048 (1982) */ 332 | tokens[i].extra_bits = coeff - 67; 333 | tokens[i].extra_size = 11; 334 | tokens[i].pcat = Pcat6; 335 | } 336 | next = tokens[i].bits; 337 | } 338 | return; 339 | } 340 | 341 | __kernel void encode_coefficients( __global macroblock *MBs, 342 | __global uchar *output, 343 | __global int *partition_sizes, 344 | __global uchar *third_context, 345 | __global uint *coeff_probs, 346 | __global uint *coeff_probs_denom, 347 | int mb_height, 348 | int mb_width, 349 | int num_partitions, 350 | int key_frame, 351 | int partition_step, 352 | int skip_prob) 353 | { 354 | int part_num = get_global_id(0); 355 | int mb_row, mb_num, mb_col, b_num; 356 | int first_context; 357 | vp8_bool_encoder vbe[1]; 358 | 359 | token tokens[16]; 360 | 361 | init_bool_encoder(vbe, output + partition_step*part_num); 362 | 363 | for (mb_row = part_num; mb_row < mb_height; mb_row+= num_partitions) 364 | { 365 | for (mb_col = 0; mb_col < mb_width; ++mb_col) 366 | { 367 | mb_num = mb_col + mb_row * mb_width; 368 | if (MBs[mb_num].non_zero_coeffs == 0) 369 | continue; 370 | if (MBs[mb_num].parts == are16x16) 371 | { 372 | first_context = 1; // for Y2 373 | tokenize_block(MBs, mb_num, 24, tokens); 374 | encode_block(vbe, 
coeff_probs, mb_num, 24, tokens, first_context, *(third_context + mb_num*25 + 24)); 375 | first_context = 0; //for Y, when Y2 exists 376 | } else { 377 | first_context = 3; //for Y, when Y2 is absent 378 | } 379 | // then always goes Y 380 | // 16 of them 381 | for (b_num = 0; b_num < 16; ++b_num) 382 | { 383 | tokenize_block(MBs, mb_num, b_num, tokens); 384 | encode_block(vbe, coeff_probs, mb_num, b_num, tokens, first_context, *(third_context + mb_num*25 + b_num)); 385 | } 386 | //now 8 U-blocks 387 | first_context = 2; // for all chromas 388 | for (b_num = 16; b_num < 20; ++b_num) 389 | { 390 | tokenize_block(MBs, mb_num, b_num, tokens); 391 | encode_block(vbe, coeff_probs, mb_num, b_num, tokens, first_context, *(third_context + mb_num*25 + b_num)); 392 | } 393 | //now 8 V-blocks 394 | for (b_num = 20; b_num < 24; ++b_num) 395 | { 396 | tokenize_block(MBs, mb_num, b_num, tokens); 397 | encode_block(vbe, coeff_probs, mb_num, b_num, tokens, first_context, *(third_context + mb_num*25 + b_num)); 398 | } 399 | } 400 | } 401 | flush_bool_encoder(vbe); 402 | partition_sizes[part_num] = vbe->count; 403 | 404 | return; 405 | } 406 | 407 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////// 408 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////// 409 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////// 410 | void tokenize_block_cut(__global macroblock *MBs, int mb_num, int b_num, token tokens[16]) 411 | { 412 | int next = 0; // imaginary 17th element 413 | int i; 414 | for (i = 15; i >= 0; --i) // tokenize block 415 | { 416 | int coeff = (int)MBs[mb_num].coeffs[b_num][i]; 417 | coeff = (coeff < 0) ? 
-coeff : coeff; 418 | if (coeff == 0) { 419 | if (next == 0) { 420 | tokens[i].bits = 0; //dct_eob = "0" 421 | tokens[i].size = 1; 422 | } else { 423 | tokens[i].bits = 2; /* 0 = "10" */ 424 | tokens[i].size = 2; 425 | } 426 | } 427 | else if (coeff == 1) { 428 | tokens[i].bits = 6; /* 1 = "110" */ 429 | tokens[i].size = 3; 430 | } 431 | else if (coeff == 2) { 432 | tokens[i].bits = 28; /* 2 = "11100" */ 433 | tokens[i].size = 5; 434 | } 435 | else if (coeff == 3) { 436 | tokens[i].bits = 58; /* 3 = "111010" */ 437 | tokens[i].size = 6; 438 | } 439 | else if (coeff == 4) { 440 | tokens[i].bits = 59; /* 4 = "111011" */ 441 | tokens[i].size = 6; 442 | } 443 | else if (coeff <= 6) { 444 | tokens[i].bits = 60; /* cat1 = "111100" */ 445 | tokens[i].size = 6; /* range 5 - 6 (size 2) */ 446 | } 447 | else if (coeff <= 10) { 448 | tokens[i].bits = 61; /* cat2 = "111101" */ 449 | tokens[i].size = 6; /* 7 - 10 (4) */ 450 | } 451 | else if (coeff <= 18) { 452 | tokens[i].bits = 124; /* cat3 = "1111100" */ 453 | tokens[i].size = 7; /* 11 - 18 (8) */ 454 | } 455 | else if (coeff <= 34) { 456 | tokens[i].bits = 125; /* cat4 = "1111101" */ 457 | tokens[i].size = 7; /* 19 - 34 (16) */ 458 | } 459 | else if (coeff <= 66) { 460 | tokens[i].bits = 126; /* cat5 = "1111110" */ 461 | tokens[i].size = 7; /* 35 - 66 (32) */ 462 | } 463 | else { 464 | tokens[i].bits = 127; /* cat6 = "1111111" */ 465 | tokens[i].size = 7; /* 67 - 2048 (1982) */ 466 | } 467 | next = tokens[i].bits; 468 | } 469 | return; 470 | } 471 | 472 | 473 | void count_probs_in_block( __global uint *const coeff_probs, 474 | __global uint *const coeff_probs_denom, 475 | const int part_num, 476 | const int mb_num, const int b_num, 477 | token *tokens, 478 | const int ctx1,const int in_ctx3) 479 | { 480 | // ctx1 = 0 for Y beggining at coefficient 1 (when y2 exists) 481 | // = 1 for Y2 482 | // = 2 for U or V 483 | // = 3 for Y beggining at coefficient 0 (when Y2 is absent) 484 | // ctx2 = coefficient position in block 
{(0), 1, 2, 3, ... 15} 485 | // chooses value from coeff_bands[16] = { 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7}; 486 | // ctx3 = for the first(second when ctx1 = 0) coefficient it is equal to number of nearby(above and left only counts) blocks 487 | // with non-zero coefficients 488 | // = for next coefficients it equals: to 0, when previous is zero: to 1, when previous is +1 or -1; to 2 in other cases 489 | // ctx4 = token tree position 490 | int ctx2; 491 | int ctx3 = in_ctx3; 492 | tree_index ctx4; 493 | 494 | int i = ((ctx1 == 0) ? 1 : 0); // maybe (!ctx1) 495 | int prev_is_zero = 0; 496 | 497 | for (; i < 16; ++i) 498 | { 499 | ctx2=coeff_bands[i]; 500 | 501 | // if previous coefficient was DCT_0, then current can't be EOB (inneficient to have 0,0,0,eob) 502 | // since EOB the only token, that has ZERO as highest bit 503 | // then ONE in first bit becomes implicit and doesn't require encoding 504 | 505 | // to handle this we must lower encoding bits size by 1 506 | // and tree_index at 2, instead of 0 (the route is tree[0+1]==2 when we encode "1") 507 | if (prev_is_zero) { 508 | ctx4 = 2; 509 | --(tokens[i].size); 510 | } else ctx4 = 0; 511 | 512 | //__constant Prob *const p = default_coeff_probs[ctx1][ctx2][ctx3]; 513 | 514 | do { 515 | const uchar b = (tokens[i].bits >> (--(tokens[i].size))) & 1; 516 | coeff_probs[((((part_num<<5) + (ctx1<<3)) + ctx2)*3 + ctx3)*11 + (ctx4>>1)] += (1 - b); //increase numerator when b == 0 517 | ++(coeff_probs_denom[((((part_num<<5) + (ctx1<<3)) + ctx2)*3 + ctx3)*11 + (ctx4>>1)]); // increase denominator 518 | ctx4 = coeff_tree[ctx4+b]; 519 | } while (tokens[i].size); 520 | 521 | ctx3 = 2; 522 | if (tokens[i].bits == 6) ctx3 = 1; /* DCT__1 = "110" */ 523 | if (tokens[i].bits == 2) { //DCT__0 == "10" 524 | prev_is_zero = 1; 525 | ctx3 = 0; 526 | } 527 | else { 528 | prev_is_zero = 0; 529 | } 530 | } 531 | return; 532 | } 533 | 534 | __kernel void count_probs( __global macroblock *MBs, 535 | __global uint *coeff_probs, 
536 | __global uint *coeff_probs_denom, 537 | __global uchar *third_context, 538 | int mb_height, 539 | int mb_width, 540 | int num_partitions, 541 | int key_frame, 542 | int partition_step) 543 | { 544 | int part_num = get_global_id(0); 545 | int mb_row, mb_num, mb_col, b_num; 546 | int prev_mb, prev_b; 547 | int i; 548 | int first_context, firstCoeff; 549 | token tokens[16]; 550 | 551 | { 552 | // we have to work with 1-dimensional array in global memory, so 553 | // coeff_probs[p][ctx1][ctx2][ctx3][ctx4] => *(coeff_probs + p*11*3*8*4 + ctx1*11*3*8 + ctx2*11*3 + ctx3*11 + ctx4) 554 | // or coeff_probs[((((p<<5) + (ctx1<<3)) + ctx2)*3 + ctx3)*11 + ctx4] 555 | int ctx1, ctx2, ctx3, ctx4; 556 | for (ctx1 = 0; ctx1 < 4; ++ctx1) 557 | for (ctx2 = 0; ctx2 < 8; ++ctx2) 558 | for (ctx3 = 0; ctx3 < 3; ++ctx3) 559 | for (ctx4 = 0; ctx4 < 11; ++ctx4) { 560 | coeff_probs[((((part_num<<5) + (ctx1<<3)) + ctx2)*3 + ctx3)*11 + ctx4] = 0; 561 | coeff_probs_denom[((((part_num<<5) + (ctx1<<3)) + ctx2)*3 + ctx3)*11 + ctx4] = 1; 562 | } 563 | } 564 | 565 | for (mb_row = part_num; mb_row < mb_height; mb_row+= num_partitions) 566 | { 567 | for (mb_col = 0; mb_col < mb_width; ++mb_col) 568 | { 569 | mb_num = mb_col + mb_row * mb_width; 570 | if (MBs[mb_num].non_zero_coeffs == 0) 571 | continue; 572 | if (MBs[mb_num].parts == are16x16) 573 | { 574 | first_context = 1; // for Y2 575 | *(third_context + mb_num*25 + 24) = 0; 576 | if (mb_row > 0) { // check if "above" Y2 has non zero 577 | // we go up until we find MB with Y2 mode enabled 578 | prev_mb = mb_num-mb_width; 579 | while(prev_mb>=0) { 580 | if (MBs[prev_mb].parts == are16x16) break; 581 | prev_mb-=mb_width; 582 | } 583 | if (prev_mb >= 0) 584 | for (i = 0; i < 16; ++i) { 585 | if (MBs[prev_mb].coeffs[24][i] != 0) { 586 | ++(*(third_context + mb_num*25 + 24)); 587 | break; 588 | } 589 | } 590 | } 591 | if (mb_col > 0) { // check if "left" Y2 has non zero 592 | prev_mb = mb_num-1; 593 | while (prev_mb >= (mb_row*mb_width)) { 594 | 
if (MBs[prev_mb].parts == are16x16) break; 595 | --prev_mb; 596 | } 597 | if (prev_mb >= (mb_row*mb_width)) 598 | for (i = 0; i < 16; ++i) { 599 | if (MBs[prev_mb].coeffs[24][i] != 0) { 600 | ++(*(third_context + mb_num*25 + 24)); 601 | break; 602 | } 603 | } 604 | } 605 | tokenize_block_cut(MBs, mb_num, 24, tokens); 606 | count_probs_in_block(coeff_probs, coeff_probs_denom, part_num, mb_num, 24, tokens, first_context, *(third_context + mb_num*25 + 24)); 607 | first_context = 0; //for Y, when Y2 exists 608 | } else first_context = 3; //for Y, when Y2 is absent 609 | // then always goes Y 610 | // 16 of them 611 | for (b_num = 0; b_num < 16; ++b_num) 612 | { 613 | *(third_context + mb_num*25 + b_num) = 0; 614 | // look above: 615 | prev_mb = -1; // as flag, that above is empty 616 | if ((b_num >> 2) > 0) { // /4 617 | prev_mb = mb_num; 618 | prev_b = b_num - 4; 619 | } 620 | else if (mb_row > 0) { 621 | prev_mb = mb_num - mb_width; 622 | prev_b = b_num + 12; 623 | } 624 | if (prev_mb >= 0) { 625 | firstCoeff = (MBs[prev_mb].parts == are16x16) ? 1 : 0; 626 | for (i = firstCoeff; i < 16; ++i) { 627 | if (MBs[prev_mb].coeffs[prev_b][i] != 0) { 628 | ++(*(third_context + mb_num*25 + b_num)); 629 | break; 630 | } 631 | } 632 | } 633 | // look to the left 634 | prev_mb = -1; 635 | if ((b_num & 3) > 0) { // %4 636 | prev_mb = mb_num; 637 | prev_b = b_num - 1; 638 | } 639 | else if (mb_col > 0) { 640 | prev_mb = mb_num - 1; 641 | prev_b = b_num + 3; 642 | } 643 | if (prev_mb >= 0) { 644 | firstCoeff = (MBs[prev_mb].parts == are16x16) ? 
1 : 0; 645 | for (i = firstCoeff; i < 16; ++i) { 646 | if (MBs[prev_mb].coeffs[prev_b][i] != 0) { 647 | ++(*(third_context + mb_num*25 + b_num)); 648 | break; 649 | } 650 | } 651 | } 652 | tokenize_block_cut(MBs, mb_num, b_num, tokens); 653 | count_probs_in_block(coeff_probs, coeff_probs_denom, part_num, mb_num, b_num, tokens, first_context, *(third_context + mb_num*25 + b_num)); 654 | } 655 | //now 8 U-blocks 656 | first_context = 2; // for all chromas 657 | for (b_num = 16; b_num < 20; ++b_num) 658 | { 659 | *(third_context + mb_num*25 + b_num) = 0; 660 | // look above: 661 | prev_mb = -1; // as flag, that above is empty 662 | if (((b_num-16) >> 1) > 0) { // /2 663 | prev_mb = mb_num; 664 | prev_b = b_num - 2; 665 | } 666 | else if (mb_row > 0) { 667 | prev_mb = mb_num - mb_width; 668 | prev_b = b_num + 2; 669 | } 670 | if (prev_mb >= 0) { 671 | for (i = 0; i < 16; ++i) { 672 | if (MBs[prev_mb].coeffs[prev_b][i] != 0) { 673 | ++(*(third_context + mb_num*25 + b_num)); 674 | break; 675 | } 676 | } 677 | } 678 | // look to the left 679 | prev_mb = -1; 680 | if (((b_num-16) & 1) > 0) { // %2 681 | prev_mb = mb_num; 682 | prev_b = b_num - 1; 683 | } 684 | else if (mb_col > 0) { 685 | prev_mb = mb_num - 1; 686 | prev_b = b_num + 1; 687 | } 688 | if (prev_mb >= 0) { 689 | for (i = 0; i < 16; ++i) { 690 | if (MBs[prev_mb].coeffs[prev_b][i] != 0) { 691 | ++(*(third_context + mb_num*25 + b_num)); 692 | break; 693 | } 694 | } 695 | } 696 | tokenize_block_cut(MBs, mb_num, b_num, tokens); 697 | count_probs_in_block(coeff_probs, coeff_probs_denom, part_num, mb_num, b_num, tokens, first_context, *(third_context + mb_num*25 + b_num)); 698 | } 699 | //now 8 V-blocks 700 | for (b_num = 20; b_num < 24; ++b_num) 701 | { 702 | *(third_context + mb_num*25 + b_num) = 0; 703 | // look above: 704 | prev_mb = -1; // as flag, that above is empty 705 | if (((b_num-20) >> 1) > 0) { // /2 706 | prev_mb = mb_num; 707 | prev_b = b_num - 2; 708 | } 709 | else if (mb_row > 0) { 710 | prev_mb = 
mb_num - mb_width; 711 | prev_b = b_num + 2; 712 | } 713 | if (prev_mb >= 0) { 714 | for (i = 0; i < 16; ++i) { 715 | if (MBs[prev_mb].coeffs[prev_b][i] != 0) { 716 | ++(*(third_context + mb_num*25 + b_num)); 717 | break; 718 | } 719 | } 720 | } 721 | // look to the left 722 | prev_mb = -1; 723 | if (((b_num-20) & 1) > 0) { // %2 724 | prev_mb = mb_num; 725 | prev_b = b_num - 1; 726 | } 727 | else if (mb_col > 0) { 728 | prev_mb = mb_num - 1; 729 | prev_b = b_num + 1; 730 | } 731 | if (prev_mb >= 0) { 732 | for (i = 0; i < 16; ++i) { 733 | if (MBs[prev_mb].coeffs[prev_b][i] != 0) { 734 | ++(*(third_context + mb_num*25 + b_num)); 735 | break; 736 | } 737 | } 738 | } 739 | tokenize_block_cut(MBs, mb_num, b_num, tokens); 740 | count_probs_in_block(coeff_probs, coeff_probs_denom, part_num, mb_num, b_num, tokens, first_context, *(third_context + mb_num*25 + b_num)); 741 | } 742 | } 743 | 744 | } 745 | 746 | return; 747 | } 748 | 749 | __kernel void num_div_denom(__global uint *coeff_probs, 750 | __global uint *coeff_probs_denom, 751 | int num_partitions) 752 | { 753 | int part_num = get_global_id(0); 754 | int ctx1, ctx2, ctx3, ctx4, p; 755 | uint num, denom; 756 | for (ctx1 = 0; ctx1 < 4; ++ctx1) 757 | for (ctx2 = part_num; ctx2 < 8; ctx2 += num_partitions) 758 | for (ctx3 = 0; ctx3 < 3; ++ctx3) 759 | for (ctx4 = 0; ctx4 < 11; ++ctx4) { 760 | num = 0; 761 | denom = 0; 762 | for (p = 0; p < num_partitions; ++p) { 763 | num += coeff_probs[((((p<<5) + (ctx1<<3)) + ctx2)*3 + ctx3)*11 + ctx4]; 764 | denom += coeff_probs_denom[((((p<<5) + (ctx1<<3)) + ctx2)*3 + ctx3)*11 + ctx4]; 765 | } 766 | num = (num << 8) / denom; 767 | coeff_probs[(((ctx1<<3) + ctx2)*3 + ctx3)*11 + ctx4] = (num > 255) ? 255 : ((num == 0) ? 
1 : num); 768 | } 769 | return; 770 | } 771 | 772 | 773 | #ifdef LOOP_FILTER 774 | __kernel void prepare_filter_mask(__global macroblock *const MBs, //0 775 | __global int *const mb_mask, //1 776 | const int width, //2 777 | const int height, //3 778 | const int parts) //4 779 | { 780 | __private int mb_num, b_num, mb_row, mb_col, mb_height, mb_width, i, mask, coeffs, split_mode; 781 | mb_height = height/16; 782 | mb_width = width/16; 783 | 784 | for (mb_row = get_global_id(0); mb_row < mb_height; mb_row += parts) 785 | { 786 | for (mb_col = 0; mb_col < mb_width; ++mb_col) 787 | { 788 | mb_num = mb_row * mb_width + mb_col; 789 | //printf((__constant char*)"%d\n",mb_num); 790 | mask = 0; coeffs = 0; split_mode = MBs[mb_num].parts; 791 | for (b_num = 0; b_num < 16; ++b_num) { 792 | for (i = 1; i < 16; ++i) { 793 | coeffs += (int)abs(MBs[mb_num].coeffs[b_num][i]); 794 | } 795 | } 796 | for (b_num = 16; b_num < 24; ++b_num) { 797 | for (i = 0; i < 16; ++i) { 798 | coeffs += (int)abs(MBs[mb_num].coeffs[b_num][i]); 799 | } 800 | } 801 | if (split_mode == are16x16) { 802 | for (i = 0; i < 16; ++i) { 803 | coeffs += (int)abs(MBs[mb_num].coeffs[24][i]); 804 | } 805 | } 806 | else { 807 | for (b_num = 0; b_num < 16; ++b_num) { 808 | coeffs += (int)abs(MBs[mb_num].coeffs[b_num][0]); 809 | } 810 | } 811 | 812 | MBs[mb_num].non_zero_coeffs = coeffs; 813 | mask = ((split_mode != are16x16) || (coeffs > 0)) ? 
-1 : 0; 814 | mb_mask[mb_num] = mask; 815 | } 816 | } 817 | return; 818 | } 819 | 820 | void filter_mb_edge(short8 *const p3, short8 *const p2, short8 *const p1, short8 *const p0, 821 | short8 *const q0, short8 *const q1, short8 *const q2, short8 *const q3, 822 | ushort mb_lim, ushort int_lim, ushort hev_thr) 823 | { 824 | short8 mask, hev, a, b, w; 825 | 826 | mask = (abs(*p3 - *p2) > int_lim); 827 | mask |= (abs(*p2 - *p1) > int_lim); 828 | mask |= (abs(*p1 - *p0) > int_lim); 829 | mask |= (abs(*q1 - *q0) > int_lim); 830 | mask |= (abs(*q2 - *q1) > int_lim); 831 | mask |= (abs(*q3 - *q2) > int_lim); 832 | mask |= ((abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2) > mb_lim); 833 | mask = ~mask; // for vectors in OpenCL TRUE means -1 (all bits set) 834 | hev = (abs(*p1 - *p0) > hev_thr); 835 | hev |= (abs(*q1 - *q0) > hev_thr); 836 | //w = clamp128(clamp128(p1 - q1) + 3*(q0 - p0)); 837 | w = *p1 - *q1; 838 | w = select(w,-128,w<-128); 839 | w = select(w,127,w>127); 840 | w += (*q0 - *p0) * (short)3; 841 | w = select(w,-128,w<-128); 842 | w = select(w,127,w>127); 843 | w &= mask; 844 | a = w & hev; 845 | // b = clamp128(a+3) >> 3 846 | b = a + 3; 847 | b = select(b,-128,b<-128); 848 | b = select(b,127,b>127); 849 | b >>= 3; 850 | // a = clamp128(a+4) >> 3 851 | a = a + 4; 852 | a = select(a,-128,a<-128); 853 | a = select(a,127,a>127); 854 | a >>= 3; 855 | *q0 -= a; *p0 += b; 856 | w &= ~hev; 857 | //a = clamp128((27*w + 63) >> 7); 858 | a = (w * (short)27 + (short)63) >> 7; 859 | a = select(a,-128,a<-128); 860 | a = select(a,127,a>127); 861 | *q0 -= a; *p0 += a; 862 | //a = clamp128((18*w + 63) >> 7); 863 | a = (w * (short)18 + (short)63) >> 7; 864 | a = select(a,-128,a<-128); 865 | a = select(a,127,a>127); 866 | *q1 -= a; *p1 += a; 867 | //a = clamp128((9*w + 63) >> 7); 868 | a = (w * (short)9 + (short)63) >> 7; 869 | a = select(a,-128,a<-128); 870 | a = select(a,127,a>127); 871 | *q2 -= a; *p2 += a; 872 | 873 | return; 874 | } 875 | 876 | void filter_b_edge(short8 *const 
p3, short8 *const p2, short8 *const p1, short8 *const p0, 877 | short8 *const q0, short8 *const q1, short8 *const q2, short8 *const q3, 878 | ushort b_lim, ushort int_lim, ushort hev_thr) 879 | { 880 | short8 mask, hev, a, b; 881 | 882 | mask = (abs(*p3 - *p2) > int_lim); 883 | mask |= (abs(*p2 - *p1) > int_lim); 884 | mask |= (abs(*p1 - *p0) > int_lim); 885 | mask |= (abs(*q1 - *q0) > int_lim); 886 | mask |= (abs(*q2 - *q1) > int_lim); 887 | mask |= (abs(*q3 - *q2) > int_lim); 888 | mask |= ((abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2) > b_lim); 889 | mask = ~mask; // for vectors in OpenCL TRUE means -1 (all bits set) 890 | hev = (abs(*p1 - *p0) > hev_thr); 891 | hev |= (abs(*q1 - *q0) > hev_thr); 892 | //a = clamp128((use_outer_taps? clamp128(p1 - q1) : 0) + 3*(q0 - p0)); 893 | a = *p1 - *q1; 894 | a = select(a,-128,a<-128); 895 | a = select(a,127,a>127); 896 | a &= hev; 897 | a += (*q0 - *p0) * (short)3; 898 | a = select(a,-128,a<-128); 899 | a = select(a,127,a>127); 900 | a &= mask; 901 | // b = clamp128(a+3) >> 3 902 | b = a + 3; 903 | b = select(b,-128,b<-128); 904 | b = select(b,127,b>127); 905 | b >>= 3; 906 | // a = clamp128(a+4) >> 3 907 | a = a + 4; 908 | a = select(a,-128,a<-128); 909 | a = select(a,127,a>127); 910 | a >>= 3; 911 | *q0 -= a; *p0 += b; 912 | a = (a + 1) >> 1; 913 | a &= ~hev; 914 | *q1 -= a; *p1 += a; 915 | 916 | return; 917 | } 918 | 919 | void read8p(__global uchar *const frame, const int pos, const int step, short8 *const V) 920 | { 921 | int i = pos; 922 | (*V).s0 = (short)frame[i] - 128; i += step; 923 | (*V).s1 = (short)frame[i] - 128; i += step; 924 | (*V).s2 = (short)frame[i] - 128; i += step; 925 | (*V).s3 = (short)frame[i] - 128; i += step; 926 | (*V).s4 = (short)frame[i] - 128; i += step; 927 | (*V).s5 = (short)frame[i] - 128; i += step; 928 | (*V).s6 = (short)frame[i] - 128; i += step; 929 | (*V).s7 = (short)frame[i] - 128; 930 | return; 931 | } 932 | 933 | void write8p(__global uchar *const frame, const int pos, const int step, 
short8 *const V) 934 | { 935 | int i = pos; 936 | uchar8 buf; 937 | buf = convert_uchar8_sat(*V + 128); 938 | frame[i] = buf.s0; i += step; 939 | frame[i] = buf.s1; i += step; 940 | frame[i] = buf.s2; i += step; 941 | frame[i] = buf.s3; i += step; 942 | frame[i] = buf.s4; i += step; 943 | frame[i] = buf.s5; i += step; 944 | frame[i] = buf.s6; i += step; 945 | frame[i] = buf.s7; 946 | return; 947 | } 948 | 949 | __kernel void loop_filter_frame(__global uchar *const frame, //0 950 | __global macroblock *const MBs, //1 951 | __global int *const mb_mask, //2 952 | __constant const segment_data *const SD, //3 953 | const int width, //4 954 | const int height, //5 955 | const int mb_size) //6 956 | { 957 | __private int mb_num, mb_width, mb_count; 958 | __private int x0,y0,x,y,i; 959 | __private short int_lim, mb_lim, b_lim, hev_thr; 960 | __private short8 p3,p2,p1,p0,q0,q1,q2,q3; 961 | 962 | if (get_global_id(0) != 0) return; //there can be only one 963 | 964 | mb_width = width/mb_size; 965 | mb_count = mb_width*(height/mb_size); 966 | 967 | for (mb_num = 0; mb_num < mb_count; ++mb_num) 968 | { 969 | i = MBs[mb_num].segment_id; 970 | if (SD[i].loop_filter_level == 0) return; 971 | int_lim = (short)SD[i].interior_limit; 972 | mb_lim = (short)SD[i].mbedge_limit; 973 | b_lim = (short)SD[i].sub_bedge_limit; 974 | hev_thr = (short)SD[i].hev_threshold; 975 | 976 | x0 = (mb_num%mb_width)*mb_size; 977 | y0 = (mb_num/mb_width)*mb_size; 978 | 979 | // horizontal 980 | for (y = y0; (y-y0) < mb_size; y += 8) 981 | { 982 | x = x0; 983 | i = y * width + x; read8p(frame,i,width,&q0); 984 | ++i; read8p(frame,i,width,&q1); 985 | ++i; read8p(frame,i,width,&q2); 986 | ++i; read8p(frame,i,width,&q3); 987 | if (x0>0) 988 | { 989 | i = y * width + x-4; read8p(frame,i,width,&p3); 990 | ++i; read8p(frame,i,width,&p2); 991 | ++i; read8p(frame,i,width,&p1); 992 | ++i; read8p(frame,i,width,&p0); 993 | filter_mb_edge(&p3,&p2,&p1,&p0,&q0,&q1,&q2,&q3,mb_lim,int_lim,hev_thr); 994 | i = y * width + 
x-3; write8p(frame,i,width,&p2); 995 | ++i; write8p(frame,i,width,&p1); 996 | ++i; write8p(frame,i,width,&p0); 997 | ++i; write8p(frame,i,width,&q0); 998 | ++i; write8p(frame,i,width,&q1); 999 | ++i; write8p(frame,i,width,&q2); 1000 | } 1001 | 1002 | for (x = x0 + 4; ((x-x0) < mb_size) && (mb_mask[mb_num]); x += 4) 1003 | { 1004 | p3 = q0; p2 = q1; p1 = q2; p0 = q3; 1005 | i = y * width + x; read8p(frame,i,width,&q0); 1006 | ++i; read8p(frame,i,width,&q1); 1007 | ++i; read8p(frame,i,width,&q2); 1008 | ++i; read8p(frame,i,width,&q3); 1009 | filter_b_edge(&p3,&p2,&p1,&p0,&q0,&q1,&q2,&q3,b_lim,int_lim,hev_thr); 1010 | i = y * width + x-2; write8p(frame,i,width,&p1); 1011 | ++i; write8p(frame,i,width,&p0); 1012 | ++i; write8p(frame,i,width,&q0); 1013 | ++i; write8p(frame,i,width,&q1); 1014 | } 1015 | } 1016 | 1017 | // vertically 1018 | 1019 | for (x = x0; (x-x0) < mb_size; x += 8) 1020 | { 1021 | y = y0; 1022 | i = y * width + x; q0 = convert_short8(vload8(0, frame + i)) - 128; 1023 | i += width; q1 = convert_short8(vload8(0, frame + i)) - 128; 1024 | i += width; q2 = convert_short8(vload8(0, frame + i)) - 128; 1025 | i += width; q3 = convert_short8(vload8(0, frame + i)) - 128; 1026 | if (y0 > 0) 1027 | { 1028 | i = (y-4) * width + x; p3 = convert_short8(vload8(0, frame + i)) - 128; 1029 | i += width; p2 = convert_short8(vload8(0, frame + i)) - 128; 1030 | i += width; p1 = convert_short8(vload8(0, frame + i)) - 128; 1031 | i += width; p0 = convert_short8(vload8(0, frame + i)) - 128; 1032 | filter_mb_edge(&p3,&p2,&p1,&p0,&q0,&q1,&q2,&q3,mb_lim,int_lim,hev_thr); 1033 | i = (y-3) * width + x; vstore8(convert_uchar8_sat(p2 + 128),0,frame + i); 1034 | i += width; vstore8(convert_uchar8_sat(p1 + 128), 0, frame + i); 1035 | i += width; vstore8(convert_uchar8_sat(p0 + 128), 0, frame + i); 1036 | i += width; vstore8(convert_uchar8_sat(q0 + 128), 0, frame + i); 1037 | i += width; vstore8(convert_uchar8_sat(q1 + 128), 0, frame + i); 1038 | i += width; 
vstore8(convert_uchar8_sat(q2 + 128), 0, frame + i); 1039 | } 1040 | 1041 | for (y = y0 + 4; ((y - y0) < mb_size) && (mb_mask[mb_num]); y += 4) 1042 | { 1043 | p3 = q0; p2 = q1; p1 = q2; p0 = q3; 1044 | i = y * width + x; q0 = convert_short8(vload8(0, frame + i)) - 128; 1045 | i += width; q1 = convert_short8(vload8(0, frame + i)) - 128; 1046 | i += width; q2 = convert_short8(vload8(0, frame + i)) - 128; 1047 | i += width; q3 = convert_short8(vload8(0, frame + i)) - 128; 1048 | filter_b_edge(&p3,&p2,&p1,&p0,&q0,&q1,&q2,&q3,b_lim,int_lim,hev_thr); 1049 | i = (y-2) * width + x; vstore8(convert_uchar8_sat(p1 + 128), 0, frame + i); 1050 | i += width; vstore8(convert_uchar8_sat(p0 + 128), 0, frame + i); 1051 | i += width; vstore8(convert_uchar8_sat(q0 + 128), 0, frame + i); 1052 | i += width; vstore8(convert_uchar8_sat(q1 + 128), 0, frame + i); 1053 | } 1054 | } 1055 | 1056 | } 1057 | 1058 | } 1059 | #endif 1060 | -------------------------------------------------------------------------------- /test/CPU_kernels.cl: -------------------------------------------------------------------------------- 1 | #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable 2 | //#pragma OPENCL EXTENSION cl_amd_printf : enable 3 | 4 | typedef short int16_t; 5 | typedef int int32_t; 6 | typedef unsigned int uint32_t; 7 | typedef unsigned char uint8_t; 8 | typedef signed char int8_t; 9 | typedef unsigned short uint16_t; 10 | typedef uint8_t Prob; 11 | typedef int8_t tree_index; 12 | typedef const tree_index Tree[]; 13 | 14 | typedef enum { 15 | are16x16 = 0, 16 | are8x8 = 1, 17 | are4x4 = 2 18 | } partition_mode; 19 | 20 | typedef enum { 21 | intra_segment = 0, 22 | UQ_segment = 0, 23 | HQ_segment = 1, 24 | AQ_segment = 2, 25 | LQ_segment = 3 26 | } segment_ids; 27 | 28 | typedef enum { 29 | LAST = 0, 30 | GOLDEN = 1, 31 | ALTREF = 2 32 | } ref_frame; 33 | 34 | typedef struct { 35 | int16_t coeffs[25][16]; 36 | int32_t vector_x[4]; 37 | int32_t vector_y[4]; 38 | float SSIM; 39 | int 
non_zero_coeffs; 40 | int parts; 41 | int reference_frame; 42 | segment_ids segment_id; 43 | } macroblock; 44 | 45 | typedef struct { 46 | int y_ac_i; 47 | int y_dc_idelta; 48 | int y2_dc_idelta; 49 | int y2_ac_idelta; 50 | int uv_dc_idelta; 51 | int uv_ac_idelta; 52 | int loop_filter_level; 53 | int mbedge_limit; 54 | int sub_bedge_limit; 55 | int interior_limit; 56 | int hev_threshold; 57 | } segment_data; 58 | 59 | typedef struct { 60 | __global uint8_t *output; /* ptr to next byte to be written */ 61 | uint32_t range; /* 128 <= range <= 255 */ 62 | uint32_t bottom; /* minimum value of remaining output */ 63 | int32_t bit_count; /* # of shifts before an output byte is available */ 64 | uint32_t count; 65 | } vp8_bool_encoder; 66 | 67 | void init_bool_encoder(vp8_bool_encoder *e, __global uint8_t *start_partition) 68 | { 69 | e->output = start_partition; 70 | e->range = 255; 71 | e->bottom = 0; 72 | e->bit_count = 24; 73 | e->count = 0; 74 | } 75 | 76 | void add_one_to_output(__global uint8_t *q) 77 | { 78 | while( *--q == 255) 79 | *q = 0; 80 | ++*q; 81 | } 82 | 83 | void write_bool(vp8_bool_encoder *e, int prob, int bool_value) 84 | { 85 | /* split is approximately (range * prob) / 256 and, crucially, 86 | is strictly bigger than zero and strictly smaller than range */ 87 | uint32_t split = 1 + ( ((e->range - 1) * prob) >> 8); 88 | if( bool_value) { 89 | e->bottom += split; /* move up bottom of interval */ 90 | e->range -= split; /* with corresponding decrease in range */ 91 | } else 92 | e->range = split; 93 | while( e->range < 128) 94 | { 95 | e->range <<= 1; 96 | if( e->bottom & ((uint32_t)1 << 31)) {/* detect carry */ 97 | add_one_to_output(e->output); 98 | } 99 | e->bottom <<= 1; 100 | if( !--e->bit_count) { 101 | *e->output++ = (uint8_t) (e->bottom >> 24); 102 | e->count++; 103 | e->bottom &= (1 << 24) - 1; 104 | e->bit_count = 8; 105 | } 106 | } 107 | } 108 | 109 | void write_flag(vp8_bool_encoder *e, int b) 110 | { 111 | write_bool(e, 128, (b)?1:0); 112 
| } 113 | 114 | void write_literal(vp8_bool_encoder *e, int i, int size) 115 | { 116 | int mask = 1 << (size - 1); 117 | while (mask) 118 | { 119 | write_flag(e, !((i & mask) == 0)); 120 | mask >>= 1; 121 | } 122 | } 123 | 124 | void flush_bool_encoder(vp8_bool_encoder *e) 125 | { 126 | int c = e->bit_count; 127 | uint32_t v = e->bottom; 128 | if( v & (1 << (32 - c))) 129 | add_one_to_output(e->output); 130 | v <<= c & 7; 131 | c >>= 3; 132 | while( --c >= 0) 133 | v <<= 8; 134 | c = 4; 135 | while( --c >= 0) { 136 | /* write remaining data, possibly padded */ 137 | *e->output++ = (uint8_t) (v >> 24); 138 | e->count++; 139 | v <<= 8; 140 | } 141 | } 142 | 143 | typedef enum 144 | { DCT_0, /* value 0 */ 145 | DCT_1, /* 1 */ 146 | DCT_2, /* 2 */ 147 | DCT_3, /* 3 */ 148 | DCT_4, /* 4 */ 149 | dct_cat1, /* range 5 - 6 (size 2) */ 150 | dct_cat2, /* 7 - 10 (4) */ 151 | dct_cat3, /* 11 - 18 (8) */ 152 | dct_cat4, /* 19 - 34 (16) */ 153 | dct_cat5, /* 35 - 66 (32) */ 154 | dct_cat6, /* 67 - 2048 (1982) */ 155 | dct_eob, /* end of block */ 156 | num_dct_tokens /* 12 */ 157 | } dct_token; 158 | typedef struct { 159 | int sign; 160 | int bits; 161 | int size; 162 | int extra_bits; 163 | int extra_size; 164 | __constant Prob* pcat; 165 | } token; 166 | 167 | __constant tree_index coeff_tree [2 * (num_dct_tokens - 1)] = { -dct_eob, 2, /* eob = "0" */ 168 | -DCT_0, 4, /* 0 = "10" */ 169 | -DCT_1, 6, /* 1 = "110" */ 170 | 8, 12, 171 | -DCT_2, 10, /* 2 = "11100" */ 172 | -DCT_3, -DCT_4, /* 3 = "111010", 4 = "111011" */ 173 | 14, 16, 174 | -dct_cat1, -dct_cat2, /* cat1 = "111100", cat2 = "111101" */ 175 | 18, 20, 176 | -dct_cat3, -dct_cat4, /* cat3 = "1111100", cat4 = "1111101" */ 177 | -dct_cat5, -dct_cat6 /* cat5 = "1111110", cat6 = "1111111" */ 178 | }; 179 | __constant Prob Pcat1[] = { 159, 0}; 180 | __constant Prob Pcat2[] = { 165, 145, 0}; 181 | __constant Prob Pcat3[] = { 173, 148, 140, 0}; 182 | __constant Prob Pcat4[] = { 176, 155, 140, 135, 0}; 183 | __constant Prob 
Pcat5[] = { 180, 157, 141, 134, 130, 0}; 184 | __constant Prob Pcat6[] = { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0}; 185 | __constant int coeff_bands[16] = { 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7}; 186 | 187 | void encode_block(vp8_bool_encoder *vbe, __global uint *coeff_probs, int mb_num, int b_num, token *tokens, int ctx1, uchar ctx3) 188 | { 189 | // ctx1 = 0 for Y beggining at coefficient 1 (when y2 exists) 190 | // = 1 for Y2 191 | // = 2 for U or V 192 | // = 3 for Y beggining at coefficient 0 (when Y2 is absent) 193 | // ctx2 = coefficient position in block {(0), 1, 2, 3, ... 15} 194 | // chooses value from coeff_bands[16] = { 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7}; 195 | // ctx3 = for the first(second when ctx1 = 0) coefficient it is equal to number of nearby(above and left only counts) blocks 196 | // with non-zero coefficients 197 | // = for next coefficients it equals: to 0, when previous is zero: to 1, when previous is +1 or -1; to 2 in other cases 198 | // ctx4 = token tree position 199 | int ctx2; 200 | tree_index ctx4; 201 | 202 | int i = ((ctx1 == 0) ? 
1 : 0); // maybe (!ctx1) 203 | int prev_is_zero = 0; 204 | 205 | for (; i < 16; ++i) 206 | { 207 | ctx2=coeff_bands[i]; 208 | 209 | // if previous coefficient was DCT_0, then current can't be EOB (inneficient to have 0,0,0,eob) 210 | // since EOB the only token, that has ZERO as highest bit 211 | // then ONE in first bit becomes implicit and doesn't require encoding 212 | 213 | // to handle this we must lower encoding bits size by 1 214 | // and tree_index at 2, instead of 0 (the route is tree[0+1]==2 when we encode "1") 215 | if (prev_is_zero) { 216 | ctx4 = 2; 217 | --(tokens[i].size); 218 | } else ctx4 = 0; 219 | 220 | do { 221 | const int b = (tokens[i].bits >> (--(tokens[i].size))) & 1; 222 | write_bool(vbe, (uchar)coeff_probs[(((ctx1<<3) + ctx2)*3 + ctx3)*11 + (ctx4>>1)], b); 223 | ctx4 = coeff_tree[ctx4+b]; 224 | } while (tokens[i].size); 225 | 226 | if (tokens[i].bits == 0) return; // EOB == "0" 227 | 228 | //now we maybe we have extra bits to encode (if previously dct_catx was encoded) 229 | if (tokens[i].extra_size > 0) 230 | { 231 | int mask = 1 << (tokens[i].extra_size-1); 232 | int j = 0; 233 | while (tokens[i].pcat[j]) 234 | { 235 | write_bool(vbe, tokens[i].pcat[j], (tokens[i].extra_bits & mask) ? 1 : 0); 236 | ++j; 237 | mask >>= 1; 238 | } 239 | } 240 | 241 | ctx3 = 2; 242 | if (tokens[i].bits == 6) ctx3 = 1; /* DCT__1 = "110" */ 243 | if (tokens[i].bits == 2) { //DCT__0 == "10" 244 | prev_is_zero = 1; 245 | ctx3 = 0; 246 | } 247 | else { 248 | write_bool(vbe, 128, tokens[i].sign); //sign 249 | prev_is_zero = 0; 250 | } 251 | 252 | } 253 | 254 | return; 255 | } 256 | 257 | void tokenize_block(__global macroblock *MBs, int mb_num, int b_num, token tokens[16]) //IF-ELSE 258 | { 259 | int next = 0; // imaginary 17th element 260 | int i; 261 | for (i = 15; i >= 0; --i) // tokenize block 262 | { 263 | int coeff = (int)MBs[mb_num].coeffs[b_num][i]; 264 | tokens[i].sign = (coeff < 0) ? 1 : 0; 265 | coeff = (coeff < 0) ? 
-coeff : coeff; 266 | tokens[i].extra_bits = 0; 267 | tokens[i].extra_size = 0; 268 | tokens[i].pcat = Pcat1; 269 | if (coeff == 0) { 270 | if (next == 0) { 271 | tokens[i].bits = 0; //dct_eob = "0" 272 | tokens[i].size = 1; 273 | } else { 274 | tokens[i].bits = 2; /* 0 = "10" */ 275 | tokens[i].size = 2; 276 | } 277 | } 278 | else if (coeff == 1) { 279 | tokens[i].bits = 6; /* 1 = "110" */ 280 | tokens[i].size = 3; 281 | } 282 | else if (coeff == 2) { 283 | tokens[i].bits = 28; /* 2 = "11100" */ 284 | tokens[i].size = 5; 285 | } 286 | else if (coeff == 3) { 287 | tokens[i].bits = 58; /* 3 = "111010" */ 288 | tokens[i].size = 6; 289 | } 290 | else if (coeff == 4) { 291 | tokens[i].bits = 59; /* 4 = "111011" */ 292 | tokens[i].size = 6; 293 | } 294 | else if (coeff <= 6) { 295 | tokens[i].bits = 60; /* cat1 = "111100" */ 296 | tokens[i].size = 6; /* range 5 - 6 (size 2) */ 297 | tokens[i].extra_bits = coeff - 5; 298 | tokens[i].extra_size = 1; 299 | //Pcat1 already assigned 300 | } 301 | else if (coeff <= 10) { 302 | tokens[i].bits = 61; /* cat2 = "111101" */ 303 | tokens[i].size = 6; /* 7 - 10 (4) */ 304 | tokens[i].extra_bits = coeff - 7; 305 | tokens[i].extra_size = 2; 306 | tokens[i].pcat = Pcat2; 307 | } 308 | else if (coeff <= 18) { 309 | tokens[i].bits = 124; /* cat3 = "1111100" */ 310 | tokens[i].size = 7; /* 11 - 18 (8) */ 311 | tokens[i].extra_bits = coeff - 11; 312 | tokens[i].extra_size = 3; 313 | tokens[i].pcat = Pcat3; 314 | } 315 | else if (coeff <= 34) { 316 | tokens[i].bits = 125; /* cat4 = "1111101" */ 317 | tokens[i].size = 7; /* 19 - 34 (16) */ 318 | tokens[i].extra_bits = coeff - 19; 319 | tokens[i].extra_size = 4; 320 | tokens[i].pcat = Pcat4; 321 | } 322 | else if (coeff <= 66) { 323 | tokens[i].bits = 126; /* cat5 = "1111110" */ 324 | tokens[i].size = 7; /* 35 - 66 (32) */ 325 | tokens[i].extra_bits = coeff - 35; 326 | tokens[i].extra_size = 5; 327 | tokens[i].pcat = Pcat5; 328 | } 329 | else { 330 | tokens[i].bits = 127; /* cat6 = "1111111" 
*/ 331 | tokens[i].size = 7; /* 67 - 2048 (1982) */ 332 | tokens[i].extra_bits = coeff - 67; 333 | tokens[i].extra_size = 11; 334 | tokens[i].pcat = Pcat6; 335 | } 336 | next = tokens[i].bits; 337 | } 338 | return; 339 | } 340 | 341 | __kernel void encode_coefficients( __global macroblock *MBs, 342 | __global uchar *output, 343 | __global int *partition_sizes, 344 | __global uchar *third_context, 345 | __global uint *coeff_probs, 346 | __global uint *coeff_probs_denom, 347 | int mb_height, 348 | int mb_width, 349 | int num_partitions, 350 | int key_frame, 351 | int partition_step, 352 | int skip_prob) 353 | { 354 | int part_num = get_global_id(0); 355 | int mb_row, mb_num, mb_col, b_num; 356 | int first_context; 357 | vp8_bool_encoder vbe[1]; 358 | 359 | token tokens[16]; 360 | 361 | init_bool_encoder(vbe, output + partition_step*part_num); 362 | 363 | for (mb_row = part_num; mb_row < mb_height; mb_row+= num_partitions) 364 | { 365 | for (mb_col = 0; mb_col < mb_width; ++mb_col) 366 | { 367 | mb_num = mb_col + mb_row * mb_width; 368 | if (MBs[mb_num].non_zero_coeffs == 0) 369 | continue; 370 | if (MBs[mb_num].parts == are16x16) 371 | { 372 | first_context = 1; // for Y2 373 | tokenize_block(MBs, mb_num, 24, tokens); 374 | encode_block(vbe, coeff_probs, mb_num, 24, tokens, first_context, *(third_context + mb_num*25 + 24)); 375 | first_context = 0; //for Y, when Y2 exists 376 | } else { 377 | first_context = 3; //for Y, when Y2 is absent 378 | } 379 | // then always goes Y 380 | // 16 of them 381 | for (b_num = 0; b_num < 16; ++b_num) 382 | { 383 | tokenize_block(MBs, mb_num, b_num, tokens); 384 | encode_block(vbe, coeff_probs, mb_num, b_num, tokens, first_context, *(third_context + mb_num*25 + b_num)); 385 | } 386 | //now 8 U-blocks 387 | first_context = 2; // for all chromas 388 | for (b_num = 16; b_num < 20; ++b_num) 389 | { 390 | tokenize_block(MBs, mb_num, b_num, tokens); 391 | encode_block(vbe, coeff_probs, mb_num, b_num, tokens, first_context, *(third_context 
+ mb_num*25 + b_num)); 392 | } 393 | //now 8 V-blocks 394 | for (b_num = 20; b_num < 24; ++b_num) 395 | { 396 | tokenize_block(MBs, mb_num, b_num, tokens); 397 | encode_block(vbe, coeff_probs, mb_num, b_num, tokens, first_context, *(third_context + mb_num*25 + b_num)); 398 | } 399 | } 400 | } 401 | flush_bool_encoder(vbe); 402 | partition_sizes[part_num] = vbe->count; 403 | 404 | return; 405 | } 406 | 407 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////// 408 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////// 409 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////// 410 | void tokenize_block_cut(__global macroblock *MBs, int mb_num, int b_num, token tokens[16]) 411 | { 412 | int next = 0; // imaginary 17th element 413 | int i; 414 | for (i = 15; i >= 0; --i) // tokenize block 415 | { 416 | int coeff = (int)MBs[mb_num].coeffs[b_num][i]; 417 | coeff = (coeff < 0) ? 
-coeff : coeff; 418 | if (coeff == 0) { 419 | if (next == 0) { 420 | tokens[i].bits = 0; //dct_eob = "0" 421 | tokens[i].size = 1; 422 | } else { 423 | tokens[i].bits = 2; /* 0 = "10" */ 424 | tokens[i].size = 2; 425 | } 426 | } 427 | else if (coeff == 1) { 428 | tokens[i].bits = 6; /* 1 = "110" */ 429 | tokens[i].size = 3; 430 | } 431 | else if (coeff == 2) { 432 | tokens[i].bits = 28; /* 2 = "11100" */ 433 | tokens[i].size = 5; 434 | } 435 | else if (coeff == 3) { 436 | tokens[i].bits = 58; /* 3 = "111010" */ 437 | tokens[i].size = 6; 438 | } 439 | else if (coeff == 4) { 440 | tokens[i].bits = 59; /* 4 = "111011" */ 441 | tokens[i].size = 6; 442 | } 443 | else if (coeff <= 6) { 444 | tokens[i].bits = 60; /* cat1 = "111100" */ 445 | tokens[i].size = 6; /* range 5 - 6 (size 2) */ 446 | } 447 | else if (coeff <= 10) { 448 | tokens[i].bits = 61; /* cat2 = "111101" */ 449 | tokens[i].size = 6; /* 7 - 10 (4) */ 450 | } 451 | else if (coeff <= 18) { 452 | tokens[i].bits = 124; /* cat3 = "1111100" */ 453 | tokens[i].size = 7; /* 11 - 18 (8) */ 454 | } 455 | else if (coeff <= 34) { 456 | tokens[i].bits = 125; /* cat4 = "1111101" */ 457 | tokens[i].size = 7; /* 19 - 34 (16) */ 458 | } 459 | else if (coeff <= 66) { 460 | tokens[i].bits = 126; /* cat5 = "1111110" */ 461 | tokens[i].size = 7; /* 35 - 66 (32) */ 462 | } 463 | else { 464 | tokens[i].bits = 127; /* cat6 = "1111111" */ 465 | tokens[i].size = 7; /* 67 - 2048 (1982) */ 466 | } 467 | next = tokens[i].bits; 468 | } 469 | return; 470 | } 471 | 472 | 473 | void count_probs_in_block( __global uint *const coeff_probs, 474 | __global uint *const coeff_probs_denom, 475 | const int part_num, 476 | const int mb_num, const int b_num, 477 | token *tokens, 478 | const int ctx1,const int in_ctx3) 479 | { 480 | // ctx1 = 0 for Y beggining at coefficient 1 (when y2 exists) 481 | // = 1 for Y2 482 | // = 2 for U or V 483 | // = 3 for Y beggining at coefficient 0 (when Y2 is absent) 484 | // ctx2 = coefficient position in block 
{(0), 1, 2, 3, ... 15} 485 | // chooses value from coeff_bands[16] = { 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7}; 486 | // ctx3 = for the first(second when ctx1 = 0) coefficient it is equal to number of nearby(above and left only counts) blocks 487 | // with non-zero coefficients 488 | // = for next coefficients it equals: to 0, when previous is zero: to 1, when previous is +1 or -1; to 2 in other cases 489 | // ctx4 = token tree position 490 | int ctx2; 491 | int ctx3 = in_ctx3; 492 | tree_index ctx4; 493 | 494 | int i = ((ctx1 == 0) ? 1 : 0); // maybe (!ctx1) 495 | int prev_is_zero = 0; 496 | 497 | for (; i < 16; ++i) 498 | { 499 | ctx2=coeff_bands[i]; 500 | 501 | // if previous coefficient was DCT_0, then current can't be EOB (inneficient to have 0,0,0,eob) 502 | // since EOB the only token, that has ZERO as highest bit 503 | // then ONE in first bit becomes implicit and doesn't require encoding 504 | 505 | // to handle this we must lower encoding bits size by 1 506 | // and tree_index at 2, instead of 0 (the route is tree[0+1]==2 when we encode "1") 507 | if (prev_is_zero) { 508 | ctx4 = 2; 509 | --(tokens[i].size); 510 | } else ctx4 = 0; 511 | 512 | //__constant Prob *const p = default_coeff_probs[ctx1][ctx2][ctx3]; 513 | 514 | do { 515 | const uchar b = (tokens[i].bits >> (--(tokens[i].size))) & 1; 516 | coeff_probs[((((part_num<<5) + (ctx1<<3)) + ctx2)*3 + ctx3)*11 + (ctx4>>1)] += (1 - b); //increase numerator when b == 0 517 | ++(coeff_probs_denom[((((part_num<<5) + (ctx1<<3)) + ctx2)*3 + ctx3)*11 + (ctx4>>1)]); // increase denominator 518 | ctx4 = coeff_tree[ctx4+b]; 519 | } while (tokens[i].size); 520 | 521 | ctx3 = 2; 522 | if (tokens[i].bits == 6) ctx3 = 1; /* DCT__1 = "110" */ 523 | if (tokens[i].bits == 2) { //DCT__0 == "10" 524 | prev_is_zero = 1; 525 | ctx3 = 0; 526 | } 527 | else { 528 | prev_is_zero = 0; 529 | } 530 | } 531 | return; 532 | } 533 | 534 | __kernel void count_probs( __global macroblock *MBs, 535 | __global uint *coeff_probs, 
536 | __global uint *coeff_probs_denom, 537 | __global uchar *third_context, 538 | int mb_height, 539 | int mb_width, 540 | int num_partitions, 541 | int key_frame, 542 | int partition_step) 543 | { 544 | int part_num = get_global_id(0); 545 | int mb_row, mb_num, mb_col, b_num; 546 | int prev_mb, prev_b; 547 | int i; 548 | int first_context, firstCoeff; 549 | token tokens[16]; 550 | 551 | { 552 | // we have to work with 1-dimensional array in global memory, so 553 | // coeff_probs[p][ctx1][ctx2][ctx3][ctx4] => *(coeff_probs + p*11*3*8*4 + ctx1*11*3*8 + ctx2*11*3 + ctx3*11 + ctx4) 554 | // or coeff_probs[((((p<<5) + (ctx1<<3)) + ctx2)*3 + ctx3)*11 + ctx4] 555 | int ctx1, ctx2, ctx3, ctx4; 556 | for (ctx1 = 0; ctx1 < 4; ++ctx1) 557 | for (ctx2 = 0; ctx2 < 8; ++ctx2) 558 | for (ctx3 = 0; ctx3 < 3; ++ctx3) 559 | for (ctx4 = 0; ctx4 < 11; ++ctx4) { 560 | coeff_probs[((((part_num<<5) + (ctx1<<3)) + ctx2)*3 + ctx3)*11 + ctx4] = 0; 561 | coeff_probs_denom[((((part_num<<5) + (ctx1<<3)) + ctx2)*3 + ctx3)*11 + ctx4] = 1; 562 | } 563 | } 564 | 565 | for (mb_row = part_num; mb_row < mb_height; mb_row+= num_partitions) 566 | { 567 | for (mb_col = 0; mb_col < mb_width; ++mb_col) 568 | { 569 | mb_num = mb_col + mb_row * mb_width; 570 | if (MBs[mb_num].non_zero_coeffs == 0) 571 | continue; 572 | if (MBs[mb_num].parts == are16x16) 573 | { 574 | first_context = 1; // for Y2 575 | *(third_context + mb_num*25 + 24) = 0; 576 | if (mb_row > 0) { // check if "above" Y2 has non zero 577 | // we go up until we find MB with Y2 mode enabled 578 | prev_mb = mb_num-mb_width; 579 | while(prev_mb>=0) { 580 | if (MBs[prev_mb].parts == are16x16) break; 581 | prev_mb-=mb_width; 582 | } 583 | if (prev_mb >= 0) 584 | for (i = 0; i < 16; ++i) { 585 | if (MBs[prev_mb].coeffs[24][i] != 0) { 586 | ++(*(third_context + mb_num*25 + 24)); 587 | break; 588 | } 589 | } 590 | } 591 | if (mb_col > 0) { // check if "left" Y2 has non zero 592 | prev_mb = mb_num-1; 593 | while (prev_mb >= (mb_row*mb_width)) { 594 | 
if (MBs[prev_mb].parts == are16x16) break; 595 | --prev_mb; 596 | } 597 | if (prev_mb >= (mb_row*mb_width)) 598 | for (i = 0; i < 16; ++i) { 599 | if (MBs[prev_mb].coeffs[24][i] != 0) { 600 | ++(*(third_context + mb_num*25 + 24)); 601 | break; 602 | } 603 | } 604 | } 605 | tokenize_block_cut(MBs, mb_num, 24, tokens); 606 | count_probs_in_block(coeff_probs, coeff_probs_denom, part_num, mb_num, 24, tokens, first_context, *(third_context + mb_num*25 + 24)); 607 | first_context = 0; //for Y, when Y2 exists 608 | } else first_context = 3; //for Y, when Y2 is absent 609 | // then always goes Y 610 | // 16 of them 611 | for (b_num = 0; b_num < 16; ++b_num) 612 | { 613 | *(third_context + mb_num*25 + b_num) = 0; 614 | // look above: 615 | prev_mb = -1; // as flag, that above is empty 616 | if ((b_num >> 2) > 0) { // /4 617 | prev_mb = mb_num; 618 | prev_b = b_num - 4; 619 | } 620 | else if (mb_row > 0) { 621 | prev_mb = mb_num - mb_width; 622 | prev_b = b_num + 12; 623 | } 624 | if (prev_mb >= 0) { 625 | firstCoeff = (MBs[prev_mb].parts == are16x16) ? 1 : 0; 626 | for (i = firstCoeff; i < 16; ++i) { 627 | if (MBs[prev_mb].coeffs[prev_b][i] != 0) { 628 | ++(*(third_context + mb_num*25 + b_num)); 629 | break; 630 | } 631 | } 632 | } 633 | // look to the left 634 | prev_mb = -1; 635 | if ((b_num & 3) > 0) { // %4 636 | prev_mb = mb_num; 637 | prev_b = b_num - 1; 638 | } 639 | else if (mb_col > 0) { 640 | prev_mb = mb_num - 1; 641 | prev_b = b_num + 3; 642 | } 643 | if (prev_mb >= 0) { 644 | firstCoeff = (MBs[prev_mb].parts == are16x16) ? 
1 : 0; 645 | for (i = firstCoeff; i < 16; ++i) { 646 | if (MBs[prev_mb].coeffs[prev_b][i] != 0) { 647 | ++(*(third_context + mb_num*25 + b_num)); 648 | break; 649 | } 650 | } 651 | } 652 | tokenize_block_cut(MBs, mb_num, b_num, tokens); 653 | count_probs_in_block(coeff_probs, coeff_probs_denom, part_num, mb_num, b_num, tokens, first_context, *(third_context + mb_num*25 + b_num)); 654 | } 655 | //now 8 U-blocks 656 | first_context = 2; // for all chromas 657 | for (b_num = 16; b_num < 20; ++b_num) 658 | { 659 | *(third_context + mb_num*25 + b_num) = 0; 660 | // look above: 661 | prev_mb = -1; // as flag, that above is empty 662 | if (((b_num-16) >> 1) > 0) { // /2 663 | prev_mb = mb_num; 664 | prev_b = b_num - 2; 665 | } 666 | else if (mb_row > 0) { 667 | prev_mb = mb_num - mb_width; 668 | prev_b = b_num + 2; 669 | } 670 | if (prev_mb >= 0) { 671 | for (i = 0; i < 16; ++i) { 672 | if (MBs[prev_mb].coeffs[prev_b][i] != 0) { 673 | ++(*(third_context + mb_num*25 + b_num)); 674 | break; 675 | } 676 | } 677 | } 678 | // look to the left 679 | prev_mb = -1; 680 | if (((b_num-16) & 1) > 0) { // %2 681 | prev_mb = mb_num; 682 | prev_b = b_num - 1; 683 | } 684 | else if (mb_col > 0) { 685 | prev_mb = mb_num - 1; 686 | prev_b = b_num + 1; 687 | } 688 | if (prev_mb >= 0) { 689 | for (i = 0; i < 16; ++i) { 690 | if (MBs[prev_mb].coeffs[prev_b][i] != 0) { 691 | ++(*(third_context + mb_num*25 + b_num)); 692 | break; 693 | } 694 | } 695 | } 696 | tokenize_block_cut(MBs, mb_num, b_num, tokens); 697 | count_probs_in_block(coeff_probs, coeff_probs_denom, part_num, mb_num, b_num, tokens, first_context, *(third_context + mb_num*25 + b_num)); 698 | } 699 | //now 8 V-blocks 700 | for (b_num = 20; b_num < 24; ++b_num) 701 | { 702 | *(third_context + mb_num*25 + b_num) = 0; 703 | // look above: 704 | prev_mb = -1; // as flag, that above is empty 705 | if (((b_num-20) >> 1) > 0) { // /2 706 | prev_mb = mb_num; 707 | prev_b = b_num - 2; 708 | } 709 | else if (mb_row > 0) { 710 | prev_mb = 
mb_num - mb_width; 711 | prev_b = b_num + 2; 712 | } 713 | if (prev_mb >= 0) { 714 | for (i = 0; i < 16; ++i) { 715 | if (MBs[prev_mb].coeffs[prev_b][i] != 0) { 716 | ++(*(third_context + mb_num*25 + b_num)); 717 | break; 718 | } 719 | } 720 | } 721 | // look to the left 722 | prev_mb = -1; 723 | if (((b_num-20) & 1) > 0) { // %2 724 | prev_mb = mb_num; 725 | prev_b = b_num - 1; 726 | } 727 | else if (mb_col > 0) { 728 | prev_mb = mb_num - 1; 729 | prev_b = b_num + 1; 730 | } 731 | if (prev_mb >= 0) { 732 | for (i = 0; i < 16; ++i) { 733 | if (MBs[prev_mb].coeffs[prev_b][i] != 0) { 734 | ++(*(third_context + mb_num*25 + b_num)); 735 | break; 736 | } 737 | } 738 | } 739 | tokenize_block_cut(MBs, mb_num, b_num, tokens); 740 | count_probs_in_block(coeff_probs, coeff_probs_denom, part_num, mb_num, b_num, tokens, first_context, *(third_context + mb_num*25 + b_num)); 741 | } 742 | } 743 | 744 | } 745 | 746 | return; 747 | } 748 | 749 | __kernel void num_div_denom(__global uint *coeff_probs, 750 | __global uint *coeff_probs_denom, 751 | int num_partitions) 752 | { 753 | int part_num = get_global_id(0); 754 | int ctx1, ctx2, ctx3, ctx4, p; 755 | uint num, denom; 756 | for (ctx1 = 0; ctx1 < 4; ++ctx1) 757 | for (ctx2 = part_num; ctx2 < 8; ctx2 += num_partitions) 758 | for (ctx3 = 0; ctx3 < 3; ++ctx3) 759 | for (ctx4 = 0; ctx4 < 11; ++ctx4) { 760 | num = 0; 761 | denom = 0; 762 | for (p = 0; p < num_partitions; ++p) { 763 | num += coeff_probs[((((p<<5) + (ctx1<<3)) + ctx2)*3 + ctx3)*11 + ctx4]; 764 | denom += coeff_probs_denom[((((p<<5) + (ctx1<<3)) + ctx2)*3 + ctx3)*11 + ctx4]; 765 | } 766 | num = (num << 8) / denom; 767 | coeff_probs[(((ctx1<<3) + ctx2)*3 + ctx3)*11 + ctx4] = (num > 255) ? 255 : ((num == 0) ? 
1 : num); 768 | } 769 | return; 770 | } 771 | 772 | 773 | #ifdef LOOP_FILTER 774 | __kernel void prepare_filter_mask(__global macroblock *const MBs, //0 775 | __global int *const mb_mask, //1 776 | const int width, //2 777 | const int height, //3 778 | const int parts) //4 779 | { 780 | __private int mb_num, b_num, mb_row, mb_col, mb_height, mb_width, i, mask, coeffs, split_mode; 781 | mb_height = height/16; 782 | mb_width = width/16; 783 | 784 | for (mb_row = get_global_id(0); mb_row < mb_height; mb_row += parts) 785 | { 786 | for (mb_col = 0; mb_col < mb_width; ++mb_col) 787 | { 788 | mb_num = mb_row * mb_width + mb_col; 789 | //printf((__constant char*)"%d\n",mb_num); 790 | mask = 0; coeffs = 0; split_mode = MBs[mb_num].parts; 791 | for (b_num = 0; b_num < 16; ++b_num) { 792 | for (i = 1; i < 16; ++i) { 793 | coeffs += (int)abs(MBs[mb_num].coeffs[b_num][i]); 794 | } 795 | } 796 | for (b_num = 16; b_num < 24; ++b_num) { 797 | for (i = 0; i < 16; ++i) { 798 | coeffs += (int)abs(MBs[mb_num].coeffs[b_num][i]); 799 | } 800 | } 801 | if (split_mode == are16x16) { 802 | for (i = 0; i < 16; ++i) { 803 | coeffs += (int)abs(MBs[mb_num].coeffs[24][i]); 804 | } 805 | } 806 | else { 807 | for (b_num = 0; b_num < 16; ++b_num) { 808 | coeffs += (int)abs(MBs[mb_num].coeffs[b_num][0]); 809 | } 810 | } 811 | 812 | MBs[mb_num].non_zero_coeffs = coeffs; 813 | mask = ((split_mode != are16x16) || (coeffs > 0)) ? 
-1 : 0; 814 | mb_mask[mb_num] = mask; 815 | } 816 | } 817 | return; 818 | } 819 | 820 | void filter_mb_edge(short8 *const p3, short8 *const p2, short8 *const p1, short8 *const p0, 821 | short8 *const q0, short8 *const q1, short8 *const q2, short8 *const q3, 822 | ushort mb_lim, ushort int_lim, ushort hev_thr) 823 | { 824 | short8 mask, hev, a, b, w; 825 | 826 | mask = (abs(*p3 - *p2) > int_lim); 827 | mask |= (abs(*p2 - *p1) > int_lim); 828 | mask |= (abs(*p1 - *p0) > int_lim); 829 | mask |= (abs(*q1 - *q0) > int_lim); 830 | mask |= (abs(*q2 - *q1) > int_lim); 831 | mask |= (abs(*q3 - *q2) > int_lim); 832 | mask |= ((abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2) > mb_lim); 833 | mask = ~mask; // for vectors in OpenCL TRUE means -1 (all bits set) 834 | hev = (abs(*p1 - *p0) > hev_thr); 835 | hev |= (abs(*q1 - *q0) > hev_thr); 836 | //w = clamp128(clamp128(p1 - q1) + 3*(q0 - p0)); 837 | w = *p1 - *q1; 838 | w = select(w,-128,w<-128); 839 | w = select(w,127,w>127); 840 | w += (*q0 - *p0) * (short)3; 841 | w = select(w,-128,w<-128); 842 | w = select(w,127,w>127); 843 | w &= mask; 844 | a = w & hev; 845 | // b = clamp128(a+3) >> 3 846 | b = a + 3; 847 | b = select(b,-128,b<-128); 848 | b = select(b,127,b>127); 849 | b >>= 3; 850 | // a = clamp128(a+4) >> 3 851 | a = a + 4; 852 | a = select(a,-128,a<-128); 853 | a = select(a,127,a>127); 854 | a >>= 3; 855 | *q0 -= a; *p0 += b; 856 | w &= ~hev; 857 | //a = clamp128((27*w + 63) >> 7); 858 | a = (w * (short)27 + (short)63) >> 7; 859 | a = select(a,-128,a<-128); 860 | a = select(a,127,a>127); 861 | *q0 -= a; *p0 += a; 862 | //a = clamp128((18*w + 63) >> 7); 863 | a = (w * (short)18 + (short)63) >> 7; 864 | a = select(a,-128,a<-128); 865 | a = select(a,127,a>127); 866 | *q1 -= a; *p1 += a; 867 | //a = clamp128((9*w + 63) >> 7); 868 | a = (w * (short)9 + (short)63) >> 7; 869 | a = select(a,-128,a<-128); 870 | a = select(a,127,a>127); 871 | *q2 -= a; *p2 += a; 872 | 873 | return; 874 | } 875 | 876 | void filter_b_edge(short8 *const 
p3, short8 *const p2, short8 *const p1, short8 *const p0, 877 | short8 *const q0, short8 *const q1, short8 *const q2, short8 *const q3, 878 | ushort b_lim, ushort int_lim, ushort hev_thr) 879 | { 880 | short8 mask, hev, a, b; 881 | 882 | mask = (abs(*p3 - *p2) > int_lim); 883 | mask |= (abs(*p2 - *p1) > int_lim); 884 | mask |= (abs(*p1 - *p0) > int_lim); 885 | mask |= (abs(*q1 - *q0) > int_lim); 886 | mask |= (abs(*q2 - *q1) > int_lim); 887 | mask |= (abs(*q3 - *q2) > int_lim); 888 | mask |= ((abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2) > b_lim); 889 | mask = ~mask; // for vectors in OpenCL TRUE means -1 (all bits set) 890 | hev = (abs(*p1 - *p0) > hev_thr); 891 | hev |= (abs(*q1 - *q0) > hev_thr); 892 | //a = clamp128((use_outer_taps? clamp128(p1 - q1) : 0) + 3*(q0 - p0)); 893 | a = *p1 - *q1; 894 | a = select(a,-128,a<-128); 895 | a = select(a,127,a>127); 896 | a &= hev; 897 | a += (*q0 - *p0) * (short)3; 898 | a = select(a,-128,a<-128); 899 | a = select(a,127,a>127); 900 | a &= mask; 901 | // b = clamp128(a+3) >> 3 902 | b = a + 3; 903 | b = select(b,-128,b<-128); 904 | b = select(b,127,b>127); 905 | b >>= 3; 906 | // a = clamp128(a+4) >> 3 907 | a = a + 4; 908 | a = select(a,-128,a<-128); 909 | a = select(a,127,a>127); 910 | a >>= 3; 911 | *q0 -= a; *p0 += b; 912 | a = (a + 1) >> 1; 913 | a &= ~hev; 914 | *q1 -= a; *p1 += a; 915 | 916 | return; 917 | } 918 | 919 | void read8p(__global uchar *const frame, const int pos, const int step, short8 *const V) 920 | { 921 | int i = pos; 922 | (*V).s0 = (short)frame[i] - 128; i += step; 923 | (*V).s1 = (short)frame[i] - 128; i += step; 924 | (*V).s2 = (short)frame[i] - 128; i += step; 925 | (*V).s3 = (short)frame[i] - 128; i += step; 926 | (*V).s4 = (short)frame[i] - 128; i += step; 927 | (*V).s5 = (short)frame[i] - 128; i += step; 928 | (*V).s6 = (short)frame[i] - 128; i += step; 929 | (*V).s7 = (short)frame[i] - 128; 930 | return; 931 | } 932 | 933 | void write8p(__global uchar *const frame, const int pos, const int step, 
short8 *const V) 934 | { 935 | int i = pos; 936 | uchar8 buf; 937 | buf = convert_uchar8_sat(*V + 128); 938 | frame[i] = buf.s0; i += step; 939 | frame[i] = buf.s1; i += step; 940 | frame[i] = buf.s2; i += step; 941 | frame[i] = buf.s3; i += step; 942 | frame[i] = buf.s4; i += step; 943 | frame[i] = buf.s5; i += step; 944 | frame[i] = buf.s6; i += step; 945 | frame[i] = buf.s7; 946 | return; 947 | } 948 | 949 | __kernel void loop_filter_frame(__global uchar *const frame, //0 950 | __global macroblock *const MBs, //1 951 | __global int *const mb_mask, //2 952 | __constant const segment_data *const SD, //3 953 | const int width, //4 954 | const int height, //5 955 | const int mb_size) //6 956 | { 957 | __private int mb_num, mb_width, mb_count; 958 | __private int x0,y0,x,y,i; 959 | __private short int_lim, mb_lim, b_lim, hev_thr; 960 | __private short8 p3,p2,p1,p0,q0,q1,q2,q3; 961 | 962 | if (get_global_id(0) != 0) return; //there can be only one 963 | 964 | mb_width = width/mb_size; 965 | mb_count = mb_width*(height/mb_size); 966 | 967 | for (mb_num = 0; mb_num < mb_count; ++mb_num) 968 | { 969 | i = MBs[mb_num].segment_id; 970 | if (SD[i].loop_filter_level == 0) return; 971 | int_lim = (short)SD[i].interior_limit; 972 | mb_lim = (short)SD[i].mbedge_limit; 973 | b_lim = (short)SD[i].sub_bedge_limit; 974 | hev_thr = (short)SD[i].hev_threshold; 975 | 976 | x0 = (mb_num%mb_width)*mb_size; 977 | y0 = (mb_num/mb_width)*mb_size; 978 | 979 | // horizontal 980 | for (y = y0; (y-y0) < mb_size; y += 8) 981 | { 982 | x = x0; 983 | i = y * width + x; read8p(frame,i,width,&q0); 984 | ++i; read8p(frame,i,width,&q1); 985 | ++i; read8p(frame,i,width,&q2); 986 | ++i; read8p(frame,i,width,&q3); 987 | if (x0>0) 988 | { 989 | i = y * width + x-4; read8p(frame,i,width,&p3); 990 | ++i; read8p(frame,i,width,&p2); 991 | ++i; read8p(frame,i,width,&p1); 992 | ++i; read8p(frame,i,width,&p0); 993 | filter_mb_edge(&p3,&p2,&p1,&p0,&q0,&q1,&q2,&q3,mb_lim,int_lim,hev_thr); 994 | i = y * width + 
x-3; write8p(frame,i,width,&p2); 995 | ++i; write8p(frame,i,width,&p1); 996 | ++i; write8p(frame,i,width,&p0); 997 | ++i; write8p(frame,i,width,&q0); 998 | ++i; write8p(frame,i,width,&q1); 999 | ++i; write8p(frame,i,width,&q2); 1000 | } 1001 | 1002 | for (x = x0 + 4; ((x-x0) < mb_size) && (mb_mask[mb_num]); x += 4) 1003 | { 1004 | p3 = q0; p2 = q1; p1 = q2; p0 = q3; 1005 | i = y * width + x; read8p(frame,i,width,&q0); 1006 | ++i; read8p(frame,i,width,&q1); 1007 | ++i; read8p(frame,i,width,&q2); 1008 | ++i; read8p(frame,i,width,&q3); 1009 | filter_b_edge(&p3,&p2,&p1,&p0,&q0,&q1,&q2,&q3,b_lim,int_lim,hev_thr); 1010 | i = y * width + x-2; write8p(frame,i,width,&p1); 1011 | ++i; write8p(frame,i,width,&p0); 1012 | ++i; write8p(frame,i,width,&q0); 1013 | ++i; write8p(frame,i,width,&q1); 1014 | } 1015 | } 1016 | 1017 | // vertically 1018 | 1019 | for (x = x0; (x-x0) < mb_size; x += 8) 1020 | { 1021 | y = y0; 1022 | i = y * width + x; q0 = convert_short8(vload8(0, frame + i)) - 128; 1023 | i += width; q1 = convert_short8(vload8(0, frame + i)) - 128; 1024 | i += width; q2 = convert_short8(vload8(0, frame + i)) - 128; 1025 | i += width; q3 = convert_short8(vload8(0, frame + i)) - 128; 1026 | if (y0 > 0) 1027 | { 1028 | i = (y-4) * width + x; p3 = convert_short8(vload8(0, frame + i)) - 128; 1029 | i += width; p2 = convert_short8(vload8(0, frame + i)) - 128; 1030 | i += width; p1 = convert_short8(vload8(0, frame + i)) - 128; 1031 | i += width; p0 = convert_short8(vload8(0, frame + i)) - 128; 1032 | filter_mb_edge(&p3,&p2,&p1,&p0,&q0,&q1,&q2,&q3,mb_lim,int_lim,hev_thr); 1033 | i = (y-3) * width + x; vstore8(convert_uchar8_sat(p2 + 128),0,frame + i); 1034 | i += width; vstore8(convert_uchar8_sat(p1 + 128), 0, frame + i); 1035 | i += width; vstore8(convert_uchar8_sat(p0 + 128), 0, frame + i); 1036 | i += width; vstore8(convert_uchar8_sat(q0 + 128), 0, frame + i); 1037 | i += width; vstore8(convert_uchar8_sat(q1 + 128), 0, frame + i); 1038 | i += width; 
vstore8(convert_uchar8_sat(q2 + 128), 0, frame + i); 1039 | } 1040 | 1041 | for (y = y0 + 4; ((y - y0) < mb_size) && (mb_mask[mb_num]); y += 4) 1042 | { 1043 | p3 = q0; p2 = q1; p1 = q2; p0 = q3; 1044 | i = y * width + x; q0 = convert_short8(vload8(0, frame + i)) - 128; 1045 | i += width; q1 = convert_short8(vload8(0, frame + i)) - 128; 1046 | i += width; q2 = convert_short8(vload8(0, frame + i)) - 128; 1047 | i += width; q3 = convert_short8(vload8(0, frame + i)) - 128; 1048 | filter_b_edge(&p3,&p2,&p1,&p0,&q0,&q1,&q2,&q3,b_lim,int_lim,hev_thr); 1049 | i = (y-2) * width + x; vstore8(convert_uchar8_sat(p1 + 128), 0, frame + i); 1050 | i += width; vstore8(convert_uchar8_sat(p0 + 128), 0, frame + i); 1051 | i += width; vstore8(convert_uchar8_sat(q0 + 128), 0, frame + i); 1052 | i += width; vstore8(convert_uchar8_sat(q1 + 128), 0, frame + i); 1053 | } 1054 | } 1055 | 1056 | } 1057 | 1058 | } 1059 | #endif 1060 | --------------------------------------------------------------------------------