├── .appveyor.yml ├── .travis.yml ├── CMakeLists.txt ├── README.md ├── SW.h ├── SWalloc.h ├── SWbuffer.c ├── SWbuffer.h ├── SWcompress.c ├── SWcompress.h ├── SWcontext.c ├── SWcontext.h ├── SWcore.c ├── SWcore.h ├── SWcpu.c ├── SWcpu.h ├── SWculling.c ├── SWculling.h ├── SWculling_AVX2.c ├── SWculling_AVX512.c ├── SWculling_NEON.c ├── SWculling_Ref.c ├── SWculling_SSE2.c ├── SWculling_rast.inl ├── SWculling_rast_scanline.inl ├── SWculling_rast_tri.inl ├── SWdraw.c ├── SWdraw.h ├── SWframebuffer.c ├── SWframebuffer.h ├── SWintrin.inl ├── SWpixels.h ├── SWprogram.c ├── SWprogram.h ├── SWrasterize.c ├── SWrasterize.h ├── SWtexture.c ├── SWtexture.h ├── SWtypes.h ├── SWzbuffer.c ├── SWzbuffer.h ├── _SW.c ├── _SW_AVX2.c ├── _SW_AVX512.c ├── _SW_NEON.c ├── _SW_SSE2.c ├── img1.jpg ├── img2.jpg ├── img3.jpg └── tests ├── CMakeLists.txt ├── main.c ├── test_buffer.c ├── test_common.h ├── test_context.c ├── test_framebuffer.c ├── test_pixels.c ├── test_program.c ├── test_texture.c └── test_zbuffer.c /.appveyor.yml: -------------------------------------------------------------------------------- 1 | branches: 2 | only: 3 | - master 4 | 5 | image: 6 | - Visual Studio 2013 7 | - Visual Studio 2015 8 | - Visual Studio 2017 9 | 10 | clone_folder: c:\projects\SW 11 | shallow_clone: true 12 | 13 | matrix: 14 | fast_finish: true 15 | 16 | platform: 17 | - x64 18 | - x86 19 | 20 | configuration: 21 | - Debug 22 | - Release 23 | 24 | init: 25 | - set arch= 26 | - if "%PLATFORM%"=="x64" ( set arch= Win64) 27 | - if "%APPVEYOR_BUILD_WORKER_IMAGE%"=="Visual Studio 2017" ( set generator="Visual Studio 15 2017%arch%" ) 28 | - if "%APPVEYOR_BUILD_WORKER_IMAGE%"=="Visual Studio 2015" ( set generator="Visual Studio 14 2015%arch%" ) 29 | - if "%APPVEYOR_BUILD_WORKER_IMAGE%"=="Visual Studio 2013" ( set generator="Visual Studio 12 2013%arch%" ) 30 | 31 | before_build: 32 | - mkdir build 33 | - cd build 34 | - cmake .. 
-G %generator% 35 | 36 | build: 37 | project: build\ALL_BUILD.vcxproj 38 | 39 | test_script: 40 | - tests\%CONFIGURATION%\test_SW 41 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | compiler: gcc 3 | dist: trusty 4 | env: 5 | - PACKAGE_VERSION="0.0.*" 6 | matrix: 7 | include: 8 | - compiler: gcc 9 | addons: 10 | apt: 11 | sources: 12 | - ubuntu-toolchain-r-test 13 | packages: 14 | - g++-6 15 | env: COMPILER=g++-6 16 | script: 17 | - mkdir build 18 | - "(cd build && cmake .. && make)" 19 | - "(cd build/tests && ./test_SW)" 20 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.1) 2 | project(SW) 3 | # GCC-only tuning flags; keyed on the C compiler because only CMAKE_C_FLAGS is modified here. 4 | if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") 5 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DNDEBUG -O3 -ftree-vectorize -msse4.1 -fvariable-expansion-in-unroller -ftree-vectorizer-verbose=0 -ftree-loop-if-convert-stores -std=c99 -Wall -Wno-switch -Wno-attributes -Wno-unused-function -Wno-unknown-pragmas -ffast-math") 6 | endif() 7 | 8 | set(SOURCE_FILES SW.h 9 | SWalloc.h 10 | SWtypes.h 11 | SWcore.h 12 | SWcore.c 13 | SWcontext.h 14 | SWcontext.c 15 | SWculling.h 16 | SWculling.c 17 | SWculling_Ref.c 18 | SWculling_AVX2.c 19 | SWculling_AVX512.c 20 | SWculling_NEON.c 21 | SWculling_SSE2.c 22 | SWculling_rast.inl 23 | SWculling_rast_scanline.inl 24 | SWculling_rast_tri.inl 25 | SWprogram.h 26 | SWprogram.c 27 | SWrasterize.h 28 | SWrasterize.c 29 | SWframebuffer.h 30 | SWframebuffer.c 31 | SWintrin.inl 32 | SWdraw.h 33 | SWdraw.c 34 | SWtexture.h 35 | SWtexture.c 36 | SWbuffer.h 37 | SWbuffer.c 38 | SWzbuffer.h 39 | SWzbuffer.c 40 | SWpixels.h 41 | SWcpu.h 42 | SWcpu.c 43 | SWcompress.h 44 | SWcompress.c) 45 | 46 | list(APPEND ALL_SOURCE_FILES 
${SOURCE_FILES}) 47 | set_source_files_properties(${SOURCE_FILES} PROPERTIES HEADER_FILE_ONLY 1) 48 | source_group("src" FILES ${SOURCE_FILES}) 49 | 50 | list(APPEND ALL_SOURCE_FILES _SW.c _SW_SSE2.c _SW_AVX2.c _SW_AVX512.c _SW_NEON.c) 51 | source_group("src" FILES _SW.c _SW_SSE2.c _SW_AVX2.c _SW_AVX512.c _SW_NEON.c) 52 | # Strip the MSVC runtime-check switches (/RTC1, /RTCs, /RTCu, /RTCsu) from the Debug C flags. 53 | string(REGEX REPLACE "/RTC(su|[1su])" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") 54 | # Per-file instruction-set flags for the _SW_*.c translation units. 55 | if(MSVC) 56 | if(NOT CMAKE_CL_64) 57 | set_source_files_properties(_SW_SSE2.c PROPERTIES COMPILE_FLAGS /arch:SSE2) 58 | else() 59 | set_source_files_properties(_SW_SSE2.c PROPERTIES COMPILE_FLAGS "/O2 /Ob2") 60 | endif() 61 | if (CMAKE_C_COMPILER_ID MATCHES "Clang") 62 | set_source_files_properties(_SW.c PROPERTIES COMPILE_FLAGS -msse4.1) 63 | set_source_files_properties(_SW_SSE2.c PROPERTIES COMPILE_FLAGS -msse4.1) 64 | endif() 65 | set_source_files_properties(_SW_AVX2.c PROPERTIES COMPILE_FLAGS "/arch:AVX2 /O2 /Ob2") 66 | set_source_files_properties(_SW_AVX512.c PROPERTIES COMPILE_FLAGS "/arch:AVX512 /O2 /Ob2") 67 | else() 68 | set_source_files_properties(_SW_SSE2.c PROPERTIES COMPILE_FLAGS "-msse2 -O2") 69 | set_source_files_properties(_SW_AVX2.c PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -O2") 70 | set_source_files_properties(_SW_AVX512.c PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw -mavx512dq -O2") 71 | endif() 72 | 73 | add_library(SW STATIC ${ALL_SOURCE_FILES}) 74 | set_target_properties(SW PROPERTIES UNITY_BUILD OFF) 75 | 76 | add_subdirectory(tests) 77 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build status](https://ci.appveyor.com/api/projects/status/46ky3ltwvain51u6?svg=true)](https://ci.appveyor.com/project/MrApfel1994/sw) 2 | # SW 3 | Simple software rasterizer with OpenGL-like interface. 4 | It's neither fast nor good, created for learning purposes. 
5 | Inspired by this article: https://web.archive.org/web/20180129085015/http://forum.devmaster.net/t/advanced-rasterization/6145 6 | 7 | - Full Application: https://bitbucket.org/Apfel1994/swdemo 8 | 9 | ![Screenshot](img1.jpg)|![Screenshot](img2.jpg)|![Screenshot](img3.jpg) 10 | :-------------------------:|:-------------------------:|:-------------------------: 11 | 12 | ### Drawing 13 | 14 | ```cpp 15 | swUseProgram(program); 16 | 17 | swBindBuffer(SW_ARRAY_BUFFER, attribs_buf_id); 18 | swBindBuffer(SW_INDEX_BUFFER, indices_buf_id); 19 | 20 | swSetUniform(U_MVP, SW_MAT4, &mvp_mat[0][0]); 21 | 22 | const int stride = sizeof(float) * 8; 23 | swVertexAttribPointer(A_POS, 3 * sizeof(float), (SWuint)stride, (void *)0); 24 | swVertexAttribPointer(A_UVS, 2 * sizeof(float), (SWuint)stride, (void *)(6 * sizeof(float))); 25 | 26 | swDrawElements(SW_TRIANGLE_STRIP, (SWuint)num_indices, SW_UNSIGNED_SHORT, (void *)uintptr_t(offset)); 27 | ``` 28 | 29 | ### Shaders (not really, just function pointers) 30 | 31 | ```cpp 32 | enum { A_POS, 33 | A_UVS }; 34 | 35 | enum { V_UVS }; 36 | 37 | enum { U_MVP, 38 | U_AMBIENT }; 39 | 40 | VSHADER environment_vs(VS_IN, VS_OUT) { 41 | using namespace glm; 42 | 43 | *(vec2 *)V_FVARYING(V_UVS) = make_vec2(V_FATTR(A_UVS)); 44 | *(vec4 *)V_POS_OUT = make_mat4(F_UNIFORM(U_MVP)) * vec4(make_vec3(V_FATTR(A_POS)), 1); 45 | } 46 | 47 | FSHADER environment_fs(FS_IN, FS_OUT) { 48 | using namespace glm; 49 | 50 | const vec4 &fl = make_vec4(F_UNIFORM(U_FLASHLIGHT_POS)); 51 | 52 | TEXTURE(DIFFUSEMAP_SLOT, F_FVARYING_IN(V_UVS), F_COL_OUT); 53 | *(vec3 *) F_COL_OUT *= make_vec3(F_UNIFORM(U_AMBIENT)); 54 | } 55 | 56 | ... 
57 | 58 | SWint program = swCreateProgram(); 59 | swUseProgram(program); 60 | swInitProgram(environment_vs, environment_fs, 2); 61 | 62 | swRegisterUniformv(U_MVP, SW_MAT4, 1); 63 | swRegisterUniformv(U_AMBIENT, SW_VEC3, 1); 64 | ``` 65 | 66 | -------------------------------------------------------------------------------- /SW.h: -------------------------------------------------------------------------------- 1 | #ifndef SW_H 2 | #define SW_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include "SWcontext.h" 9 | #include "SWdraw.h" 10 | 11 | #include "SWculling.h" 12 | 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | 17 | #endif /* SW_H */ -------------------------------------------------------------------------------- /SWalloc.h: -------------------------------------------------------------------------------- 1 | #ifndef SW_ALLOC_H 2 | #define SW_ALLOC_H 3 | 4 | #include 5 | #include 6 | /* Returns `size` bytes aligned to `alignment`, or NULL if malloc fails. The pointer obtained from malloc is stored in the sizeof(void *) bytes just below the returned address; release with sw_aligned_free(). */ 7 | static inline void* sw_aligned_malloc(size_t size, size_t alignment) { 8 | assert(alignment > sizeof(void *)); 9 | size_t space = size + (alignment - 1); 10 | 11 | void *ptr = malloc(space + sizeof(void *)); 12 | void *original_ptr = ptr; 13 | if (!ptr) { return NULL; } /* propagate allocation failure instead of writing near address 0 */ 14 | char *ptr_bytes = (char *)ptr; 15 | ptr_bytes += sizeof(void *); 16 | 17 | size_t off = (size_t)((uintptr_t)(ptr_bytes) % alignment); 18 | if (off) { 19 | off = alignment - off; 20 | } 21 | ptr_bytes += off; 22 | assert(((uintptr_t)(ptr_bytes) % alignment) == 0); 23 | 24 | ptr = ptr_bytes; 25 | ptr_bytes -= sizeof(void *); 26 | 27 | memcpy(ptr_bytes, &original_ptr, sizeof(void *)); 28 | 29 | return ptr; 30 | } 31 | /* Frees a pointer returned by sw_aligned_malloc(); NULL is ignored. */ 32 | static inline void sw_aligned_free(void* p) { 33 | if (p) { 34 | free(((void **)p)[-1]); 35 | } 36 | } 37 | 38 | #endif // SW_ALLOC_H -------------------------------------------------------------------------------- /SWbuffer.c: -------------------------------------------------------------------------------- 1 | #include "SWbuffer.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | void 
swBufInit(SWbuffer *b, const SWuint size, const void *data) { 8 | void *_data = malloc(size); 9 | assert(_data); 10 | if (data) { 11 | memcpy(_data, data, size); 12 | } 13 | b->size = size; 14 | b->data = _data; 15 | } 16 | 17 | void swBufDestroy(SWbuffer *b) { 18 | free(b->data); 19 | memset(b, 0, sizeof(SWbuffer)); 20 | } 21 | 22 | void swBufSetData(SWbuffer *b, const SWuint offset, const SWuint size, const void *data) { 23 | assert(b->data); 24 | memcpy((char *)b->data + offset, data, size); 25 | } 26 | 27 | void swBufGetData(const SWbuffer *b, const SWuint offset, const SWuint size, void *data) { 28 | assert(b->data); 29 | memcpy(data, (char *)b->data + offset, size); 30 | } 31 | -------------------------------------------------------------------------------- /SWbuffer.h: -------------------------------------------------------------------------------- 1 | #ifndef SW_BUFFER_H 2 | #define SW_BUFFER_H 3 | 4 | #include "SWcore.h" 5 | 6 | typedef struct SWbuffer { 7 | SWuint size; 8 | void *data; 9 | } SWbuffer; 10 | 11 | void swBufInit(SWbuffer *b, SWuint size, const void *data); 12 | void swBufDestroy(SWbuffer *b); 13 | 14 | void swBufSetData(SWbuffer *b, SWuint offset, SWuint size, const void *data); 15 | void swBufGetData(const SWbuffer *b, SWuint offset, SWuint size, void *data); 16 | 17 | #endif /* SW_BUFFER_H */ -------------------------------------------------------------------------------- /SWcompress.c: -------------------------------------------------------------------------------- 1 | #include "SWcompress.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | typedef struct SWtex_block { 8 | SWubyte col[4][4]; 9 | SWint counter; 10 | } SWtex_block; 11 | 12 | SWint swTexBlockCompare(const SWtex_block *blck1, const SWtex_block *blck2) { 13 | SWint i, j, res = 0; 14 | for (i = 0; i < 4; i++) { 15 | for (j = 0; j < 4; j++) { 16 | res += sw_abs((SWint)blck1->col[i][j] - blck2->col[i][j]); 17 | } 18 | } 19 | return res; 20 | } 21 | 22 | static int 
tex_block_cmp(const void *blck1, const void *blck2) { 23 | // qsort comparator: orders blocks by descending counter * brightness, where brightness is 24 | // 1 plus the sum of the 16 channel values looked up in _sw_ubyte_to_float_table. 25 | 26 | const SWtex_block *b1 = (const SWtex_block *)blck1; 27 | const SWtex_block *b2 = (const SWtex_block *)blck2; 28 | 29 | SWfloat brightness1 = 1, brightness2 = 1; 30 | 31 | SWint i, j; 32 | 33 | extern SWfloat _sw_ubyte_to_float_table[256]; 34 | for (i = 0; i < 4; i++) { 35 | for (j = 0; j < 4; j++) { 36 | brightness1 += _sw_ubyte_to_float_table[b1->col[i][j]]; 37 | brightness2 += _sw_ubyte_to_float_table[b2->col[i][j]]; 38 | } 39 | } 40 | const SWfloat weight1 = b1->counter * brightness1, weight2 = b2->counter * brightness2; 41 | return (weight1 < weight2) - (weight1 > weight2); /* qsort needs <0 / 0 / >0; positive when blck1 is lighter so heavier blocks sort first */ 42 | } 43 | /* Returns the index of the first block whose swTexBlockCompare() distance to blck is below tolerance, or -1 if there is none. */ 44 | SWint swTexBlockFind(SWtex_block *blocks, const SWint num_blocks, const SWtex_block *blck, 45 | const SWint tolerance) { 46 | SWint i; 47 | for (i = 0; i < num_blocks; i++) { 48 | SWint res = swTexBlockCompare(&blocks[i], blck); 49 | if (res < tolerance) 50 | return i; 51 | } 52 | return -1; 53 | } 54 | /* Returns the index of the block with the smallest swTexBlockCompare() distance to blck (0 when num_blocks is 0). */ 55 | SWint swTexBlockClosest(SWtex_block *blocks, const SWint num_blocks, 56 | const SWtex_block *blck) { 57 | SWint i, best = 0, best_res = 255 * 4 * 4 + 1; /* one above the largest possible distance (16 channels x 255) */ 58 | for (i = 0; i < num_blocks; i++) { 59 | SWint res = swTexBlockCompare(&blocks[i], blck); 60 | if (res < best_res) { 61 | if (!res) 62 | return i; 63 | best = i; 64 | best_res = res; 65 | } 66 | } 67 | return best; 68 | } 69 | 70 | void swTexCompress(const void *data, const SWenum mode, const SWint w, const SWint h, 71 | void **out_data, SWint *out_size) { 72 | assert(w > 1 && h > 1); 73 | assert(w % 2 == 0 && h % 2 == 0); 74 | 75 | const SWubyte *pixels = (const SWubyte *)data; 76 | SWint max_num_blocks = sw_max(w / 2, 1) * sw_max(h / 2, 1); 77 | SWtex_block *blocks = (SWtex_block *)malloc(max_num_blocks * sizeof(SWtex_block)); 78 | SWint num_blocks = 0; 79 | SWint step = (mode == SW_RGB) ? 
3 : 4; 80 | SWint i, j, k; 81 | /* InitBlock fills `block` with the 2x2 pixel quad whose top-left corner is (i, j); RGB input gets alpha 255. */ 82 | #define InitBlock(block) \ 83 | block.counter = 1; \ 84 | memcpy(&block.col[0][0], &pixels[step * ((j + 0) * w + (i + 0))], step); \ 85 | memcpy(&block.col[1][0], &pixels[step * ((j + 0) * w + (i + 1))], step); \ 86 | memcpy(&block.col[2][0], &pixels[step * ((j + 1) * w + (i + 0))], step); \ 87 | memcpy(&block.col[3][0], &pixels[step * ((j + 1) * w + (i + 1))], step); \ 88 | if (mode == SW_RGB) { \ 89 | for (k = 0; k < 4; k++) { \ 90 | block.col[k][3] = 255; \ 91 | } \ 92 | } 93 | 94 | for (j = 0; j < h; j += 2) { 95 | for (i = 0; i < w; i += 2) { 96 | SWtex_block cur_block; 97 | InitBlock(cur_block); 98 | 99 | const SWint index = swTexBlockFind(blocks, num_blocks, &cur_block, 72); 100 | if (index == -1) { 101 | blocks[num_blocks++] = cur_block; 102 | } else { 103 | blocks[index].counter++; 104 | } 105 | } 106 | } 107 | 108 | qsort(blocks, num_blocks, sizeof(SWtex_block), tex_block_cmp); 109 | 110 | (*out_size) = 4 * 4 * 256 + (w / 2) * (h / 2); 111 | (*out_data) = malloc((size_t)(*out_size)); 112 | SWubyte *p = (SWubyte *)(*out_data); 113 | 114 | for (i = 0; i < sw_min(256, num_blocks); i++) { 115 | memcpy(p, &blocks[i], 4 * 4); 116 | p += 4 * 4; 117 | } 118 | 119 | for (i = num_blocks; i < 256; i++) { 120 | SWubyte col[4] = {0, 0, 0, 255}; 121 | for (j = 0; j < 4; j++) { 122 | memcpy(&p[j * 4], col, 4); 123 | } 124 | p += 4 * 4; 125 | } 126 | 127 | for (j = 0; j < h; j += 2) { 128 | for (i = 0; i < w; i += 2) { 129 | SWtex_block cur_block; 130 | InitBlock(cur_block); 131 | /* search only the entries that were filled in and written to the palette; scanning 256 unconditionally read past the valid blocks */ 132 | SWubyte index = (SWubyte)swTexBlockClosest(blocks, sw_min(256, num_blocks), &cur_block); 133 | (*p++) = index; 134 | } 135 | } 136 | 137 | free(blocks); 138 | 139 | #undef InitBlock 140 | } 141 | 142 | SWenum swTexDecompress(const void *data, const SWint w, const SWint h, void **out_data, 143 | SWint *out_size) { 144 | assert(w > 1 && h > 1); 145 | assert(w % 2 == 0 && h % 2 == 0); 146 | 147 | const SWubyte *blocks = (const SWubyte *)data; 148 | const SWubyte *pixels = 
(const SWubyte *)data + 4 * 4 * 256; 149 | 150 | SWint has_alpha = 0; 151 | SWint i, j; 152 | for (i = 0; i < 256; i++) { 153 | const SWubyte *block = &blocks[i * 16]; 154 | if (block[4 * 0 + 3] != 255 || block[4 * 1 + 3] != 255 || 155 | block[4 * 2 + 3] != 255 || block[4 * 3 + 3] != 255) { 156 | has_alpha = 1; 157 | break; 158 | } 159 | } 160 | 161 | (*out_size) = w * h * (3 + has_alpha); 162 | (*out_data) = malloc((size_t)(*out_size)); 163 | 164 | SWubyte *p = (SWubyte *)(*out_data); 165 | const SWuint step = has_alpha ? 4 : 3; 166 | for (j = 0; j < h / 2; j++) { 167 | for (i = 0; i < w / 2; i++) { 168 | SWuint index = pixels[j * (w / 2) + i]; 169 | const SWubyte *block = &blocks[index * 16]; 170 | 171 | const SWint x = i * 2, y = j * 2; 172 | memcpy(&p[step * (y * w + x)], &block[0], step); 173 | memcpy(&p[step * (y * w + x + 1)], &block[4], step); 174 | memcpy(&p[step * ((y + 1) * w + x)], &block[8], step); 175 | memcpy(&p[step * ((y + 1) * w + x + 1)], &block[12], step); 176 | } 177 | } 178 | 179 | return has_alpha ? 
SW_RGBA : SW_RGB; 180 | } 181 | -------------------------------------------------------------------------------- /SWcompress.h: -------------------------------------------------------------------------------- 1 | #ifndef SW_COMPRESS_H 2 | #define SW_COMPRESS_H 3 | 4 | #include "SWcore.h" 5 | 6 | void swTexCompress(const void *data, SWenum mode, SWint w, SWint h, void **out_data, 7 | SWint *out_size); 8 | SWenum swTexDecompress(const void *data, SWint w, SWint h, void **out_data, 9 | SWint *out_size); 10 | 11 | #endif /* SW_COMPRESS_H */ 12 | -------------------------------------------------------------------------------- /SWcontext.c: -------------------------------------------------------------------------------- 1 | #include "SWcontext.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | static SWubyte _1byte_tmp_buf[] = {0}; 8 | static SWubyte _1px_tmp_tex[] = {0, 200, 200}; 9 | /* Maps a buffer target enum to its slot in ctx->binded_buffers (offset from SW_ARRAY_BUFFER). */ 10 | sw_inline SWint _swBufferIndex(const SWcontext *ctx, const SWenum type) { 11 | ((void)ctx); 12 | 13 | const SWint i = type - SW_ARRAY_BUFFER; 14 | assert(i < sizeof(ctx->binded_buffers) / sizeof(SWint)); 15 | return i; 16 | } 17 | /* Returns the buffer currently bound to `type`; asserts that the bound index is inside ctx->buffers. */ 18 | sw_inline SWbuffer *_swBindedBuffer(SWcontext *ctx, const SWenum type) { 19 | const SWint i = _swBufferIndex(ctx, type); 20 | const SWint buf = ctx->binded_buffers[i]; 21 | assert(buf >= 0 && buf < sizeof(ctx->buffers) / sizeof(SWbuffer)); /* bound is the element count, not the byte size */ 22 | return &ctx->buffers[buf]; 23 | } 24 | 25 | void swCtxInit(SWcontext *ctx, const SWint w, const SWint h) { 26 | memset(ctx, 0, sizeof(SWcontext)); 27 | ctx->cur_framebuffer = swCtxCreateFramebuffer(ctx, SW_BGRA8888, w, h, 1); 28 | ctx->render_flags = DEFAULT_RENDER_FLAGS; 29 | ctx->curve_tolerance = 8.0f; 30 | 31 | ctx->binded_buffers[0] = -1; 32 | ctx->binded_buffers[1] = -1; 33 | 34 | swCPUInfoInit(&ctx->cpu_info); 35 | 36 | extern SWfloat _sw_ubyte_to_float_table[256]; 37 | if (_sw_ubyte_to_float_table[1] == 0) { 38 | SWint i; 39 | for (i = 1; i < 256; i++) { 40 | _sw_ubyte_to_float_table[i] = (SWfloat)i / 255; 41 | } 42 | } 
43 | } 44 | /* Releases every program, buffer, framebuffer and texture slot in use, then zeroes the context. */ 45 | void swCtxDestroy(SWcontext *ctx) { 46 | for (SWint i = 0; i < ctx->num_programs; i++) { 47 | swCtxDeleteProgram(ctx, i); 48 | } 49 | for (SWint i = 0; i < ctx->num_buffers; i++) { 50 | swCtxDeleteBuffer(ctx, i + 1); /* buffer handles are 1-based: swCtxDeleteBuffer subtracts 1 to get the slot */ 51 | } 52 | for (SWint i = 0; i < ctx->num_framebuffers; i++) { 53 | swFbufDestroy(&ctx->framebuffers[i]); 54 | } 55 | for (SWint i = 0; i < ctx->num_textures; i++) { 56 | swCtxDeleteTexture(ctx, i); 57 | } 58 | swCPUInfoDestroy(&ctx->cpu_info); 59 | memset(ctx, 0, sizeof(SWcontext)); 60 | } 61 | 62 | /*************************************************************************************************/ 63 | /* Creates a buffer in the first free slot (or a new one) and returns its 1-based handle. */ 64 | SWint swCtxCreateBuffer(SWcontext *ctx) { 65 | assert(ctx->num_buffers < sizeof(ctx->buffers) / sizeof(SWbuffer) - 1); 66 | SWint i, index = ctx->num_buffers; 67 | for (i = 0; i < ctx->num_buffers; i++) { 68 | if (ctx->buffers[i].data == NULL) { 69 | index = i; 70 | break; 71 | } 72 | } 73 | 74 | SWbuffer *b = &ctx->buffers[index]; 75 | swBufInit(b, sizeof(SWubyte), &_1byte_tmp_buf); 76 | 77 | if (index == ctx->num_buffers) { 78 | ctx->num_buffers++; 79 | return ctx->num_buffers; 80 | } else { 81 | return index + 1; 82 | } 83 | } 84 | /* Destroys the buffer with 1-based handle `buf`; shrinks num_buffers when it was the last slot. */ 85 | void swCtxDeleteBuffer(SWcontext *ctx, SWint buf) { 86 | buf -= 1; 87 | SWbuffer *b = &ctx->buffers[buf]; 88 | if (b->data != &_1byte_tmp_buf) { 89 | swBufDestroy(b); 90 | } 91 | if (buf == ctx->num_buffers - 1) { 92 | ctx->num_buffers--; 93 | } 94 | } 95 | 96 | void swCtxBindBuffer(SWcontext *ctx, const SWenum type, const SWint buf) { 97 | const SWint i = _swBufferIndex(ctx, type); 98 | ctx->binded_buffers[i] = buf - 1; 99 | } 100 | /* Replaces the storage of the buffer bound to `type` with a fresh allocation of `size` bytes, copied from `data` when it is non-NULL. */ 101 | void swCtxBufferData(SWcontext *ctx, const SWenum type, const SWuint size, 102 | const void *data) { 103 | SWbuffer *b = _swBindedBuffer(ctx, type); 104 | swBufDestroy(b); /* free the previous allocation first, as swCtxTexImage2D does for textures */ swBufInit(b, size, data); 105 | } 106 | 107 | void swCtxBufferSubData(SWcontext *ctx, const SWenum type, const SWuint offset, 108 | const SWuint size, const void *data) { 109 | assert(data); 110 | SWbuffer *b = 
_swBindedBuffer(ctx, type); 111 | swBufSetData(b, offset, size, data); 112 | } 113 | 114 | void swCtxGetBufferSubData(SWcontext *ctx, SWenum type, SWuint offset, SWuint size, 115 | void *data) { 116 | SWbuffer *b = _swBindedBuffer(ctx, type); 117 | swBufGetData(b, offset, size, data); 118 | } 119 | 120 | /*************************************************************************************************/ 121 | 122 | SWint swCtxCreateFramebuffer(SWcontext *ctx, SWenum type, const SWint w, const SWint h, 123 | const SWint with_depth) { 124 | assert(ctx->num_framebuffers < sizeof(ctx->framebuffers) / sizeof(SWframebuffer) - 1); 125 | SWint i, index = ctx->num_framebuffers; 126 | for (i = 0; i < ctx->num_framebuffers; i++) { 127 | if (ctx->framebuffers[i].pixels == NULL) { 128 | index = i; 129 | break; 130 | } 131 | } 132 | 133 | SWframebuffer *f = &ctx->framebuffers[index]; 134 | swFbufInit(f, type, w, h, with_depth); 135 | 136 | if (index == ctx->num_framebuffers) { 137 | return ctx->num_framebuffers++; 138 | } else { 139 | return index; 140 | } 141 | } 142 | 143 | void swCtxDeleteFramebuffer(SWcontext *ctx, const SWint i) { 144 | SWframebuffer *f = &ctx->framebuffers[i]; 145 | swFbufDestroy(f); 146 | } 147 | 148 | void swCtxBindFramebuffer(SWcontext *ctx, const SWint i) { ctx->cur_framebuffer = i; } 149 | 150 | SWint swCtxGetCurFramebuffer(SWcontext *ctx) { return ctx->cur_framebuffer; } 151 | 152 | const void *swCtxGetPixelDataRef(SWcontext *ctx, const SWint i) { 153 | SWframebuffer *f = &ctx->framebuffers[i]; 154 | return f->pixels; 155 | } 156 | 157 | const void *swCtxGetDepthDataRef(SWcontext *ctx, const SWint i) { 158 | SWframebuffer *f = &ctx->framebuffers[i]; 159 | return f->zbuf->depth; 160 | } 161 | 162 | void swCtxBlitPixels(SWcontext *ctx, const SWint x, const SWint y, const SWint pitch, const SWenum type, 163 | const SWenum mode, const SWint w, const SWint h, const void *pixels, const SWfloat scale) { 164 | SWframebuffer *f = 
&ctx->framebuffers[ctx->cur_framebuffer]; 165 | swFbufBlitPixels(f, x, y, pitch, type, mode, w, h, pixels, scale); 166 | } 167 | 168 | void swCtxBlitTexture(SWcontext *ctx, const SWint x, const SWint y, const SWfloat scale) { 169 | SWframebuffer *f = &ctx->framebuffers[ctx->cur_framebuffer]; 170 | SWtexture *t = &ctx->textures[ctx->binded_textures[ctx->active_tex_slot]]; 171 | swFbufBlitTexture(f, x, y, t, scale); 172 | } 173 | 174 | /*************************************************************************************************/ 175 | 176 | SWint swCtxCreateTexture(SWcontext *ctx) { 177 | assert(ctx->num_textures < sizeof(ctx->textures) / sizeof(SWtexture) - 1); 178 | SWint i, index = ctx->num_textures; 179 | for (i = 0; i < ctx->num_textures; i++) { 180 | if (ctx->textures[i].pixels == NULL) { 181 | index = i; 182 | break; 183 | } 184 | } 185 | 186 | SWtexture *t = &ctx->textures[index]; 187 | swTexInit(t, SW_RGB, SW_UNSIGNED_BYTE, 1, 1, _1px_tmp_tex); 188 | 189 | if (index == ctx->num_textures) { 190 | return ctx->num_textures++; 191 | } else { 192 | return index; 193 | } 194 | } 195 | 196 | void swCtxDeleteTexture(SWcontext *ctx, const SWint tex) { 197 | SWtexture *t = &ctx->textures[tex]; 198 | if (t->pixels != _1px_tmp_tex) { 199 | swTexDestroy(t); 200 | } 201 | if (tex == ctx->num_textures - 1) { 202 | ctx->num_textures--; 203 | } 204 | } 205 | 206 | void swCtxActiveTexture(SWcontext *ctx, const SWint slot) { ctx->active_tex_slot = slot; } 207 | 208 | void swCtxBindTexture(SWcontext *ctx, const SWint tex) { 209 | ctx->binded_textures[ctx->active_tex_slot] = tex; 210 | } 211 | 212 | void swCtxTexImage2D(SWcontext *ctx, const SWenum mode, const SWenum type, const SWint w, const SWint h, 213 | const void *pixels) { 214 | SWtexture *t = &ctx->textures[ctx->binded_textures[ctx->active_tex_slot]]; 215 | swTexDestroy(t); 216 | swTexInit(t, mode, type, w, h, pixels); 217 | } 218 | 219 | void swCtxTexImage2DMove_malloced(SWcontext *ctx, const SWenum mode, const 
SWenum type, const SWint w, 220 | const SWint h, void *pixels) { 221 | SWtexture *t = &ctx->textures[ctx->binded_textures[ctx->active_tex_slot]]; 222 | swTexInitMove_malloced(t, mode, type, w, h, pixels); 223 | } 224 | 225 | static void _sw_null_free(void *p) { ((void)p); } 226 | void swCtxTexImage2DConst(SWcontext *ctx, const SWenum mode, const SWenum type, const SWint w, const SWint h, 227 | void *pixels) { 228 | SWtexture *t = &ctx->textures[ctx->binded_textures[ctx->active_tex_slot]]; 229 | swTexInitMove(t, mode, type, w, h, pixels, _sw_null_free); 230 | } 231 | 232 | /*************************************************************************************************/ 233 | 234 | static VSHADER empty_vshader(VS_IN, VS_OUT) { 235 | ((void)attribs); 236 | ((void)index); 237 | ((void)uniforms); 238 | ((void)out_data); 239 | } 240 | static FSHADER empty_fshader(FS_IN, FS_OUT) { 241 | ((void)f_in_data); 242 | ((void)uniforms); 243 | ((void)f_out_data); 244 | ((void)b_discard); 245 | } 246 | 247 | SWint swCtxCreateProgram(SWcontext *ctx) { 248 | assert(ctx->num_programs < sizeof(ctx->programs) / sizeof(SWprogram) - 1); 249 | SWint i, index = ctx->num_programs; 250 | for (i = 0; i < ctx->num_programs; i++) { 251 | if (ctx->programs[i].v_proc == NULL) { 252 | index = i; 253 | break; 254 | } 255 | } 256 | SWprogram *p = &ctx->programs[index]; 257 | swProgInit(p, ctx->uniform_buf, empty_vshader, empty_fshader, 0); 258 | 259 | if (index == ctx->num_programs) { 260 | return ctx->num_programs++; 261 | } else { 262 | return index; 263 | } 264 | } 265 | 266 | void swCtxInitProgram(SWcontext *ctx, vtx_shader_proc v_proc, frag_shader_proc f_proc, 267 | SWint v_out_floats) { 268 | SWprogram *p = &ctx->programs[ctx->cur_program]; 269 | swProgInit(p, ctx->uniform_buf, v_proc, f_proc, v_out_floats); 270 | } 271 | 272 | void swCtxDeleteProgram(SWcontext *ctx, const SWint program) { 273 | SWprogram *p = &ctx->programs[program]; 274 | swProgDestroy(p); 275 | if (program == 
ctx->num_programs - 1) { 276 | ctx->num_programs--; 277 | } 278 | } 279 | 280 | void swCtxUseProgram(SWcontext *ctx, const SWint program) { 281 | assert(program < ctx->num_programs); 282 | ctx->cur_program = program; 283 | } 284 | 285 | void swCtxRegisterUniform(SWcontext *ctx, const SWint index, const SWenum type) { 286 | SWprogram *p = &ctx->programs[ctx->cur_program]; 287 | assert(p->v_proc != empty_vshader && p->f_proc != empty_fshader); 288 | swProgRegUniform(p, index, type); 289 | } 290 | 291 | void swCtxRegisterUniformv(SWcontext *ctx, const SWint index, const SWenum type, const SWint num) { 292 | SWprogram *p = &ctx->programs[ctx->cur_program]; 293 | assert(p->v_proc != empty_vshader && p->f_proc != empty_fshader); 294 | swProgRegUniformv(p, index, type, num); 295 | } 296 | 297 | void swCtxSetUniform(SWcontext *ctx, const SWint index, const SWenum type, const void *data) { 298 | SWprogram *p = &ctx->programs[ctx->cur_program]; 299 | assert(p->v_proc != empty_vshader && p->f_proc != empty_fshader); 300 | swProgSetProgramUniform(p, index, type, data); 301 | } 302 | 303 | void swCtxSetUniformv(SWcontext *ctx, const SWint index, const SWenum type, const SWint num, 304 | const void *data) { 305 | SWprogram *p = &ctx->programs[ctx->cur_program]; 306 | assert(p->v_proc != empty_vshader && p->f_proc != empty_fshader); 307 | swProgSetProgramUniformv(p, index, type, num, data); 308 | } 309 | -------------------------------------------------------------------------------- /SWcontext.h: -------------------------------------------------------------------------------- 1 | #ifndef SW_CONTEXT_H 2 | #define SW_CONTEXT_H 3 | 4 | #include "SWbuffer.h" 5 | #include "SWcore.h" 6 | #include "SWcpu.h" 7 | #include "SWframebuffer.h" 8 | #include "SWprogram.h" 9 | #include "SWtexture.h" 10 | 11 | /* render flags */ 12 | #define DEPTH_TEST_ENABLED (1 << 0) 13 | #define DEPTH_WRITE_ENABLED (1 << 1) 14 | #define BLEND_ENABLED (1 << 2) 15 | #define PERSPECTIVE_CORRECTION_ENABLED (1 << 
3) 16 | #define FAST_PERSPECTIVE_CORRECTION (1 << 4) 17 | 18 | #define DEFAULT_RENDER_FLAGS \ 19 | (DEPTH_TEST_ENABLED | DEPTH_WRITE_ENABLED | PERSPECTIVE_CORRECTION_ENABLED) 20 | 21 | #define SW_UNIFORM_BUF_SIZE 2048 22 | 23 | struct SWcontext { 24 | SWubyte uniform_buf[SW_UNIFORM_BUF_SIZE]; 25 | 26 | SWprogram programs[32]; 27 | SWint num_programs, cur_program; 28 | 29 | SWtexture textures[256]; 30 | SWint num_textures, binded_textures[8], active_tex_slot; 31 | 32 | SWbuffer buffers[256]; 33 | SWint num_buffers, binded_buffers[2]; 34 | 35 | SWframebuffer framebuffers[8]; 36 | SWint num_framebuffers, cur_framebuffer; 37 | 38 | SWuint render_flags; 39 | 40 | SWfloat curve_tolerance; 41 | 42 | SWcpu_info cpu_info; 43 | }; 44 | 45 | void swCtxInit(SWcontext *ctx, SWint w, SWint h); 46 | void swCtxDestroy(SWcontext *ctx); 47 | 48 | /* Vertex buffer operations */ 49 | SWint swCtxCreateBuffer(SWcontext *ctx); 50 | void swCtxDeleteBuffer(SWcontext *ctx, SWint buf); 51 | void swCtxBindBuffer(SWcontext *ctx, SWenum type, SWint buf); 52 | void swCtxBufferData(SWcontext *ctx, SWenum type, SWuint size, const void *data); 53 | void swCtxBufferSubData(SWcontext *ctx, SWenum type, SWuint offset, SWuint size, 54 | const void *data); 55 | void swCtxGetBufferSubData(SWcontext *ctx, SWenum type, SWuint offset, SWuint size, 56 | void *data); 57 | 58 | /* Framebuffer operations */ 59 | SWint swCtxCreateFramebuffer(SWcontext *ctx, SWenum type, SWint w, SWint h, 60 | SWint with_depth); 61 | void swCtxDeleteFramebuffer(SWcontext *ctx, SWint i); 62 | void swCtxBindFramebuffer(SWcontext *ctx, SWint i); 63 | SWint swCtxGetCurFramebuffer(SWcontext *ctx); 64 | const void *swCtxGetPixelDataRef(SWcontext *ctx, SWint i); 65 | const void *swCtxGetDepthDataRef(SWcontext *ctx, SWint i); 66 | 67 | void swCtxBlitPixels(SWcontext *ctx, SWint x, SWint y, SWint pitch, SWenum type, 68 | SWenum mode, SWint w, SWint h, const void *pixels, SWfloat scale); 69 | void swCtxBlitTexture(SWcontext *ctx, SWint x, 
SWint y, SWfloat scale); 70 | 71 | /* Texture operations */ 72 | SWint swCtxCreateTexture(SWcontext *ctx); 73 | void swCtxDeleteTexture(SWcontext *ctx, SWint tex); 74 | void swCtxActiveTexture(SWcontext *ctx, SWint slot); 75 | void swCtxBindTexture(SWcontext *ctx, SWint tex); 76 | void swCtxTexImage2D(SWcontext *ctx, SWenum mode, SWenum type, SWint w, SWint h, 77 | const void *pixels); 78 | void swCtxTexImage2DMove_malloced(SWcontext *ctx, SWenum mode, SWenum type, SWint w, 79 | SWint h, void *pixels); 80 | void swCtxTexImage2DConst(SWcontext *ctx, SWenum mode, SWenum type, SWint w, SWint h, 81 | void *pixels); 82 | 83 | /* Program operations */ 84 | SWint swCtxCreateProgram(SWcontext *ctx); 85 | void swCtxInitProgram(SWcontext *ctx, vtx_shader_proc v_proc, frag_shader_proc f_proc, 86 | SWint v_out_floats); 87 | void swCtxDeleteProgram(SWcontext *ctx, SWint program); 88 | void swCtxUseProgram(SWcontext *ctx, SWint program); 89 | void swCtxRegisterUniform(SWcontext *ctx, SWint index, SWenum type); 90 | void swCtxRegisterUniformv(SWcontext *ctx, SWint index, SWenum type, SWint num); 91 | void swCtxSetUniform(SWcontext *ctx, SWint index, SWenum type, const void *data); 92 | void swCtxSetUniformv(SWcontext *ctx, SWint index, SWenum type, SWint num, 93 | const void *data); 94 | 95 | #endif /* SW_CONTEXT_H */ 96 | -------------------------------------------------------------------------------- /SWcore.c: -------------------------------------------------------------------------------- 1 | #include "SWcore.h" 2 | 3 | #include 4 | #include 5 | 6 | #include "SWcontext.h" 7 | #include "SWcpu.h" 8 | 9 | /***************************************************************************************/ 10 | 11 | SWcontext *sw_cur_context = NULL; 12 | 13 | SWcontext *swCreateContext(const SWint w, const SWint h) { 14 | SWcontext *ctx = (SWcontext *)calloc(1, sizeof(SWcontext)); 15 | swCtxInit(ctx, w, h); 16 | if (!sw_cur_context) { 17 | swMakeCurrent(ctx); 18 | } 19 | return ctx; 20 | } 
21 | 22 | void swMakeCurrent(SWcontext *ctx) { sw_cur_context = ctx; } 23 | 24 | void swDeleteContext(SWcontext *ctx) { 25 | if (ctx == sw_cur_context) { 26 | sw_cur_context = NULL; 27 | } 28 | swCtxDestroy(ctx); 29 | free(ctx); 30 | } 31 | 32 | /***************************************************************************************/ 33 | 34 | SWint swCreateBuffer() { return swCtxCreateBuffer(sw_cur_context); } 35 | 36 | void swDeleteBuffer(const SWint buf) { swCtxDeleteBuffer(sw_cur_context, buf); } 37 | 38 | void swBindBuffer(const SWenum type, const SWint buf) { 39 | swCtxBindBuffer(sw_cur_context, type, buf); 40 | } 41 | 42 | void swBufferData(const SWenum type, const SWuint size, const void *data) { 43 | swCtxBufferData(sw_cur_context, type, size, data); 44 | } 45 | 46 | void swBufferSubData(const SWenum type, const SWuint offset, const SWuint size, 47 | const void *data) { 48 | swCtxBufferSubData(sw_cur_context, type, offset, size, data); 49 | } 50 | 51 | void swGetBufferSubData(const SWenum type, const SWuint offset, const SWuint size, 52 | void *data) { 53 | swCtxGetBufferSubData(sw_cur_context, type, offset, size, data); 54 | } 55 | 56 | /***************************************************************************************/ 57 | 58 | SWint swCreateFramebuffer(const SWenum type, const SWint w, const SWint h, 59 | const SWint with_depth) { 60 | return swCtxCreateFramebuffer(sw_cur_context, type, w, h, with_depth); 61 | } 62 | 63 | void swDeleteFramebuffer(SWint i) { swCtxDeleteFramebuffer(sw_cur_context, i); } 64 | 65 | void swBindFramebuffer(SWint i) { swCtxBindFramebuffer(sw_cur_context, i); } 66 | 67 | SWint swGetCurFramebuffer() { return swCtxGetCurFramebuffer(sw_cur_context); } 68 | 69 | const void *swGetPixelDataRef(SWint i) { return swCtxGetPixelDataRef(sw_cur_context, i); } 70 | 71 | const void *swGetDepthDataRef(SWint i) { return swCtxGetDepthDataRef(sw_cur_context, i); } 72 | 73 | void swBlitPixels(const SWint x, const SWint y, const SWint pitch, 
const SWenum type, 74 | const SWenum mode, const SWint w, const SWint h, const void *pixels, 75 | const SWfloat scale) { 76 | swCtxBlitPixels(sw_cur_context, x, y, pitch, type, mode, w, h, pixels, scale); 77 | } 78 | 79 | void swBlitTexture(const SWint x, const SWint y, const SWfloat scale) { 80 | swCtxBlitTexture(sw_cur_context, x, y, scale); 81 | } 82 | 83 | /***************************************************************************************/ 84 | 85 | SWint swCreateTexture() { return swCtxCreateTexture(sw_cur_context); } 86 | 87 | void swDeleteTexture(const SWint tex) { swCtxDeleteTexture(sw_cur_context, tex); } 88 | 89 | void swActiveTexture(const SWint slot) { swCtxActiveTexture(sw_cur_context, slot); } 90 | 91 | void swBindTexture(const SWint tex) { swCtxBindTexture(sw_cur_context, tex); } 92 | 93 | void swTexImage2D(const SWenum mode, const SWenum type, const SWint w, const SWint h, 94 | const void *pixels) { 95 | swCtxTexImage2D(sw_cur_context, mode, type, w, h, pixels); 96 | } 97 | 98 | void swTexImage2DMove_malloced(const SWenum mode, const SWenum type, const SWint w, 99 | const SWint h, void *pixels) { 100 | swCtxTexImage2DMove_malloced(sw_cur_context, mode, type, w, h, pixels); 101 | } 102 | 103 | void swTexImage2DConst(const SWenum mode, const SWenum type, const SWint w, const SWint h, 104 | void *pixels) { 105 | swCtxTexImage2DConst(sw_cur_context, mode, type, w, h, pixels); 106 | } 107 | 108 | void swTexture(const SWint slot, const SWfloat *uv, SWfloat *col) { 109 | SWtexture *t = &sw_cur_context->textures[sw_cur_context->binded_textures[slot]]; 110 | swTexGetColorFloat_RGBA(t, uv[0], uv[1], col); 111 | } 112 | 113 | /***************************************************************************************/ 114 | 115 | SWint swCreateProgram() { return swCtxCreateProgram(sw_cur_context); } 116 | 117 | void swInitProgram(vtx_shader_proc v_proc, frag_shader_proc f_proc, SWint v_out_floats) { 118 | swCtxInitProgram(sw_cur_context, v_proc, f_proc, 
v_out_floats); 119 | } 120 | 121 | void swDeleteProgram(const SWint program) { swCtxDeleteProgram(sw_cur_context, program); } 122 | 123 | void swUseProgram(const SWint program) { swCtxUseProgram(sw_cur_context, program); } 124 | 125 | /***************************************************************************************/ 126 | 127 | void swEnable(SWenum func) { 128 | if (func == SW_DEPTH_TEST) { 129 | sw_cur_context->render_flags |= DEPTH_TEST_ENABLED; 130 | } else if (func == SW_DEPTH_WRITE) { 131 | sw_cur_context->render_flags |= DEPTH_WRITE_ENABLED; 132 | } else if (func == SW_BLEND) { 133 | sw_cur_context->render_flags |= BLEND_ENABLED; 134 | } else if (func == SW_PERSPECTIVE_CORRECTION) { 135 | sw_cur_context->render_flags |= PERSPECTIVE_CORRECTION_ENABLED; 136 | } else if (func == SW_FAST_PERSPECTIVE_CORRECTION) { 137 | sw_cur_context->render_flags |= FAST_PERSPECTIVE_CORRECTION; 138 | } 139 | } 140 | 141 | void swDisable(const SWenum func) { 142 | if (func == SW_DEPTH_TEST) { 143 | sw_cur_context->render_flags &= ~DEPTH_TEST_ENABLED; 144 | } else if (func == SW_DEPTH_WRITE) { 145 | sw_cur_context->render_flags &= ~DEPTH_WRITE_ENABLED; 146 | } else if (func == SW_BLEND) { 147 | sw_cur_context->render_flags &= ~BLEND_ENABLED; 148 | } else if (func == SW_PERSPECTIVE_CORRECTION) { 149 | sw_cur_context->render_flags &= ~PERSPECTIVE_CORRECTION_ENABLED; 150 | } else if (func == SW_FAST_PERSPECTIVE_CORRECTION) { 151 | sw_cur_context->render_flags &= ~FAST_PERSPECTIVE_CORRECTION; 152 | } 153 | } 154 | 155 | SWint swIsEnabled(const SWenum func) { 156 | if (func == SW_DEPTH_TEST) { 157 | return sw_cur_context->render_flags & DEPTH_TEST_ENABLED; 158 | } else if (func == SW_DEPTH_WRITE) { 159 | return sw_cur_context->render_flags & DEPTH_WRITE_ENABLED; 160 | } else if (func == SW_BLEND) { 161 | return sw_cur_context->render_flags & BLEND_ENABLED; 162 | } else if (func == SW_PERSPECTIVE_CORRECTION) { 163 | return sw_cur_context->render_flags & 
PERSPECTIVE_CORRECTION_ENABLED; 164 | } else if (func == SW_FAST_PERSPECTIVE_CORRECTION) { 165 | return sw_cur_context->render_flags & FAST_PERSPECTIVE_CORRECTION; 166 | } 167 | return 0; 168 | } 169 | 170 | void swClearColor(const SWfloat r, const SWfloat g, const SWfloat b, const SWfloat a) { 171 | SWframebuffer *f = &sw_cur_context->framebuffers[sw_cur_context->cur_framebuffer]; 172 | swFbufClearColorFloat(f, r, g, b, a); 173 | } 174 | 175 | void swClearDepth(const SWfloat val) { 176 | SWframebuffer *f = &sw_cur_context->framebuffers[sw_cur_context->cur_framebuffer]; 177 | swFbufClearDepth(f, val); 178 | } 179 | 180 | SWint swGetInteger(const SWenum what) { 181 | if (what == SW_MAX_VERTEX_UNIFORM_VECTORS) { 182 | return SW_UNIFORM_BUF_SIZE / (4 * sizeof(SWfloat)); 183 | } else if (what == SW_NUM_CPUS) { 184 | return sw_cur_context->cpu_info.num_cpus; 185 | } else if (what == SW_PHYSICAL_MEMORY) { 186 | return (SWint)(sw_cur_context->cpu_info.physical_memory * 1024); 187 | } else { 188 | return -1; 189 | } 190 | } 191 | 192 | SWfloat swGetFloat(const SWenum what) { 193 | if (what == SW_PHYSICAL_MEMORY) { 194 | return sw_cur_context->cpu_info.physical_memory; 195 | } else { 196 | return 0; 197 | } 198 | } 199 | 200 | const char *swGetString(const SWenum what) { 201 | if (what == SW_CPU_VENDOR) { 202 | return sw_cur_context->cpu_info.vendor; 203 | } else if (what == SW_CPU_MODEL) { 204 | return sw_cur_context->cpu_info.model; 205 | } else { 206 | return "Undefined"; 207 | } 208 | } 209 | 210 | void swSetFloat(const SWenum what, const SWfloat val) { 211 | if (what == SW_CURVE_TOLERANCE) { 212 | sw_cur_context->curve_tolerance = val; 213 | } 214 | } -------------------------------------------------------------------------------- /SWcore.h: -------------------------------------------------------------------------------- 1 | #ifndef SW_CORE_H 2 | #define SW_CORE_H 3 | 4 | #include "SWtypes.h" 5 | 6 | typedef enum SWenum { 7 | /* primitive types */ 8 | SW_LINES, 9 | 
SW_LINE_STRIP, 10 | SW_CURVES, 11 | SW_CURVE_STRIP, 12 | SW_TRIANGLES, 13 | SW_TRIANGLE_STRIP, 14 | 15 | /* data types */ 16 | SW_UNSIGNED_BYTE, 17 | SW_UNSIGNED_SHORT, 18 | SW_UNSIGNED_INT, 19 | /* unifom types */ 20 | SW_INT, 21 | SW_FLOAT, 22 | SW_VEC2, 23 | SW_VEC3, 24 | SW_VEC4, 25 | SW_MAT3, 26 | SW_MAT4, 27 | 28 | /* framebuffer types */ 29 | SW_BGRA8888, 30 | SW_FRGBA, 31 | 32 | /* texture types */ 33 | SW_RGB, 34 | SW_RGBA, 35 | SW_BGRA, 36 | 37 | SW_COMPRESSED, 38 | 39 | /* args for swEnable */ 40 | SW_DEPTH_TEST, 41 | SW_DEPTH_WRITE, 42 | SW_BLEND, 43 | SW_PERSPECTIVE_CORRECTION, 44 | SW_FAST_PERSPECTIVE_CORRECTION, 45 | 46 | /* buffer types */ 47 | SW_ARRAY_BUFFER, 48 | SW_INDEX_BUFFER, 49 | 50 | SW_MAX_VERTEX_UNIFORM_VECTORS, 51 | 52 | SW_CURVE_TOLERANCE, 53 | 54 | SW_TEXTURE0 = 0, 55 | 56 | SW_PHYSICAL_MEMORY, 57 | SW_CPU_VENDOR, 58 | SW_CPU_MODEL, 59 | SW_NUM_CPUS, 60 | } SWenum; 61 | 62 | #define SW_TILE_SIZE 8 63 | #define SW_INV_TILE_SIZE (((SWfloat)1) / SW_TILE_SIZE) 64 | #define SW_INV_TILE_STEP (((SWfloat)1) / (SW_TILE_SIZE - 1)); 65 | 66 | typedef struct SWcontext SWcontext; 67 | 68 | /* Context operations */ 69 | struct SWcontext *swCreateContext(SWint w, SWint h); 70 | void swMakeCurrent(struct SWcontext *ctx); 71 | void swDeleteContext(struct SWcontext *ctx); 72 | 73 | /* Vertex buffer operations */ 74 | SWint swCreateBuffer(); 75 | void swDeleteBuffer(SWint buf); 76 | void swBindBuffer(SWenum type, SWint buf); 77 | void swBufferData(SWenum type, SWuint size, const void *data); 78 | void swBufferSubData(SWenum type, SWuint offset, SWuint size, const void *data); 79 | void swGetBufferSubData(SWenum type, SWuint offset, SWuint size, void *data); 80 | 81 | /* Framebuffer operations */ 82 | SWint swCreateFramebuffer(SWenum type, SWint w, SWint h, SWint with_depth); 83 | void swDeleteFramebuffer(SWint i); 84 | void swBindFramebuffer(SWint i); 85 | SWint swGetCurFramebuffer(); 86 | const void *swGetPixelDataRef(SWint i); 87 | const void 
*swGetDepthDataRef(SWint i); 88 | 89 | void swBlitPixels(SWint x, SWint y, SWint pitch, SWenum type, SWenum mode, SWint w, 90 | SWint h, const void *pixels, SWfloat scale); 91 | void swBlitTexture(SWint x, SWint y, SWfloat scale); 92 | 93 | /* Texture operations */ 94 | SWint swCreateTexture(); 95 | void swDeleteTexture(SWint tex); 96 | void swActiveTexture(SWint slot); 97 | void swBindTexture(SWint tex); 98 | void swTexImage2D(SWenum mode, SWenum type, SWint w, SWint h, const void *pixels); 99 | void swTexImage2DMove_malloced(SWenum mode, SWenum type, SWint w, SWint h, void *pixels); 100 | void swTexImage2DConst(SWenum mode, SWenum type, SWint w, SWint h, void *pixels); 101 | void swTexture(SWint slot, const SWfloat *uv, SWfloat *col); 102 | 103 | /* SWtexture.h should be included for these */ 104 | #define swTexture_RGB888(slot, uv, col) \ 105 | { \ 106 | extern SWcontext *sw_cur_context; \ 107 | swTex_RGB888_GetColorFloat_RGBA(&sw_cur_context->textures[slot], uv[0], uv[1], \ 108 | col); \ 109 | } 110 | 111 | /* Program operations */ 112 | SWint swCreateProgram(); 113 | void swInitProgram(vtx_shader_proc v_proc, frag_shader_proc f_proc, SWint v_out_floats); 114 | void swDeleteProgram(SWint program); 115 | void swUseProgram(SWint program); 116 | 117 | void swEnable(SWenum func); 118 | void swDisable(SWenum func); 119 | 120 | SWint swIsEnabled(SWenum func); 121 | 122 | void swClearColor(SWfloat r, SWfloat g, SWfloat b, SWfloat a); 123 | void swClearDepth(SWfloat val); 124 | 125 | SWint swGetInteger(SWenum what); 126 | SWfloat swGetFloat(SWenum what); 127 | const char *swGetString(SWenum what); 128 | 129 | void swSetFloat(SWenum what, SWfloat val); 130 | 131 | #endif /* SW_CORE_H */ 132 | -------------------------------------------------------------------------------- /SWcpu.c: -------------------------------------------------------------------------------- 1 | #include "SWcpu.h" 2 | 3 | #include 4 | #include 5 | 6 | #if defined(_WIN32) && !defined(_M_ARM) && 
!defined(_M_ARM64) 7 | // Windows 8 | #include 9 | #ifdef __GNUC__ 10 | #include 11 | inline void cpuid(int info[4], int InfoType) { 12 | __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); 13 | } 14 | #if defined(__GNUC__) && (__GNUC__ < 9) 15 | inline unsigned long long _xgetbv(unsigned int index) { 16 | unsigned int eax, edx; 17 | __asm__ __volatile__( 18 | "xgetbv;" 19 | : "=a" (eax), "=d"(edx) 20 | : "c" (index) 21 | ); 22 | return ((unsigned long long)edx << 32) | eax; 23 | } 24 | #endif 25 | #else 26 | #define cpuid(info, x) __cpuidex(info, x, 0) 27 | #endif 28 | 29 | #else 30 | 31 | #if !defined(__arm__) && !defined(__aarch64__) && !defined(__EMSCRIPTEN__) && !defined(__ANDROID__) && \ 32 | !defined(_M_ARM) && !defined(_M_ARM64) 33 | // GCC Intrinsics 34 | #include 35 | #include 36 | static inline void cpuid(int info[4], int InfoType) { 37 | __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); 38 | } 39 | #if defined(__GNUC__) && (__GNUC__ < 9) && !defined(__APPLE__) 40 | inline unsigned long long _xgetbv(unsigned int index) { 41 | unsigned int eax, edx; 42 | __asm__ __volatile__( 43 | "xgetbv;" 44 | : "=a" (eax), "=d"(edx) 45 | : "c" (index) 46 | ); 47 | return ((unsigned long long)edx << 32) | eax; 48 | } 49 | #endif 50 | #endif 51 | 52 | #endif 53 | 54 | #ifdef _WIN32 55 | 56 | #include 57 | #include 58 | 59 | #ifdef _MSC_VER 60 | #pragma warning(disable : 4996) 61 | #endif 62 | #endif 63 | 64 | #if defined(__linux) && !defined(__ANDROID__) && !defined(_M_ARM) && !defined(_M_ARM64) 65 | 66 | #include 67 | #include 68 | 69 | ssize_t getline(char **lineptr, size_t *n, FILE *stream); 70 | char *strdup(const char *s); 71 | 72 | #endif 73 | 74 | unsigned long long get_xcr_feature_mask(); 75 | 76 | void swCPUInfoInit(SWcpu_info *info) { 77 | memset(info, 0, sizeof(SWcpu_info)); 78 | 79 | info->vendor = strdup("Unknown"); 80 | //info->model = "Unknown"; 81 | info->num_cpus = 0; 82 | info->physical_memory = 0; 83 | 84 | #if defined(_WIN32) 
&& !defined(_M_ARM) && !defined(_M_ARM64) 85 | int CPUInfo[4] = { -1 }; 86 | unsigned nExIds, i = 0; 87 | char CPUBrandString[0x40]; 88 | // Get the information associated with each extended ID. 89 | cpuid(CPUInfo, 0x80000000); 90 | nExIds = CPUInfo[0]; 91 | for (i = 0x80000000; i <= nExIds; ++i) { 92 | cpuid(CPUInfo, i); 93 | // Interpret CPU brand string 94 | if (i == 0x80000002) 95 | memcpy(CPUBrandString, CPUInfo, sizeof(CPUInfo)); 96 | else if (i == 0x80000003) 97 | memcpy(CPUBrandString + 16, CPUInfo, sizeof(CPUInfo)); 98 | else if (i == 0x80000004) 99 | memcpy(CPUBrandString + 32, CPUInfo, sizeof(CPUInfo)); 100 | } 101 | 102 | info->model = strdup(CPUBrandString); 103 | 104 | SYSTEM_INFO sys_info; 105 | GetSystemInfo(&sys_info); 106 | 107 | info->num_cpus = sys_info.dwNumberOfProcessors; 108 | 109 | MEMORYSTATUSEX statex; 110 | statex.dwLength = sizeof(statex); 111 | GlobalMemoryStatusEx(&statex); 112 | 113 | info->physical_memory = ((statex.ullTotalPhys / 1024.0f) / 1024) / 1024; 114 | 115 | char vendor[13]; 116 | cpuid(CPUInfo, 0); 117 | memcpy(vendor, &CPUInfo[1], 4); // copy EBX 118 | memcpy(vendor + 4, &CPUInfo[2], 4); // copy ECX 119 | memcpy(vendor + 8, &CPUInfo[3], 4); // copy EDX 120 | vendor[12] = '\0'; 121 | #elif !defined(__ANDROID__) && !defined(_M_ARM) && !defined(_M_ARM64) 122 | #if !defined(__APPLE__) 123 | struct sysinfo mem_info; 124 | sysinfo(&mem_info); 125 | long long total_virtual_mem = (long long)mem_info.totalram; 126 | total_virtual_mem *= mem_info.mem_unit; 127 | 128 | info->physical_memory = (SWfloat)(((total_virtual_mem / 1024.0) / 1024) / 1024); 129 | 130 | FILE *cpuinfo = fopen("/proc/cpuinfo", "rb"); 131 | char *arg = 0; 132 | size_t size = 0; 133 | while (getline(&arg, &size, cpuinfo) != -1) { 134 | char *tok = arg; 135 | if ((tok = strtok(tok, " \t:\n")) != NULL) { 136 | if (strcmp(tok, "vendor_id") == 0) { 137 | if ((tok = strtok(NULL, "\t:\n")) != NULL) { 138 | while (*tok && *tok == ' ') tok++; 139 | info->vendor = 
strdup(tok); 140 | } 141 | } else if (strcmp(tok, "model") == 0) { 142 | if ((tok = strtok(NULL, " \t:\n")) != NULL) { 143 | if (strcmp(tok, "name") == 0) { 144 | if ((tok = strtok(NULL, "\t:\n")) != NULL) { 145 | while (*tok && *tok == ' ') tok++; 146 | info->model = strdup(tok); 147 | } 148 | } 149 | } 150 | } else if (strcmp(tok, "processor") == 0) { 151 | info->num_cpus++; 152 | } 153 | } 154 | } 155 | free(arg); 156 | fclose(cpuinfo); 157 | #else // __APPLE__ 158 | info->physical_memory = 0.0f; 159 | #endif 160 | #endif 161 | 162 | #if !defined(__aarch64__) && !defined(_M_ARM) && !defined(_M_ARM64) 163 | int cpu_info[4]; 164 | cpuid(cpu_info, 0); 165 | int ids_count = cpu_info[0]; 166 | 167 | cpuid(cpu_info, 0x80000000u); 168 | //unsigned ex_ids_count = cpu_info[0]; 169 | 170 | // Detect Features 171 | if (ids_count >= 0x00000001) { 172 | cpuid(cpu_info, 0x00000001); 173 | info->sse2_supported = (cpu_info[3] & ((int)1 << 26)) != 0; 174 | info->sse3_supported = (cpu_info[2] & ((int)1 << 0)) != 0; 175 | info->ssse3_supported = (cpu_info[2] & ((int)1 << 9)) != 0; 176 | info->sse41_supported = (cpu_info[2] & ((int)1 << 19)) != 0; 177 | 178 | int os_uses_XSAVE_XRSTORE = (cpu_info[2] & (1 << 27)) != 0; 179 | int os_saves_YMM = 0; 180 | if (os_uses_XSAVE_XRSTORE) { 181 | // Check if the OS will save the YMM registers 182 | // _XCR_XFEATURE_ENABLED_MASK = 0 183 | unsigned long long xcr_feature_mask = get_xcr_feature_mask(); 184 | os_saves_YMM = (xcr_feature_mask & 0x6) != 0; 185 | } 186 | 187 | int cpu_FMA_support = (cpu_info[2] & ((int)1 << 12)) != 0; 188 | 189 | int cpu_AVX_support = (cpu_info[2] & (1 << 28)) != 0; 190 | info->avx_supported = os_saves_YMM && cpu_AVX_support; 191 | 192 | if (ids_count >= 0x00000007) { 193 | cpuid(cpu_info, 0x00000007); 194 | 195 | int cpu_AVX2_support = (cpu_info[1] & (1 << 5)) != 0; 196 | // use fma in conjunction with avx2 support (like microsoft compiler does) 197 | info->avx2_supported = os_saves_YMM && cpu_AVX2_support && 
cpu_FMA_support; 198 | 199 | info->avx512_supported = (cpu_info[1] & (1 << 16)) != 0; // HW_AVX512F 200 | //info->avx512_supported &= (cpu_info[1] & (1 << 28)) != 0; // HW_AVX512CD 201 | //info->avx512_supported &= (cpu_info[1] & (1 << 26)) != 0; // HW_AVX512PF 202 | //info->avx512_supported &= (cpu_info[1] & (1 << 27)) != 0; // HW_AVX512ER 203 | //info->avx512_supported &= (cpu_info[1] & (1 << 31)) != 0; // HW_AVX512VL 204 | info->avx512_supported &= (cpu_info[1] & (1 << 30)) != 0; // HW_AVX512BW 205 | info->avx512_supported &= (cpu_info[1] & (1 << 17)) != 0; // HW_AVX512DQ 206 | //info->avx512_supported &= (cpu_info[1] & (1 << 21)) != 0; // HW_AVX512IFMA 207 | //info->avx512_supported &= (cpu_info[2] & (1 << 1)) != 0; // HW_AVX512VBMI 208 | } 209 | } 210 | #elif defined(__i386__) || defined(__x86_64__) 211 | info->sse2_supported = true; 212 | #endif 213 | } 214 | 215 | void swCPUInfoDestroy(SWcpu_info *info) { 216 | free(info->vendor); 217 | free(info->model); 218 | 219 | memset(info, 0, sizeof(SWcpu_info)); 220 | } 221 | -------------------------------------------------------------------------------- /SWcpu.h: -------------------------------------------------------------------------------- 1 | #ifndef SWCPU_H 2 | #define SWCPU_H 3 | 4 | #include "SWtypes.h" 5 | 6 | typedef struct SWcpu_info { 7 | char *vendor; 8 | char *model; 9 | SWint num_cpus; 10 | SWfloat physical_memory; 11 | // features 12 | unsigned sse2_supported : 1; 13 | unsigned sse3_supported : 1; 14 | unsigned ssse3_supported : 1; 15 | unsigned sse41_supported : 1; 16 | unsigned avx_supported : 1; 17 | unsigned avx2_supported : 1; 18 | unsigned avx512_supported : 1; 19 | } SWcpu_info; 20 | 21 | void swCPUInfoInit(SWcpu_info *info); 22 | void swCPUInfoDestroy(SWcpu_info *info); 23 | 24 | #endif // SWCPU_H 25 | -------------------------------------------------------------------------------- /SWculling.c: -------------------------------------------------------------------------------- 1 | #include 
"SWculling.h" 2 | 3 | #include "SWalloc.h" 4 | #include "SWintrin.inl" 5 | 6 | SWint _swProcessTrianglesIndexed_Ref(SWcull_ctx *ctx, const void *attribs, const SWuint *indices, SWuint stride, 7 | SWuint index_count, const SWfloat *xform, SWint is_occluder); 8 | SWint _swCullCtxTestRect_Ref(const SWcull_ctx *ctx, const SWfloat p_min[2], const SWfloat p_max[3], 9 | const SWfloat w_min); 10 | void _swCullCtxClearBuf_Ref(SWcull_ctx *ctx); 11 | void _swCullCtxDebugDepth_Ref(const SWcull_ctx *ctx, SWfloat *out_depth); 12 | 13 | #if defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64) 14 | SWint _swProcessTrianglesIndexed_NEON(SWcull_ctx *ctx, const void *attribs, const SWuint *indices, SWuint stride, 15 | SWuint index_count, const SWfloat *xform, SWint is_occluder); 16 | SWint _swCullCtxTestRect_NEON(const SWcull_ctx *ctx, const SWfloat p_min[2], const SWfloat p_max[3], 17 | const SWfloat w_min); 18 | void _swCullCtxClearBuf_NEON(SWcull_ctx *ctx); 19 | void _swCullCtxDebugDepth_NEON(const SWcull_ctx *ctx, SWfloat *out_depth); 20 | #else // defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64) 21 | SWint _swProcessTrianglesIndexed_SSE2(SWcull_ctx *ctx, const void *attribs, const SWuint *indices, SWuint stride, 22 | SWuint index_count, const SWfloat *xform, SWint is_occluder); 23 | SWint _swProcessTrianglesIndexed_AVX2(SWcull_ctx *ctx, const void *attribs, const SWuint *indices, SWuint stride, 24 | SWuint index_count, const SWfloat *xform, SWint is_occluder); 25 | 26 | SWint _swCullCtxTestRect_SSE2(const SWcull_ctx *ctx, const SWfloat p_min[2], const SWfloat p_max[3], 27 | const SWfloat w_min); 28 | SWint _swCullCtxTestRect_AVX2(const SWcull_ctx *ctx, const SWfloat p_min[2], const SWfloat p_max[3], 29 | const SWfloat w_min); 30 | 31 | void _swCullCtxClearBuf_SSE2(SWcull_ctx *ctx); 32 | void _swCullCtxClearBuf_AVX2(SWcull_ctx *ctx); 33 | 34 | void _swCullCtxDebugDepth_SSE2(const SWcull_ctx *ctx, SWfloat *out_depth); 35 | void _swCullCtxDebugDepth_AVX2(const 
SWcull_ctx *ctx, SWfloat *out_depth); 36 | 37 | #if !defined(_MSC_VER) || _MSC_VER > 1916 38 | SWint _swProcessTrianglesIndexed_AVX512(SWcull_ctx *ctx, const void *attribs, const SWuint *indices, SWuint stride, 39 | SWuint index_count, const SWfloat *xform, SWint is_occluder); 40 | SWint _swCullCtxTestRect_AVX512(const SWcull_ctx *ctx, const SWfloat p_min[2], const SWfloat p_max[3], 41 | const SWfloat w_min); 42 | void _swCullCtxClearBuf_AVX512(SWcull_ctx *ctx); 43 | void _swCullCtxDebugDepth_AVX512(const SWcull_ctx *ctx, SWfloat *out_depth); 44 | #endif 45 | #endif // defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64) 46 | 47 | void swCullCtxInit(SWcull_ctx *ctx, const SWint w, const SWint h, SWfloat near_clip) { 48 | swCPUInfoInit(&ctx->cpu_info); 49 | 50 | ctx->ztiles = NULL; 51 | swCullCtxResize(ctx, w, h, near_clip); 52 | 53 | swCullCtxClear(ctx); 54 | } 55 | 56 | void swCullCtxDestroy(SWcull_ctx *ctx) { 57 | swCPUInfoDestroy(&ctx->cpu_info); 58 | sw_aligned_free(ctx->ztiles); 59 | memset(ctx, 0, sizeof(SWcull_ctx)); 60 | } 61 | 62 | void swCullCtxResize(SWcull_ctx *ctx, const SWint w, const SWint h, SWfloat near_clip) { 63 | if (ctx->w == w && ctx->h == h && ctx->near_clip == near_clip) { 64 | return; 65 | } 66 | 67 | ctx->w = w; 68 | ctx->h = h; 69 | 70 | ctx->half_w = (SWfloat)w / 2; 71 | ctx->half_h = (SWfloat)h / 2; 72 | 73 | #if defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64) 74 | ctx->tile_size_y = 4; 75 | ctx->subtile_size_y = 4; 76 | ctx->tri_indexed_proc = (SWCullTrianglesIndexedProcType)&_swProcessTrianglesIndexed_NEON; 77 | ctx->test_rect_proc = &_swCullCtxTestRect_NEON; 78 | ctx->clear_buf_proc = &_swCullCtxClearBuf_NEON; 79 | ctx->debug_depth_proc = (SWCullDebugDepthProcType)&_swCullCtxDebugDepth_NEON; 80 | #else 81 | #if !defined(_MSC_VER) || _MSC_VER > 1916 82 | if (ctx->cpu_info.avx512_supported) { 83 | ctx->tile_size_y = 16; 84 | ctx->subtile_size_y = 4; 85 | ctx->tri_indexed_proc = 
(SWCullTrianglesIndexedProcType)&_swProcessTrianglesIndexed_AVX512; 86 | ctx->test_rect_proc = &_swCullCtxTestRect_AVX512; 87 | ctx->clear_buf_proc = &_swCullCtxClearBuf_AVX512; 88 | ctx->debug_depth_proc = (SWCullDebugDepthProcType)&_swCullCtxDebugDepth_AVX512; 89 | } else 90 | #endif 91 | if (ctx->cpu_info.avx2_supported) { 92 | ctx->tile_size_y = 8; 93 | ctx->subtile_size_y = 4; 94 | ctx->tri_indexed_proc = (SWCullTrianglesIndexedProcType)&_swProcessTrianglesIndexed_AVX2; 95 | ctx->test_rect_proc = &_swCullCtxTestRect_AVX2; 96 | ctx->clear_buf_proc = &_swCullCtxClearBuf_AVX2; 97 | ctx->debug_depth_proc = (SWCullDebugDepthProcType)&_swCullCtxDebugDepth_AVX2; 98 | } else if (ctx->cpu_info.sse2_supported) { 99 | ctx->tile_size_y = 4; 100 | ctx->subtile_size_y = 4; 101 | ctx->tri_indexed_proc = (SWCullTrianglesIndexedProcType)&_swProcessTrianglesIndexed_SSE2; 102 | ctx->test_rect_proc = &_swCullCtxTestRect_SSE2; 103 | ctx->clear_buf_proc = &_swCullCtxClearBuf_SSE2; 104 | ctx->debug_depth_proc = (SWCullDebugDepthProcType)&_swCullCtxDebugDepth_SSE2; 105 | } else 106 | #endif 107 | { 108 | ctx->tile_size_y = 1; 109 | ctx->subtile_size_y = 1; 110 | ctx->tri_indexed_proc = (SWCullTrianglesIndexedProcType)&_swProcessTrianglesIndexed_Ref; 111 | ctx->test_rect_proc = &_swCullCtxTestRect_Ref; 112 | ctx->clear_buf_proc = &_swCullCtxClearBuf_Ref; 113 | ctx->debug_depth_proc = (SWCullDebugDepthProcType)&_swCullCtxDebugDepth_Ref; 114 | } 115 | 116 | assert((w % SW_CULL_SUBTILE_X == 0) && (h % ctx->subtile_size_y == 0)); 117 | 118 | ctx->tile_w = (w + (SW_CULL_TILE_SIZE_X - 1)) / SW_CULL_TILE_SIZE_X; 119 | ctx->tile_h = (h + (ctx->tile_size_y - 1)) / ctx->tile_size_y; 120 | 121 | const int tile_size = SW_CULL_TILE_SIZE_X * ctx->tile_size_y / 8 + 2 * sizeof(float) * 122 | (SW_CULL_TILE_SIZE_X / SW_CULL_SUBTILE_X) * 123 | (ctx->tile_size_y / ctx->subtile_size_y); 124 | 125 | ctx->ztiles_mem_size = ctx->tile_w * ctx->tile_h * tile_size; 126 | sw_aligned_free(ctx->ztiles); 127 | 
ctx->ztiles = sw_aligned_malloc(ctx->ztiles_mem_size, 64); 128 | 129 | assert((uintptr_t)ctx->size_ivec4 % 16 == 0); 130 | __m128i *size_ivec4 = (__m128i *)ctx->size_ivec4; 131 | (*size_ivec4) = _mm128_setr_epi32(ctx->w, ctx->w, ctx->h, ctx->h); 132 | 133 | assert((uintptr_t)ctx->half_size_vec4 % 16 == 0); 134 | __m128 *half_size = (__m128 *)ctx->half_size_vec4; 135 | (*half_size) = _mm128_setr_ps(ctx->half_w, ctx->half_w, ctx->half_h, ctx->half_h); 136 | 137 | const SWfloat pad_w = ((SWfloat)2) / ctx->w; 138 | const SWfloat pad_h = ((SWfloat)2) / ctx->h; 139 | 140 | assert((uintptr_t)ctx->clip_planes % 16 == 0); 141 | __m128 *clip_planes = (__m128 *)ctx->clip_planes; 142 | clip_planes[0] = _mm128_setr_ps(1.0f - pad_w, 0.0f, 1.0f, 0.0f); 143 | clip_planes[1] = _mm128_setr_ps(-1.0f + pad_w, 0.0f, 1.0f, 0.0f); 144 | clip_planes[2] = _mm128_setr_ps(0.0f, -1.0f + pad_h, 1.0f, 0.0f); 145 | clip_planes[3] = _mm128_setr_ps(0.0f, 1.0f - pad_h, 1.0f, 0.0f); 146 | clip_planes[4] = _mm128_setr_ps(0.0f, 0.0f, 1.0f, -near_clip); 147 | 148 | ctx->near_clip = near_clip; 149 | } 150 | 151 | void swCullCtxClear(SWcull_ctx *ctx) { (*ctx->clear_buf_proc)(ctx); } 152 | 153 | void swCullCtxSubmitCullSurfs(SWcull_ctx *ctx, SWcull_surf *surfs, const SWuint count) { 154 | for (SWuint i = 0; i < count; i++) { 155 | SWcull_surf *s = &surfs[i]; 156 | 157 | if (s->indices) { 158 | if (s->prim_type == SW_TRIANGLES) { 159 | if (s->index_type == SW_UNSIGNED_INT) { 160 | s->visible = (*ctx->tri_indexed_proc)(ctx, s->attribs, (const SWuint *)s->indices, s->stride, 161 | s->count, s->xform, (s->type == SW_OCCLUDER)); 162 | } else { 163 | assert(0); 164 | } 165 | } 166 | } else { 167 | } 168 | } 169 | } 170 | 171 | SWint swCullCtxTestRect(SWcull_ctx *ctx, const SWfloat p_min[2], const SWfloat p_max[3], const SWfloat w_min) { 172 | return (*ctx->test_rect_proc)(ctx, p_min, p_max, w_min); 173 | } 174 | 175 | SWint _swClipPolygon(const __m128 in_vtx[], const SWint in_vtx_count, const __m128 plane, 
__m128 out_vtx[]) { 176 | __m128 p0 = in_vtx[in_vtx_count - 1]; 177 | __m128 dist0 = _mm128_dp4_ps(plane, p0); 178 | 179 | SWint out_vtx_count = 0; 180 | for (SWint i = 0; i < in_vtx_count; i++) { 181 | const __m128 p1 = in_vtx[i]; 182 | const __m128 dist1 = _mm128_dp4_ps(plane, p1); 183 | const int dist0_neg = _mm128_movemask_ps(dist0); 184 | if (!dist0_neg) { // dist0 >= 0.0f 185 | out_vtx[out_vtx_count++] = p0; 186 | } 187 | 188 | // if dist0 and dist1 have different signs (segment intersects plane) 189 | if (_mm128_movemask_ps(_mm128_xor_ps(dist0, dist1))) { 190 | if (!dist0_neg) { 191 | const __m128 t = _mm128_div_ps(dist0, _mm128_sub_ps(dist0, dist1)); 192 | out_vtx[out_vtx_count++] = _mm128_fmadd_ps(_mm128_sub_ps(p1, p0), t, p0); 193 | } else { 194 | const __m128 t = _mm128_div_ps(dist1, _mm128_sub_ps(dist1, dist0)); 195 | out_vtx[out_vtx_count++] = _mm128_fmadd_ps(_mm128_sub_ps(p0, p1), t, p1); 196 | } 197 | } 198 | 199 | dist0 = dist1; 200 | p0 = p1; 201 | } 202 | 203 | return out_vtx_count; 204 | } 205 | 206 | void swCullCtxDebugDepth(SWcull_ctx *ctx, SWfloat *out_depth) { (*ctx->debug_depth_proc)(ctx, out_depth); } 207 | -------------------------------------------------------------------------------- /SWculling.h: -------------------------------------------------------------------------------- 1 | #ifndef SW_CULLING_H 2 | #define SW_CULLING_H 3 | 4 | #include "SWcore.h" 5 | #include "SWcpu.h" 6 | #include "SWzbuffer.h" 7 | 8 | #define SW_CULL_TILE_WIDTH_SHIFT 5 9 | #define SW_CULL_TILE_SIZE_X (1 << SW_CULL_TILE_WIDTH_SHIFT) 10 | 11 | #define SW_CULL_SUBTILE_X 8 12 | 13 | #define SW_CULL_QUICK_MASK 14 | 15 | typedef enum SWsurf_type { SW_OCCLUDER = 0, SW_OCCLUDEE } SWsurf_type; 16 | 17 | typedef struct SWcull_surf { 18 | SWsurf_type type; 19 | SWenum prim_type, index_type; 20 | const void *attribs; 21 | const void *indices; 22 | SWuint stride, count; 23 | SWfloat bbox_min[3], bbox_max[3]; 24 | const SWfloat *xform; 25 | SWint visible; 26 | } SWcull_surf; 
/************************************************************************/

struct SWcull_ctx;

/*
 * Backend entry points. swCullCtxResize() stores one implementation of each in the
 * context (reference, SSE2, AVX2, AVX512 or NEON).
 */

/* Processes an indexed triangle list; the result is stored in SWcull_surf::visible. */
typedef SWint (*SWCullTrianglesIndexedProcType)(struct SWcull_ctx *ctx,
                                                const void *attribs,
                                                const SWuint *indices, SWuint stride,
                                                SWuint index_count, const SWfloat *xform,
                                                SWint is_occluder);
/* Tests a rectangle; note that p_min is declared with 2 components and p_max with 3. */
typedef SWint (*SWCullRectProcType)(const struct SWcull_ctx *ctx, const SWfloat p_min[2],
                                    const SWfloat p_max[3], const SWfloat w_min);
/* Resets the tile buffer. */
typedef void (*SWCullClearBufferProcType)(struct SWcull_ctx *ctx);
/* Writes a debug view of the depth data to out_depth. */
typedef void (*SWCullDebugDepthProcType)(const struct SWcull_ctx *ctx,
                                         SWfloat *out_depth);

/* Row index into SWcull_ctx::clip_planes; _PlanesCount is the number of planes. */
enum eClipPlane { Left, Right, Top, Bottom, Near, _PlanesCount };

/* State of one culling context. */
typedef struct SWcull_ctx {
    SWcpu_info cpu_info; /* feature flags that drive backend selection */

    SWint w, h;             /* target size as passed to swCullCtxResize() */
    SWfloat half_w, half_h; /* w / 2 and h / 2 */
    SWfloat near_clip;

    SWint tile_size_y, subtile_size_y; /* tile / sub-tile height of the selected backend */
    SWint tile_w, tile_h;              /* tile counts in x and y, rounded up */
    void *ztiles;                      /* tile buffer, allocated with sw_aligned_malloc() */
    SWuint ztiles_mem_size;            /* size of ztiles in bytes */

    /* entry points of the selected backend */
    SWCullTrianglesIndexedProcType tri_indexed_proc;
    SWCullRectProcType test_rect_proc;
    SWCullClearBufferProcType clear_buf_proc;
    SWCullDebugDepthProcType debug_depth_proc;

    /* 16-byte aligned because swCullCtxResize() writes them through __m128/__m128i pointers */
    ALIGNED(SWint size_ivec4[4], 16);       /* (w, w, h, h) */
    ALIGNED(SWfloat half_size_vec4[4], 16); /* (half_w, half_w, half_h, half_h) */

    ALIGNED(SWfloat clip_planes[_PlanesCount][4], 16); /* one plane per eClipPlane entry */
} SWcull_ctx;

/* Lifetime: swCullCtxInit() must be paired with swCullCtxDestroy(). */
void swCullCtxInit(SWcull_ctx *ctx, SWint w, SWint h, SWfloat near_clip);
void swCullCtxDestroy(SWcull_ctx *ctx);

/* Resize reallocates the tile buffer without clearing it; Clear resets it. */
void swCullCtxResize(SWcull_ctx *ctx, SWint w, SWint h, SWfloat near_clip);
void swCullCtxClear(SWcull_ctx *ctx);
/* Processes `count` surfaces and fills in their `visible` fields. */
void swCullCtxSubmitCullSurfs(SWcull_ctx *ctx, SWcull_surf *surfs, SWuint count);

/* Forwards to the backend's rectangle test. */
SWint swCullCtxTestRect(SWcull_ctx *ctx, const SWfloat p_min[2], const SWfloat p_max[3],
                        SWfloat w_min);

/* Forwards to the backend's debug depth dump. */
void swCullCtxDebugDepth(SWcull_ctx *ctx, SWfloat *out_depth);

#endif /* SW_CULLING_H */
--------------------------------------------------------------------------------
/SWculling_AVX2.c:
--------------------------------------------------------------------------------
/*
 * AVX2 instantiation of the shared rasteriser template: the tile height is fixed at
 * 1 << 3 = 8 rows (the value swCullCtxResize() uses for this backend), USE_AVX2 is
 * defined only while the template is included, and the height macros are undefined
 * again afterwards.
 */
#include "SWculling.h"
#include "SWrasterize.h"

/* NOTE(review): header name lost in extraction (angle-bracket contents stripped) - restore from the repository. */
#include

#define SW_CULL_TILE_HEIGHT_SHIFT 3
#define SW_CULL_TILE_SIZE_Y (1 << SW_CULL_TILE_HEIGHT_SHIFT)

#define USE_AVX2
#include "SWculling_rast.inl"
#undef USE_AVX2

#undef SW_CULL_TILE_HEIGHT_SHIFT
#undef SW_CULL_TILE_SIZE_Y
--------------------------------------------------------------------------------
/SWculling_AVX512.c:
--------------------------------------------------------------------------------
/* AVX-512 instantiation of the rasteriser template; tile height 1 << 4 = 16 rows. */
#include "SWculling.h"
#include "SWrasterize.h"

/* NOTE(review): header name lost in extraction - restore from the repository. */
#include

#define SW_CULL_TILE_HEIGHT_SHIFT 4
#define SW_CULL_TILE_SIZE_Y (1 << SW_CULL_TILE_HEIGHT_SHIFT)

#define USE_AVX512
#include "SWculling_rast.inl"
#undef USE_AVX512

#undef SW_CULL_TILE_HEIGHT_SHIFT
#undef SW_CULL_TILE_SIZE_Y
--------------------------------------------------------------------------------
/SWculling_NEON.c:
--------------------------------------------------------------------------------
/* NEON instantiation of the rasteriser template; tile height 1 << 2 = 4 rows. */
#include "SWculling.h"
#include "SWrasterize.h"

#define SW_CULL_TILE_HEIGHT_SHIFT 2
#define SW_CULL_TILE_SIZE_Y (1 << SW_CULL_TILE_HEIGHT_SHIFT)

#define USE_NEON

#include "SWculling_rast.inl"
#undef USE_NEON

#undef SW_CULL_TILE_HEIGHT_SHIFT
#undef SW_CULL_TILE_SIZE_Y
--------------------------------------------------------------------------------
/SWculling_SSE2.c:
--------------------------------------------------------------------------------
/* SSE2 instantiation of the rasteriser template; tile height 1 << 2 = 4 rows. */
#include "SWculling.h"
#include "SWrasterize.h"

/* NOTE(review): header name lost in extraction - restore from the repository. */
#include

#define SW_CULL_TILE_HEIGHT_SHIFT 2
#define SW_CULL_TILE_SIZE_Y (1 << SW_CULL_TILE_HEIGHT_SHIFT)

#define USE_SSE2
#include "SWculling_rast.inl"
#undef USE_SSE2

#undef SW_CULL_TILE_HEIGHT_SHIFT
#undef SW_CULL_TILE_SIZE_Y
--------------------------------------------------------------------------------
/SWculling_rast_scanline.inl:
--------------------------------------------------------------------------------

/* Value _mmXXX_movemask_ps returns when the tested bit is set in every one of the SIMD_WIDTH lanes. */
#define SIMD_ALL_LANES_MASK ((1 << SIMD_WIDTH) - 1)

/*
 * Template body: walks one row of tiles, [tile_ndx + left_offset, tile_ndx + right_offset),
 * for a triangle whose edges are described by `events`.
 * The including file supplies SCANLINE_FUNC_NAME, LEFT_COUNT, RIGHT_COUNT, FP_BITS,
 * the __mXXX / _mmXXX_ aliases and, optionally, IS_OCCLUDER.
 *
 *   left_offset, right_offset  first and one-past-last tile, relative to tile_ndx
 *   left_event, right_event    indices into `events`: left edges are read at
 *                              left_event - i, right edges at right_event + i
 *   tri_zmin, tri_zmax         depth bounds, combined with the interpolated depth below
 *   _z0, zx                    depth at offset 0 and its increment per tile
 *
 * Returns:
 *   with IS_OCCLUDER     always 1 (every visited tile is passed to _swUpdateTileQuick)
 *   without IS_OCCLUDER  1 as soon as a covered lane passes the depth comparison,
 *                        0 if no tile does
 *
 * NOTE(review): the loop is a do-while in disguise - the first tile is processed before
 * the end condition is checked - so callers presumably guarantee
 * left_offset < right_offset; confirm.
 */
static SWint SCANLINE_FUNC_NAME(SWztile *tiles, SWint left_offset, SWint right_offset,
                                SWint left_event, SWint right_event,
                                const __mXXXi events[3], SWint tile_ndx,
                                const __mXXX *tri_zmin, const __mXXX *tri_zmax,
                                const __mXXX *_z0, SWfloat zx) {
    /* x offset of the first visited tile, in pixels */
    SWint event_offset = (left_offset << SW_CULL_TILE_WIDTH_SHIFT);

    /* Edge positions relative to the first tile: drop the FP_BITS fractional bits,
     * rebase and clamp at 0. */
    __mXXXi left[LEFT_COUNT], right[RIGHT_COUNT];
    for (SWint i = 0; i < LEFT_COUNT; i++) {
        left[i] = _mmXXX_max_epi32(
            _mmXXX_sub_epi32(_mmXXX_srai_epi32(events[left_event - i], FP_BITS),
                             _mmXXX_set1_epi32(event_offset)),
            _mmXXX_set1_epi32(0));
    }
    for (SWint i = 0; i < RIGHT_COUNT; i++) {
        right[i] = _mmXXX_max_epi32(
            _mmXXX_sub_epi32(_mmXXX_srai_epi32(events[right_event + i], FP_BITS),
                             _mmXXX_set1_epi32(event_offset)),
            _mmXXX_set1_epi32(0));
    }

    /* depth at the first visited tile */
    __mXXX z0 = _mmXXX_add_ps(*_z0, _mmXXX_set1_ps(zx * left_offset));

    SWint tile_ndx_end = tile_ndx + right_offset;
    tile_ndx += left_offset;
    while (1) {
#ifdef SW_CULL_QUICK_MASK
        __mXXX zmin0_buf = tiles[tile_ndx].zmin[0].vec;
#else
        /* NOTE(review): zmin0_buf is read below without being initialised in this
         * configuration; the disabled #error marks it as unimplemented. */
        __mXXX zmin0_buf;
//#error "Not implemented!"
#endif

        /* Skip the tile only when tri_zmax - zmin is negative in every lane (assuming
         * the usual movemask semantics of collecting the sign bits). */
        __mXXX dist0 = _mmXXX_sub_ps(*tri_zmax, zmin0_buf);
        if (_mmXXX_movemask_ps(dist0) != SIMD_ALL_LANES_MASK) {
            /* coverage = AND of all left-edge masks, minus every right-edge mask */
            __mXXXi coverage_mask = _mmXXX_sllv_ones(left[0]);
            for (SWint i = 1; i < LEFT_COUNT; i++) {
                coverage_mask =
                    _mmXXX_and_siXXX(coverage_mask, _mmXXX_sllv_ones(left[i]));
            }
            for (SWint i = 0; i < RIGHT_COUNT; i++) {
                coverage_mask =
                    _mmXXX_andnot_siXXX(_mmXXX_sllv_ones(right[i]), coverage_mask);
            }

            // rearrange to 8x4 tiles
            coverage_mask = _mmXXX_transpose_epi8(coverage_mask);

#ifdef IS_OCCLUDER
            /* occluder case: merge coverage and depth into the tile */
            __mXXX z_subtile_min = _mmXXX_max_ps(z0, *tri_zmin);
#ifdef SW_CULL_QUICK_MASK
            NAME(_swUpdateTileQuick)
            (&tiles[tile_ndx], &coverage_mask, &z_subtile_min);
#else

#endif
#else // occludee case
            __mXXX z_subtile_max = _mmXXX_min_ps(z0, *tri_zmax);
            __mXXXi z_pass =
                _mmXXX_castps_siXXX(_mmXXX_cmpge_ps(z_subtile_max, zmin0_buf));

            /* lanes without any covered pixel must not count as visible */
            __mXXXi dead_lane = _mmXXX_cmpeq_epi32(coverage_mask, _mmXXX_setzero_siXXX());
            z_pass = _mmXXX_andnot_siXXX(dead_lane, z_pass);

            /* any surviving lane: report visible and stop early */
            if (!_mmXXX_testz_siXXX(z_pass, z_pass)) {
                return 1;
            }
#endif
        }

        if (++tile_ndx >= tile_ndx_end) {
            break;
        }
        /* advance depth and edge positions by one tile */
        z0 = _mmXXX_add_ps(z0, _mmXXX_set1_ps(zx));
        for (SWint i = 0; i < LEFT_COUNT; i++) {
            // saturated sub, does max(x, 0) automatically
            left[i] = _mmXXX_subs_epu16(left[i], _mmXXX_set1_epi32(SW_CULL_TILE_SIZE_X));
        }
        for (SWint i = 0; i < RIGHT_COUNT; i++) {
            right[i] =
                _mmXXX_subs_epu16(right[i], _mmXXX_set1_epi32(SW_CULL_TILE_SIZE_X));
        }
    }

#ifdef IS_OCCLUDER
    return 1;
#else
    return 0;
#endif
}
--------------------------------------------------------------------------------
/SWculling_rast_tri.inl:
--------------------------------------------------------------------------------

SWint TRI_FUNC_NAME(SWcull_ctx *ctx, SWint
tile_row_ndx, SWint tile_mid_row_ndx, 3 | SWint tile_end_row_ndx, SWint bb_width, SWint tri_ndx, 4 | SWint slope_tile_delta[], SWint event_start[], SWint slope[], 5 | const __mXXX *tri_zmin, const __mXXX *tri_zmax, __mXXX *z0, 6 | SWfloat zx, SWfloat zy, SWint flat_bottom) { 7 | #define LEFT_EDGE_BIAS 0 8 | #define RIGHT_EDGE_BIAS 0 9 | #define UPDATE_TILE_EVENTS_Y(i) \ 10 | tri_event[i] = _mmXXX_add_epi32(tri_event[i], tri_slope_tile_delta[i]) 11 | 12 | const __mXXXi tri_slope_tile_delta[3] = { 13 | _mmXXX_set1_epi32(slope_tile_delta[0 * SIMD_WIDTH + tri_ndx]), 14 | _mmXXX_set1_epi32(slope_tile_delta[1 * SIMD_WIDTH + tri_ndx]), 15 | _mmXXX_set1_epi32(slope_tile_delta[2 * SIMD_WIDTH + tri_ndx])}; 16 | 17 | __mXXXi tri_event[3] = { 18 | _mmXXX_add_epi32( 19 | _mmXXX_set1_epi32(event_start[0 * SIMD_WIDTH + tri_ndx]), 20 | _mmXXX_mullo_epi32(SIMD_LANE_NDX, 21 | _mmXXX_set1_epi32(slope[0 * SIMD_WIDTH + tri_ndx]))), 22 | _mmXXX_add_epi32( 23 | _mmXXX_set1_epi32(event_start[1 * SIMD_WIDTH + tri_ndx]), 24 | _mmXXX_mullo_epi32(SIMD_LANE_NDX, 25 | _mmXXX_set1_epi32(slope[1 * SIMD_WIDTH + tri_ndx]))), 26 | _mmXXX_add_epi32( 27 | _mmXXX_set1_epi32(event_start[2 * SIMD_WIDTH + tri_ndx]), 28 | _mmXXX_mullo_epi32(SIMD_LANE_NDX, 29 | _mmXXX_set1_epi32(slope[2 * SIMD_WIDTH + tri_ndx])))}; 30 | 31 | #ifdef TIGHT_TRANVERSAL 32 | SWint start_delta, end_delta, top_delta, start_event, end_event, top_event; 33 | start_delta = slope_tile_delta[2 * SIMD_WIDTH + tri_ndx] + LEFT_EDGE_BIAS; 34 | end_delta = slope_tile_delta[0 * SIMD_WIDTH + tri_ndx] + RIGHT_EDGE_BIAS; 35 | top_delta = slope_tile_delta[1 * SIMD_WIDTH + tri_ndx] + 36 | (MID_VTX_RIGHT ? 
RIGHT_EDGE_BIAS : LEFT_EDGE_BIAS); 37 | 38 | start_event = event_start[2 * SIMD_WIDTH + tri_ndx] + sw_min(0, start_delta); 39 | end_event = event_start[0 * SIMD_WIDTH + tri_ndx] + sw_max(0, end_delta) + 40 | (SW_CULL_TILE_SIZE_X << FP_BITS); 41 | #if MID_VTX_RIGHT 42 | top_event = event_start[1 * SIMD_WIDTH + tri_ndx] + sw_max(0, top_delta) + 43 | (SW_CULL_TILE_SIZE_X << FP_BITS); 44 | #else // MID_VTX_RIGHT 45 | top_event = event_start[1 * SIMD_WIDTH + tri_ndx] + sw_min(0, top_delta); 46 | #endif // MID_VTX_RIGHT 47 | #endif // TIGHT_TRANVERSAL 48 | 49 | if (!flat_bottom) { 50 | SWint tile_stop_ndx = sw_min(tile_end_row_ndx, tile_mid_row_ndx); 51 | // bottom half of triangle 52 | while (tile_row_ndx < tile_stop_ndx) { 53 | SWint start = 0, end = bb_width; 54 | #ifdef TIGHT_TRANVERSAL 55 | start = 56 | sw_max(0, sw_min(bb_width - 1, 57 | start_event >> (SW_CULL_TILE_WIDTH_SHIFT + FP_BITS))); 58 | end = sw_min(bb_width, (end_event >> (SW_CULL_TILE_WIDTH_SHIFT + FP_BITS))); 59 | start_event += start_delta; 60 | end_event += end_delta; 61 | #endif // TIGHT_TRANVERSAL 62 | 63 | #ifdef IS_OCCLUDER 64 | const SWint res = NAME(_swProcessScanlineOccluder_L1R1)( 65 | #else 66 | const SWint res = NAME(_swProcessScanline_L1R1)( 67 | #endif 68 | (SWztile *)ctx->ztiles, start, end, 2 /* left_event */, 69 | 0 /* right_event*/, tri_event, tile_row_ndx, tri_zmin, tri_zmax, z0, zx); 70 | #ifndef IS_OCCLUDER 71 | if (res) { 72 | return 1; 73 | } 74 | #else 75 | (void)res; 76 | #endif 77 | 78 | tile_row_ndx += ctx->tile_w; 79 | (*z0) = _mmXXX_add_ps(*z0, _mmXXX_set1_ps(zy)); 80 | UPDATE_TILE_EVENTS_Y(0); 81 | UPDATE_TILE_EVENTS_Y(2); 82 | } 83 | 84 | // middle part (touched by all three edges) 85 | if (tile_row_ndx < tile_end_row_ndx) { 86 | SWint start = 0, end = bb_width; 87 | #ifdef TIGHT_TRANVERSAL 88 | start = 89 | sw_max(0, sw_min(bb_width - 1, 90 | start_event >> (SW_CULL_TILE_WIDTH_SHIFT + FP_BITS))); 91 | end = sw_min(bb_width, (end_event >> (SW_CULL_TILE_WIDTH_SHIFT + 
FP_BITS))); 92 | /* Edge hand-over for the middle row: select with the compile-time MID_VTX_RIGHT macro, exactly as the flat-bottom branch does; no mid_vtx_right is declared in this function. */ 93 | end_event = MID_VTX_RIGHT ? top_event : end_event; 94 | end_delta = MID_VTX_RIGHT ? top_delta : end_delta; 95 | start_event = MID_VTX_RIGHT ? start_event : top_event; 96 | start_delta = MID_VTX_RIGHT ? start_delta : top_delta; 97 | start_event += start_delta; 98 | end_event += end_delta; 99 | #endif // TIGHT_TRANVERSAL 100 | 101 | #if MID_VTX_RIGHT 102 | #ifdef IS_OCCLUDER 103 | const SWint res = NAME(_swProcessScanlineOccluder_L1R2)( 104 | #else 105 | const SWint res = NAME(_swProcessScanline_L1R2)( 106 | #endif 107 | (SWztile *)ctx->ztiles, start, end, 2 /* left_event */, 108 | 0 /* right_event*/, tri_event, tile_row_ndx, tri_zmin, tri_zmax, z0, zx); 109 | #else // MID_VTX_RIGHT 110 | #ifdef IS_OCCLUDER 111 | const SWint res = NAME(_swProcessScanlineOccluder_L2R1)( 112 | #else 113 | const SWint res = NAME(_swProcessScanline_L2R1)( 114 | #endif 115 | (SWztile *)ctx->ztiles, start, end, 2 /* left_event */, 116 | 0 /* right_event*/, tri_event, tile_row_ndx, tri_zmin, tri_zmax, z0, zx); 117 | #endif // MID_VTX_RIGHT 118 | #ifndef IS_OCCLUDER 119 | if (res) { 120 | return 1; 121 | } 122 | #else 123 | (void)res; 124 | #endif 125 | 126 | tile_row_ndx += ctx->tile_w; 127 | } 128 | 129 | // top half of triangle 130 | if (tile_row_ndx < tile_end_row_ndx) { 131 | // move to the next scanline 132 | (*z0) = _mmXXX_add_ps(*z0, _mmXXX_set1_ps(zy)); 133 | SWint i0 = MID_VTX_RIGHT + 0; 134 | SWint i1 = MID_VTX_RIGHT + 1; 135 | UPDATE_TILE_EVENTS_Y(i0); 136 | UPDATE_TILE_EVENTS_Y(i1); 137 | 138 | while (1) { 139 | SWint start = 0, end = bb_width; 140 | #ifdef TIGHT_TRANVERSAL 141 | start = sw_max( 142 | 0, sw_min(bb_width - 1, 143 | start_event >> (SW_CULL_TILE_WIDTH_SHIFT + FP_BITS))); 144 | end = 145 | sw_min(bb_width, (end_event >> (SW_CULL_TILE_WIDTH_SHIFT + FP_BITS))); 146 | start_event += start_delta; 147 | end_event += end_delta; 148 | #endif // TIGHT_TRANVERSAL 149 | 150 | #ifdef IS_OCCLUDER 151 | const SWint res =
NAME(_swProcessScanlineOccluder_L1R1)( 152 | #else 153 | const SWint res = NAME(_swProcessScanline_L1R1)( 154 | #endif 155 | (SWztile *)ctx->ztiles, start, end, 156 | MID_VTX_RIGHT + 1 /* left_event */, 157 | MID_VTX_RIGHT + 0 /* right_event*/, tri_event, tile_row_ndx, tri_zmin, 158 | tri_zmax, z0, zx); 159 | 160 | #ifndef IS_OCCLUDER 161 | if (res) { 162 | return 1; 163 | } 164 | #else 165 | (void)res; 166 | #endif 167 | 168 | tile_row_ndx += ctx->tile_w; 169 | if (tile_row_ndx >= tile_end_row_ndx) { 170 | break; 171 | } 172 | (*z0) = _mmXXX_add_ps(*z0, _mmXXX_set1_ps(zy)); 173 | UPDATE_TILE_EVENTS_Y(i0); 174 | UPDATE_TILE_EVENTS_Y(i1); 175 | } 176 | } 177 | } else { 178 | #ifdef TIGHT_TRANVERSAL 179 | end_event = MID_VTX_RIGHT ? top_event : end_event; 180 | end_delta = MID_VTX_RIGHT ? top_delta : end_delta; 181 | start_event = MID_VTX_RIGHT ? start_event : top_event; 182 | start_delta = MID_VTX_RIGHT ? start_delta : top_delta; 183 | #endif // TIGHT_TRANVERSAL 184 | 185 | // top half of triangle 186 | if (tile_row_ndx < tile_end_row_ndx) { 187 | SWint i0 = MID_VTX_RIGHT + 0; 188 | SWint i1 = MID_VTX_RIGHT + 1; 189 | 190 | while (1) { 191 | SWint start = 0, end = bb_width; 192 | #ifdef TIGHT_TRANVERSAL 193 | start = sw_max( 194 | 0, sw_min(bb_width - 1, 195 | start_event >> (SW_CULL_TILE_WIDTH_SHIFT + FP_BITS))); 196 | end = 197 | sw_min(bb_width, (end_event >> (SW_CULL_TILE_WIDTH_SHIFT + FP_BITS))); 198 | start_event += start_delta; 199 | end_event += end_delta; 200 | #endif // TIGHT_TRANVERSAL 201 | 202 | #ifdef IS_OCCLUDER 203 | const SWint res = NAME(_swProcessScanlineOccluder_L1R1)( 204 | #else 205 | const SWint res = NAME(_swProcessScanline_L1R1)( 206 | #endif 207 | (SWztile *)ctx->ztiles, start, end, 208 | MID_VTX_RIGHT + 1 /* left_event */, 209 | MID_VTX_RIGHT + 0 /* right_event*/, tri_event, tile_row_ndx, tri_zmin, 210 | tri_zmax, z0, zx); 211 | #ifndef IS_OCCLUDER 212 | if (res) { 213 | return 1; 214 | } 215 | #else 216 | (void)res; 217 | #endif 218 | 219 
| tile_row_ndx += ctx->tile_w; 220 | if (tile_row_ndx >= tile_end_row_ndx) { 221 | break; 222 | } 223 | (*z0) = _mmXXX_add_ps(*z0, _mmXXX_set1_ps(zy)); 224 | UPDATE_TILE_EVENTS_Y(i0); 225 | UPDATE_TILE_EVENTS_Y(i1); 226 | } 227 | } 228 | } 229 | 230 | #ifdef IS_OCCLUDER 231 | return 1; 232 | #else 233 | return 0; 234 | #endif 235 | 236 | #undef LEFT_EDGE_BIAS 237 | #undef RIGHT_EDGE_BIAS 238 | #undef UPDATE_TILE_EVENTS_Y 239 | } 240 | -------------------------------------------------------------------------------- /SWdraw.c: -------------------------------------------------------------------------------- 1 | #include "SWdraw.h" 2 | 3 | #include 4 | 5 | #include "SWcontext.h" 6 | 7 | extern SWcontext *sw_cur_context; 8 | 9 | void swVertexAttribPointer(const SWuint index, const SWint size, const SWuint stride, 10 | const void *pointer) { 11 | SWprogram *p = &sw_cur_context->programs[sw_cur_context->cur_program]; 12 | swProgSetVtxAttribPointer(p, sw_cur_context, index, size, stride, pointer); 13 | } 14 | 15 | void swRegisterUniform(const SWint index, const SWenum type) { 16 | swCtxRegisterUniform(sw_cur_context, index, type); 17 | } 18 | 19 | void swRegisterUniformv(const SWint index, const SWenum type, const SWint num) { 20 | swCtxRegisterUniformv(sw_cur_context, index, type, num); 21 | } 22 | 23 | void swSetUniform(const SWint index, const SWenum type, const void *data) { 24 | swCtxSetUniform(sw_cur_context, index, type, data); 25 | } 26 | 27 | void swSetUniformv(const SWint index, const SWenum type, const SWint num, 28 | const void *data) { 29 | swCtxSetUniformv(sw_cur_context, index, type, num, data); 30 | } 31 | 32 | void swDrawArrays(const SWenum prim_type, const SWuint first, const SWuint count) { 33 | SWprogram *p = &sw_cur_context->programs[sw_cur_context->cur_program]; 34 | if (prim_type == SW_LINES) { 35 | swProgDrawLinesArray(p, sw_cur_context, first, count); 36 | } else if (prim_type == SW_LINE_STRIP) { 37 | swProgDrawLineStripArray(p, sw_cur_context, 
first, count); 38 | } else if (prim_type == SW_CURVES) { 39 | swProgDrawCurvesArray(p, sw_cur_context, first, count); 40 | } else if (prim_type == SW_CURVE_STRIP) { 41 | swProgDrawCurveStripArray(p, sw_cur_context, first, count); 42 | } else if (prim_type == SW_TRIANGLES) { 43 | swProgDrawTrianglesArray(p, sw_cur_context, first, count); 44 | } else if (prim_type == SW_TRIANGLE_STRIP) { 45 | swProgDrawTriangleStripArray(p, sw_cur_context, first, count); 46 | } else { 47 | assert(0); 48 | } 49 | } 50 | 51 | void swDrawElements(const SWenum prim_type, const SWuint count, const SWenum type, 52 | const void *indices) { 53 | SWprogram *p = &sw_cur_context->programs[sw_cur_context->cur_program]; 54 | if (prim_type == SW_LINES) { 55 | swProgDrawLinesIndexed(p, sw_cur_context, count, type, indices); 56 | } else if (prim_type == SW_LINE_STRIP) { 57 | swProgDrawLineStripIndexed(p, sw_cur_context, count, type, indices); 58 | } else if (prim_type == SW_CURVES) { 59 | swProgDrawCurvesIndexed(p, sw_cur_context, count, type, indices); 60 | } else if (prim_type == SW_CURVE_STRIP) { 61 | swProgDrawCurveStripIndexed(p, sw_cur_context, count, type, indices); 62 | } else if (prim_type == SW_TRIANGLES) { 63 | swProgDrawTrianglesIndexed(p, sw_cur_context, count, type, indices); 64 | } else if (prim_type == SW_TRIANGLE_STRIP) { 65 | swProgDrawTriangleStripIndexed(p, sw_cur_context, count, type, indices); 66 | } else { 67 | assert(0); 68 | } 69 | } -------------------------------------------------------------------------------- /SWdraw.h: -------------------------------------------------------------------------------- 1 | #ifndef SW_DRAW_H 2 | #define SW_DRAW_H 3 | 4 | #include "SWcore.h" 5 | 6 | void swVertexAttribPointer(SWuint index, SWint size, SWuint stride, const void *pointer); 7 | 8 | void swRegisterUniform(SWint index, SWenum type); 9 | void swRegisterUniformv(SWint index, SWenum type, SWint num); 10 | void swSetUniform(SWint index, SWenum type, const void *data); 11 | void 
swSetUniformv(SWint index, SWenum type, SWint num, const void *data); 12 | 13 | void swDrawArrays(SWenum prim_type, SWuint first, SWuint count); 14 | void swDrawElements(SWenum prim_type, SWuint count, SWenum type, const void *indices); 15 | 16 | #endif /* SW_DRAW_H */ -------------------------------------------------------------------------------- /SWframebuffer.c: -------------------------------------------------------------------------------- 1 | #include "SWframebuffer.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "SWtexture.h" 8 | #include "SWzbuffer.h" 9 | 10 | void swFbufInit(SWframebuffer *f, const SWenum type, const SWint w, const SWint h, 11 | const SWint with_depth) { 12 | f->type = type; 13 | f->w = w; 14 | f->h = h; 15 | f->zbuf = NULL; 16 | 17 | SWuint num_bytes = 0; 18 | if (type == SW_BGRA8888) { 19 | num_bytes = (SWuint)w * h * 4; 20 | } else if (type == SW_FRGBA) { 21 | num_bytes = (SWuint)w * h * 4 * sizeof(SWfloat); 22 | } 23 | f->pixels = calloc(num_bytes, 1); 24 | if (with_depth) { 25 | f->zbuf = (SWzbuffer *)malloc(sizeof(SWzbuffer)); 26 | swZbufInit(f->zbuf, w, h, 1.0f); 27 | swFbufClearDepth(f, (SWfloat)1); 28 | } 29 | } 30 | 31 | void swFbufDestroy(SWframebuffer *f) { 32 | free(f->pixels); 33 | if (f->zbuf) { 34 | swZbufDestroy(f->zbuf); 35 | free(f->zbuf); 36 | } 37 | memset(f, 0, sizeof(SWframebuffer)); 38 | } 39 | 40 | void swFbufClearColor_RGBA(SWframebuffer *f, SWubyte *rgba) { 41 | SWint y, span_size = 0; 42 | if (f->type == SW_BGRA8888) { 43 | SWint x; 44 | for (x = 0; x < f->w; x++) { 45 | swPx_BGRA8888_SetColor_RGBA8888(f->w, f->h, f->pixels, x, 0, rgba); 46 | } 47 | span_size = f->w * 4; 48 | } 49 | 50 | for (y = 1; y < f->h; y++) { 51 | memcpy(((char *)f->pixels) + y * span_size, f->pixels, (size_t)span_size); 52 | } 53 | } 54 | 55 | void swFbufClearColorFloat(SWframebuffer *f, const SWfloat r, const SWfloat g, 56 | const SWfloat b, const SWfloat a) { 57 | if (f->type == SW_BGRA8888) { 58 | SWubyte rgba[4]; 59 | 
_swPx_RGBA8888_SetColor_FRGBA_(rgba, r, g, b, a); 60 | swFbufClearColor_RGBA(f, rgba); 61 | } else if (f->type == SW_FRGBA) { 62 | SWfloat rgba[4] = {r, g, b, a}; 63 | SWint x, y, span_size = f->w * 4 * sizeof(SWfloat); 64 | for (x = 0; x < f->w; x++) { 65 | swPx_FRGBA_SetColor_FRGBA(f->w, f->h, f->pixels, x, 0, rgba); 66 | } 67 | 68 | for (y = 1; y < f->h; y++) { 69 | memcpy(((char *)f->pixels) + y * span_size, f->pixels, (size_t)span_size); 70 | } 71 | } 72 | } 73 | 74 | void swFbufBlitPixels(SWframebuffer *f, const SWint x, const SWint y, SWint pitch, 75 | const SWenum type, const SWenum mode, const SWint w, const SWint h, 76 | const void *pixels, const SWfloat scale) { 77 | const SWint beg_x = sw_max(x, 0), beg_y = sw_max(y, 0), 78 | end_x = sw_min(f->w, (SWint)(x + scale * w)), 79 | end_y = sw_min(f->h, (SWint)(y + scale * h)); 80 | 81 | if (!pitch) { 82 | pitch = w; 83 | } 84 | 85 | const SWfloat u_step = (SWfloat)1.0 / (w * scale), 86 | v_step = (SWfloat)1.0 / (h * scale); 87 | 88 | SWint i, j; 89 | 90 | if (type == SW_UNSIGNED_BYTE) { 91 | SWfloat v = 0; 92 | 93 | #define LOOP(__fun__) \ 94 | for (j = beg_y; j < end_y; j++) { \ 95 | SWfloat u = 0; \ 96 | for (i = beg_x; i < end_x; i++) { \ 97 | SWubyte *p = (SWubyte *)f->pixels + (j * f->w + i) * 4; \ 98 | __fun__(w, h, pixels, u, v, p); \ 99 | u += u_step; \ 100 | } \ 101 | v += v_step; \ 102 | } 103 | 104 | if (mode == SW_RGB) { 105 | if (f->type == SW_BGRA8888) { 106 | LOOP(swPx_RGB888_GetColor_BGRA8888_UV_norepeat_unsafe) 107 | } 108 | } else if (mode == SW_RGBA) { 109 | if (f->type == SW_BGRA8888) { 110 | LOOP(swPx_RGBA8888_GetColor_BGRA8888_UV_norepeat_unsafe) 111 | } 112 | } 113 | 114 | #undef LOOP 115 | } else if (type == SW_FLOAT) { 116 | assert(scale == 1.0f); 117 | const SWfloat *fp = (SWfloat *)pixels; 118 | if (mode == SW_FRGBA) { 119 | if (f->type == SW_BGRA8888) { 120 | for (j = beg_y; j < end_y; j++) { 121 | for (i = beg_x; i < end_x; i++) { 122 | swPx_BGRA8888_SetColor_FRGBA(f->w, f->h, 
(f->pixels), i, j, 123 | /* index the source relative to the unclipped origin (x, y): when the blit is clipped at the left/top edge (x < 0 or y < 0) the hidden columns/rows must be skipped, not remapped */ &fp[((beg_y - y) * pitch + (i - x)) * 4]); 124 | } 125 | fp += pitch * 4; 126 | } 127 | } 128 | } 129 | } else if (type == SW_COMPRESSED) { 130 | assert(0); 131 | } 132 | } 133 | 134 | void swFbufBlitTexture(SWframebuffer *f, const SWint x, const SWint y, const SWtexture *t, 135 | const SWfloat scale) { 136 | swFbufBlitPixels(f, x, y, 0, t->type, t->mode, t->w, t->h, t->pixels, scale); 137 | } 138 | -------------------------------------------------------------------------------- /SWframebuffer.h: -------------------------------------------------------------------------------- 1 | #ifndef SW_FRAMEBUFFER_H 2 | #define SW_FRAMEBUFFER_H 3 | 4 | #include "SWcore.h" 5 | #include "SWpixels.h" 6 | #include "SWzbuffer.h" 7 | 8 | typedef struct SWframebuffer { 9 | SWenum type; 10 | SWint w, h; 11 | void *pixels; 12 | SWzbuffer *zbuf; 13 | } SWframebuffer; 14 | 15 | struct SWtexture; 16 | 17 | void swFbufInit(SWframebuffer *f, SWenum type, SWint w, SWint h, SWint with_depth); 18 | void swFbufDestroy(SWframebuffer *f); 19 | 20 | void swFbufClearColor_RGBA(SWframebuffer *f, SWubyte *rgba); 21 | void swFbufClearColorFloat(SWframebuffer *f, SWfloat r, SWfloat g, SWfloat b, SWfloat a); 22 | 23 | #define swFbufSetPixel_FRGBA(f, x, y, col) \ 24 | if ((f)->type == SW_BGRA8888) { \ 25 | swPx_BGRA8888_SetColor_FRGBA((f)->w, (f)->h, (f)->pixels, (x), (y), (col)); \ 26 | } else if ((f)->type == SW_FRGBA) { \ 27 | swPx_FRGBA_SetColor_FRGBA((f)->w, (f)->h, (f)->pixels, (x), (y), (col)); \ 28 | } 29 | 30 | #define swFbufSetPixel_BGRA8888(f, x, y, col) \ 31 | if ((f)->type == SW_BGRA8888) { \ 32 | swPx_BGRA8888_SetColor_BGRA8888((f)->w, (f)->h, (f)->pixels, (x), (y), (col)); \ 33 | } 34 | 35 | #define swFbufGetPixel_FRGBA(f, x, y, col) \ 36 | if ((f)->type == SW_BGRA8888) { \ 37 | swPx_BGRA8888_GetColor_FRGBA((f)->w, (f)->h, (f)->pixels, (x), (y), (col)); \ 38 | } else { (col)[0] = (col)[1] = (col)[2] = (col)[3] = 0; } 39 | 40 | #define swFbufTestDepth(f, x, y, z) \ 41 |
swZbufTestDepth((f)->zbuf, x, y, z) 42 | 43 | #define swFbufSetDepth(f, x, y, z) \ 44 | swZbufSetDepth((f)->zbuf, x, y, z) 45 | 46 | 47 | #define swFbufSetTileRange(f, x, y, zmin, zmax) \ 48 | swZbufSetTileRange((f)->zbuf, x, y, zmin, zmax) 49 | 50 | #define swFbufUpdateTileRange(f, x, y, zmin, zmax) \ 51 | swZbufUpdateTileRange((f)->zbuf, x, y, zmin, zmax) 52 | 53 | #define swFbufTestTileRange(f, x, y, zmin, zmax) \ 54 | swZbufTestTileRange((f)->zbuf, x, y, zmin, zmax) 55 | 56 | 57 | #define swFbufClearDepth(f, z) \ 58 | swZbufClearDepth((f)->zbuf, (z)) 59 | 60 | #define swFbufBGRA8888_SetPixel_FRGBA(f, x, y, col) \ 61 | swPx_BGRA8888_SetColor_FRGBA((f)->w, (f)->h, (f)->pixels, (x), (y), (col)) 62 | 63 | void swFbufBlitPixels(SWframebuffer *f, SWint x, SWint y, SWint pitch, SWenum type, SWenum mode, SWint w, SWint h, const void *pixels, SWfloat scale); 64 | void swFbufBlitTexture(SWframebuffer *f, SWint x, SWint y, const struct SWtexture *t, SWfloat scale); 65 | 66 | /*static sw_inline SWint swFbufTestDepth(SWframebuffer *f, SWint x, SWint y, SWfloat z) { 67 | return !(z > f->depth[y * f->w + x]); 68 | }*/ 69 | 70 | #endif /* SW_FRAMEBUFFER_H */ -------------------------------------------------------------------------------- /SWintrin.inl: -------------------------------------------------------------------------------- 1 | #if defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64) 2 | #include 3 | #include 4 | 5 | #ifdef _MSC_VER 6 | #define force_inline __forceinline 7 | #else 8 | #define force_inline __attribute__((always_inline)) inline 9 | #endif 10 | #else 11 | #include 12 | #include 13 | #include 14 | #endif 15 | 16 | #if defined(USE_SSE2) 17 | #define POSTFIX SSE2 18 | #define SIMD_WIDTH 4 19 | 20 | #define SIMD_LANE_NDX _mm_setr_epi32(0, 1, 2, 3) 21 | 22 | typedef __m128 __mXXX; 23 | typedef __m128i __mXXXi; 24 | 25 | #define _mmXXX_castsiXXX_ps _mm_castsi128_ps 26 | #define _mmXXX_castps_siXXX _mm_castps_si128 27 | 28 | #define _mmXXX_setzero_ps 
_mm_setzero_ps 29 | #define _mmXXX_setzero_siXXX _mm_setzero_si128 30 | #define _mmXXX_set1_ps _mm_set1_ps 31 | #define _mmXXX_setr_ps _mm_setr_ps 32 | #define _mmXXX_set1_epi32 _mm_set1_epi32 33 | #define _mmXXX_setr_epi32 _mm_setr_epi32 34 | 35 | #define _mmXXX_add_ps _mm_add_ps 36 | #define _mmXXX_add_epi32 _mm_add_epi32 37 | #define _mmXXX_sub_ps _mm_sub_ps 38 | #define _mmXXX_sub_epi32 _mm_sub_epi32 39 | #define _mmXXX_mul_ps _mm_mul_ps 40 | #define _mmXXX_div_ps _mm_div_ps 41 | 42 | #define _mmXXX_neg_ps(a) _mm_xor_ps((a), _mm_set1_ps(-0.0f)) 43 | 44 | #define _mmXXX_mullo_epi32 _mm_mullo_epi32 45 | #define _mmXXX_subs_epu16 _mm_subs_epu16 46 | 47 | #define _mmXXX_min_ps _mm_min_ps 48 | #define _mmXXX_min_epi32 _mm_min_epi32 49 | #define _mmXXX_max_ps _mm_max_ps 50 | #define _mmXXX_max_epi32 _mm_max_epi32 51 | 52 | #define _mmXXX_floor_ps(v) _mm_round_ps((v), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) 53 | #define _mmXXX_ceil_ps(v) _mm_round_ps((v), _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) 54 | 55 | #define _mmXXX_cvttps_epi32 _mm_cvttps_epi32 56 | #define _mmXXX_cvtepi32_ps _mm_cvtepi32_ps 57 | 58 | #define _mmXXX_cmpeq_ps _mm_cmpeq_ps 59 | #define _mmXXX_cmpgt_ps _mm_cmpgt_ps 60 | #define _mmXXX_cmpge_ps _mm_cmpge_ps 61 | #define _mmXXX_cmpeq_epi32 _mm_cmpeq_epi32 62 | #define _mmXXX_cmpgt_epi32 _mm_cmpgt_epi32 63 | 64 | #define _mmXXX_not_ps(v) _mm_xor_ps((v), _mm_castsi128_ps(_mm_set1_epi32(~0))) 65 | #define _mmXXX_not_siXXX(a) _mm_xor_si128((a), _mm_set1_epi32(~0)) 66 | #define _mmXXX_and_ps _mm_and_ps 67 | #define _mmXXX_and_siXXX _mm_and_si128 68 | #define _mmXXX_andnot_ps _mm_andnot_ps 69 | #define _mmXXX_andnot_siXXX _mm_andnot_si128 70 | #define _mmXXX_or_ps _mm_or_ps 71 | #define _mmXXX_or_siXXX _mm_or_si128 72 | 73 | #define _mmXXX_movemask_ps _mm_movemask_ps 74 | 75 | #define _mmXXX_fmadd_ps(a, b, c) _mm_add_ps(_mm_mul_ps(a, b), c) 76 | #define _mmXXX_fmsub_ps(a, b, c) _mm_sub_ps(_mm_mul_ps(a, b), c) 77 | 78 | #define _mmXXX_slli_epi32 
_mm_slli_epi32 79 | #define _mmXXX_srli_epi32 _mm_srli_epi32 80 | #define _mmXXX_srai_epi32 _mm_srai_epi32 81 | 82 | #ifdef USE_SSE41 83 | #define _mmXXX_blendv_ps _mm_blendv_ps 84 | #else // USE_SSE41 85 | static __m128 _mmXXX_blendv_ps(const __m128 a, const __m128 b, const __m128 c) { 86 | __m128 cond = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(c), 31)); 87 | return _mm_or_ps(_mm_andnot_ps(cond, a), _mm_and_ps(cond, b)); 88 | } 89 | #endif // USE_SSE41 90 | 91 | #ifdef USE_SSE42 92 | #define _mmXXX_sllv_ones(count) _mm_sllv_epi32(_mm_set1_epi32(0xffffffff), count) 93 | #else // USE_SSE42 94 | static inline __m128i _mmXXX_sllv_ones(const __m128i ishift) { 95 | union { 96 | __m128i shift_128; 97 | uint32_t shift_32[4]; 98 | } shift; 99 | 100 | shift.shift_128 = _mm_min_epi32(ishift, _mm_set1_epi32(32)); 101 | 102 | // Uses scalar approach to perform _mm_sllv_epi32(~0, shift) 103 | static const unsigned int maskLUT[33] = { 104 | ~0U << 0, ~0U << 1, ~0U << 2, ~0U << 3, ~0U << 4, ~0U << 5, ~0U << 6, 105 | ~0U << 7, ~0U << 8, ~0U << 9, ~0U << 10, ~0U << 11, ~0U << 12, ~0U << 13, 106 | ~0U << 14, ~0U << 15, ~0U << 16, ~0U << 17, ~0U << 18, ~0U << 19, ~0U << 20, 107 | ~0U << 21, ~0U << 22, ~0U << 23, ~0U << 24, ~0U << 25, ~0U << 26, ~0U << 27, 108 | ~0U << 28, ~0U << 29, ~0U << 30, ~0U << 31, 0U}; 109 | 110 | __m128i retMask = 111 | _mm_setr_epi32(maskLUT[shift.shift_32[0]], maskLUT[shift.shift_32[1]], 112 | maskLUT[shift.shift_32[2]], maskLUT[shift.shift_32[3]]); 113 | return retMask; 114 | } 115 | #endif // USE_SSE42 116 | 117 | #ifdef USE_SSE41 118 | static inline __m128i _mmXXX_transpose_epi8(const __m128i a) { 119 | const __m128i shuff = _mm_setr_epi8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 120 | 0xA, 0xE, 0x3, 0x7, 0xB, 0xF); 121 | return _mm_shuffle_epi8(a, shuff); 122 | } 123 | #else // USE_SSE41 124 | static inline __m128i _mmXXX_transpose_epi8(__m128i v) { 125 | // Perform transpose through two 16->8 bit pack and byte shifts 126 | const __m128i 
mask = 127 | _mm_setr_epi8(~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0); 128 | v = _mm_packus_epi16(_mm_and_si128(v, mask), _mm_srli_epi16(v, 8)); 129 | v = _mm_packus_epi16(_mm_and_si128(v, mask), _mm_srli_epi16(v, 8)); 130 | return v; 131 | } 132 | #endif // USE_SSE41 133 | 134 | #ifdef USE_SSE41 135 | #define _mmXXX_testz_siXXX _mm_testz_si128 136 | #else // USE_SSE41 137 | static inline int _mmXXX_testz_siXXX(const __m128i a, const __m128i b) { 138 | return _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(a, b), _mm_setzero_si128())) == 139 | 0xFFFF; 140 | } 141 | #endif // USE_SSE41 142 | 143 | #elif defined(USE_AVX2) 144 | #define POSTFIX AVX2 145 | #define SIMD_WIDTH 8 146 | 147 | #define SIMD_LANE_NDX _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7) 148 | #define SIMD_SHUFFLE_SCANLINE_TO_SUBTILES \ 149 | _mm256_setr_epi8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, \ 150 | 0x7, 0xB, 0xF, 0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, \ 151 | 0xA, 0xE, 0x3, 0x7, 0xB, 0xF) 152 | 153 | typedef __m256 __mXXX; 154 | typedef __m256i __mXXXi; 155 | 156 | #define _mmXXX_castsiXXX_ps _mm256_castsi256_ps 157 | #define _mmXXX_castps_siXXX _mm256_castps_si256 158 | 159 | #define _mmXXX_setzero_ps _mm256_setzero_ps 160 | #define _mmXXX_setzero_siXXX _mm256_setzero_si256 161 | #define _mmXXX_set1_ps _mm256_set1_ps 162 | #define _mmXXX_setr_ps _mm256_setr_ps 163 | #define _mmXXX_set1_epi32 _mm256_set1_epi32 164 | #define _mmXXX_setr_epi32 _mm256_setr_epi32 165 | 166 | #define _mmXXX_add_ps _mm256_add_ps 167 | #define _mmXXX_add_epi32 _mm256_add_epi32 168 | #define _mmXXX_sub_ps _mm256_sub_ps 169 | #define _mmXXX_sub_epi32 _mm256_sub_epi32 170 | #define _mmXXX_mul_ps _mm256_mul_ps 171 | #define _mmXXX_div_ps _mm256_div_ps 172 | 173 | #define _mmXXX_neg_ps(a) _mm256_xor_ps((a), _mm256_set1_ps(-0.0f)) 174 | 175 | #define _mmXXX_mullo_epi32 _mm256_mullo_epi32 176 | #define _mmXXX_subs_epu16 _mm256_subs_epu16 177 | 178 | #define _mmXXX_min_ps 
_mm256_min_ps 179 | #define _mmXXX_min_epi32 _mm256_min_epi32 180 | #define _mmXXX_max_ps _mm256_max_ps 181 | #define _mmXXX_max_epi32 _mm256_max_epi32 182 | 183 | #define _mmXXX_floor_ps _mm256_floor_ps 184 | #define _mmXXX_ceil_ps _mm256_ceil_ps 185 | 186 | #define _mmXXX_cvttps_epi32 _mm256_cvttps_epi32 187 | #define _mmXXX_cvtepi32_ps _mm256_cvtepi32_ps 188 | 189 | #define _mmXXX_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OS) 190 | #define _mmXXX_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OS) 191 | #define _mmXXX_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OS) 192 | #define _mmXXX_cmpeq_epi32 _mm256_cmpeq_epi32 193 | #define _mmXXX_cmpgt_epi32 _mm256_cmpgt_epi32 194 | /* Bitwise NOT: XOR with an all-ones pattern (XOR with -0.0f would only flip the sign bit, i.e. negate). Matches the SSE2 definition. */ 195 | #define _mmXXX_not_ps(v) _mm256_xor_ps((v), _mm256_castsi256_ps(_mm256_set1_epi32(~0))) 196 | #define _mmXXX_not_siXXX(a) _mm256_xor_si256((a), _mm256_set1_epi32(~0)) 197 | #define _mmXXX_and_ps _mm256_and_ps 198 | #define _mmXXX_and_siXXX _mm256_and_si256 199 | #define _mmXXX_andnot_ps _mm256_andnot_ps 200 | #define _mmXXX_andnot_siXXX _mm256_andnot_si256 201 | #define _mmXXX_or_ps _mm256_or_ps 202 | #define _mmXXX_or_siXXX _mm256_or_si256 203 | 204 | #define _mmXXX_movemask_ps _mm256_movemask_ps 205 | 206 | #define _mmXXX_fmadd_ps _mm256_fmadd_ps 207 | #define _mmXXX_fmsub_ps _mm256_fmsub_ps 208 | 209 | #define _mmXXX_slli_epi32 _mm256_slli_epi32 210 | #define _mmXXX_srli_epi32 _mm256_srli_epi32 211 | #define _mmXXX_srai_epi32 _mm256_srai_epi32 212 | 213 | #define _mmXXX_blendv_ps _mm256_blendv_ps 214 | 215 | #define _mmXXX_sllv_ones(count) _mm256_sllv_epi32(_mm256_set1_epi32(0xffffffff), count) 216 | 217 | #define _mmXXX_transpose_epi8(x) _mm256_shuffle_epi8(x, SIMD_SHUFFLE_SCANLINE_TO_SUBTILES) 218 | 219 | #define _mmXXX_testz_siXXX _mm256_testz_si256 220 | 221 | #elif defined(USE_AVX512) 222 | #define POSTFIX AVX512 223 | #define SIMD_WIDTH 16 224 | 225 | #define SIMD_LANE_NDX \ 226 | _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 227 | #define SIMD_SHUFFLE_SCANLINE_TO_SUBTILES \ 228 |
_mm512_set_epi32(0x0F0B0703, 0x0E0A0602, 0x0D090501, 0x0C080400, 0x0F0B0703, \ 229 | 0x0E0A0602, 0x0D090501, 0x0C080400, 0x0F0B0703, 0x0E0A0602, \ 230 | 0x0D090501, 0x0C080400, 0x0F0B0703, 0x0E0A0602, 0x0D090501, \ 231 | 0x0C080400) 232 | 233 | typedef __m512 __mXXX; 234 | typedef __m512i __mXXXi; 235 | 236 | #define _mmXXX_castsiXXX_ps _mm512_castsi512_ps 237 | #define _mmXXX_castps_siXXX _mm512_castps_si512 238 | 239 | #define _mmXXX_setzero_ps _mm512_setzero_ps 240 | #define _mmXXX_setzero_siXXX _mm512_setzero_si512 241 | #define _mmXXX_set1_ps _mm512_set1_ps 242 | #define _mmXXX_setr_ps _mm512_setr_ps 243 | #define _mmXXX_set1_epi32 _mm512_set1_epi32 244 | #define _mmXXX_setr_epi32 _mm512_setr_epi32 245 | 246 | #define _mmXXX_add_ps _mm512_add_ps 247 | #define _mmXXX_add_epi32 _mm512_add_epi32 248 | #define _mmXXX_sub_ps _mm512_sub_ps 249 | #define _mmXXX_sub_epi32 _mm512_sub_epi32 250 | #define _mmXXX_mul_ps _mm512_mul_ps 251 | #define _mmXXX_div_ps _mm512_div_ps 252 | 253 | #define _mmXXX_neg_ps(a) _mm512_xor_ps((a), _mm512_set1_ps(-0.0f)) 254 | 255 | #define _mmXXX_mullo_epi32 _mm512_mullo_epi32 256 | #define _mmXXX_subs_epu16 _mm512_subs_epu16 257 | 258 | #define _mmXXX_min_ps _mm512_min_ps 259 | #define _mmXXX_min_epi32 _mm512_min_epi32 260 | #define _mmXXX_max_ps _mm512_max_ps 261 | #define _mmXXX_max_epi32 _mm512_max_epi32 262 | 263 | #define _mmXXX_floor_ps _mm512_floor_ps 264 | #define _mmXXX_ceil_ps _mm512_ceil_ps 265 | 266 | #define _mmXXX_cvttps_epi32 _mm512_cvttps_epi32 267 | #define _mmXXX_cvtepi32_ps _mm512_cvtepi32_ps 268 | 269 | static __m512 _mmXXX_cmpeq_ps(__m512 a, __m512 b) { 270 | __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); 271 | return _mm512_castsi512_ps( 272 | _mm512_mask_mov_epi32(_mm512_set1_epi32(0), mask, _mm512_set1_epi32(~0))); 273 | } 274 | #define _mmXXX_not_siXXX(a) _mm512_xor_si512((a), _mm512_set1_epi32(~0)) 275 | 276 | static __m512 _mmXXX_cmpgt_ps(__m512 a, __m512 b) { 277 | __mmask16 mask = _mm512_cmp_ps_mask(a, 
b, _CMP_GT_OQ); 278 | return _mm512_castsi512_ps( 279 | _mm512_mask_mov_epi32(_mm512_set1_epi32(0), mask, _mm512_set1_epi32(~0))); 280 | } 281 | 282 | static __m512 _mmXXX_cmpge_ps(__m512 a, __m512 b) { 283 | __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ); 284 | return _mm512_castsi512_ps( 285 | _mm512_mask_mov_epi32(_mm512_set1_epi32(0), mask, _mm512_set1_epi32(~0))); 286 | } 287 | 288 | static __m512i _mmXXX_cmpeq_epi32(__m512i a, __m512i b) { 289 | __mmask16 mask = _mm512_cmpeq_epi32_mask(a, b); 290 | return _mm512_mask_mov_epi32(_mm512_set1_epi32(0), mask, _mm512_set1_epi32(~0)); 291 | } 292 | 293 | static __m512i _mmXXX_cmpgt_epi32(__m512i a, __m512i b) { 294 | __mmask16 mask = _mm512_cmpgt_epi32_mask(a, b); 295 | return _mm512_mask_mov_epi32(_mm512_set1_epi32(0), mask, _mm512_set1_epi32(~0)); 296 | } 297 | /* Bitwise NOT: XOR with an all-ones pattern (XOR with -0.0f would only flip the sign bit, i.e. negate). Matches the SSE2 definition. */ 298 | #define _mmXXX_not_ps(v) _mm512_xor_ps((v), _mm512_castsi512_ps(_mm512_set1_epi32(~0))) 299 | #define _mmXXX_and_ps _mm512_and_ps 300 | #define _mmXXX_and_siXXX _mm512_and_si512 301 | #define _mmXXX_andnot_ps _mm512_andnot_ps 302 | #define _mmXXX_andnot_siXXX _mm512_andnot_si512 303 | #define _mmXXX_or_ps _mm512_or_ps 304 | #define _mmXXX_or_siXXX _mm512_or_si512 305 | 306 | static __mmask16 _mmXXX_movemask_ps(const __m512 a) { 307 | __mmask16 mask = _mm512_cmp_epi32_mask( 308 | _mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x80000000)), 309 | _mm512_set1_epi32(0), 4); // (a & 0x80000000) != 0, i.e. one mask bit per lane whose sign bit is set 310 | return mask; 311 | } 312 | 313 | #define _mmXXX_fmadd_ps _mm512_fmadd_ps 314 | #define _mmXXX_fmsub_ps _mm512_fmsub_ps 315 | 316 | #define _mmXXX_slli_epi32 _mm512_slli_epi32 317 | #define _mmXXX_srli_epi32 _mm512_srli_epi32 318 | #define _mmXXX_srai_epi32 _mm512_srai_epi32 319 | 320 | static __m512 _mmXXX_blendv_ps(const __m512 a, const __m512 b, const __m512 c) { 321 | __mmask16 mask = _mmXXX_movemask_ps(c); 322 | return _mm512_mask_mov_ps(a, mask, b); 323 | } 324 | 325 | #define _mmXXX_sllv_ones(count) _mm512_sllv_epi32(_mm512_set1_epi32(0xffffffff), count)
326 | 327 | #define _mmXXX_transpose_epi8(x) _mm512_shuffle_epi8(x, SIMD_SHUFFLE_SCANLINE_TO_SUBTILES) 328 | 329 | static inline int _mmXXX_testz_siXXX(__m512i a, __m512i b) { 330 | __mmask16 mask = 331 | _mm512_cmpeq_epi32_mask(_mm512_and_si512(a, b), _mm512_set1_epi32(0)); 332 | return mask == 0xFFFF; 333 | } 334 | 335 | #elif defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64) 336 | #define POSTFIX NEON 337 | #define SIMD_WIDTH 4 338 | 339 | #ifdef _MSC_VER 340 | #define INITu32x4(w,x,y,z) { ((w) + (unsigned long long(x) << 32)), ((y) + (unsigned long long(z) << 32)) } 341 | #else 342 | #define INITu32x4(w,x,y,z) { (w), (x), (y), (z) } 343 | #endif 344 | 345 | #ifdef _MSC_VER 346 | #define SIMD_LANE_NDX _mmXXX_setr_epi32(0, 1, 2, 3) 347 | #else 348 | #define SIMD_LANE_NDX (int32x4_t){0, 1, 2, 3} 349 | #endif 350 | 351 | typedef float32x4_t __mXXX; 352 | typedef int32x4_t __mXXXi; 353 | 354 | #define _mmXXX_castsiXXX_ps vreinterpretq_f32_s32 355 | #define _mmXXX_castps_siXXX vreinterpretq_s32_f32 356 | 357 | #define _mmXXX_setzero_ps() vdupq_n_f32(0) 358 | #define _mmXXX_setzero_siXXX() vdupq_n_s32(0) 359 | #define _mmXXX_set1_ps(v) vdupq_n_f32(v) 360 | 361 | force_inline float32x4_t _mmXXX_setr_ps(float w, float z, float y, float x) { 362 | #ifdef _MSC_VER 363 | float __declspec(align(16)) data[4] = {w, z, y, x}; 364 | #else 365 | float __attribute__((aligned(16))) data[4] = {w, z, y, x}; 366 | #endif 367 | return vld1q_f32(data); 368 | } 369 | 370 | #define _mmXXX_set1_epi32 vdupq_n_s32 371 | 372 | force_inline int32x4_t _mmXXX_setr_epi32(int32_t w, int32_t z, int32_t y, int32_t x) { 373 | #ifdef _MSC_VER 374 | int32_t __declspec(align(16)) data[4] = {w, z, y, x}; 375 | #else 376 | int32_t __attribute__((aligned(16))) data[4] = {w, z, y, x}; 377 | #endif 378 | return vld1q_s32(data); 379 | } 380 | 381 | #define _mmXXX_add_ps vaddq_f32 382 | #define _mmXXX_add_epi32 vaddq_s32 383 | #define _mmXXX_sub_ps vsubq_f32 384 | #define _mmXXX_sub_epi32 vsubq_s32 
385 | #define _mmXXX_mul_ps vmulq_f32 386 | #define _mmXXX_div_ps vdivq_f32 387 | 388 | #define _mmXXX_xor_ps(a, b) vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b))) 389 | #define _mmXXX_neg_ps(a) _mmXXX_xor_ps((a), vdupq_n_f32(-0.0f)) 390 | 391 | #define _mmXXX_mullo_epi32 vmulq_s32 392 | #define _mmXXX_subs_epu16(a, b) \ 393 | vreinterpretq_s32_u16(vqsubq_u16(vreinterpretq_u16_s32(a), vreinterpretq_u16_s32(b))) 394 | 395 | #define _mmXXX_min_ps vminq_f32 396 | #define _mmXXX_min_epi32 vminq_s32 397 | #define _mmXXX_max_ps vmaxq_f32 398 | #define _mmXXX_max_epi32 vmaxq_s32 399 | 400 | #define _mmXXX_floor_ps vrndmq_f32 401 | #define _mmXXX_ceil_ps vrndpq_f32 402 | 403 | #define _mmXXX_cvttps_epi32 vcvtq_s32_f32 404 | #define _mmXXX_cvtepi32_ps vcvtq_f32_s32 405 | 406 | #define _mmXXX_cmpeq_ps vceqq_f32 407 | #define _mmXXX_cmpgt_ps vcgtq_f32 408 | #define _mmXXX_cmpge_ps vcgeq_f32 409 | #define _mmXXX_cmpeq_epi32 vceqq_s32 410 | #define _mmXXX_cmpgt_epi32 vcgtq_s32 411 | 412 | #define _mmXXX_not_ps(v) _mmXXX_xor_ps((v), vreinterpretq_f32_s32(vdupq_n_s32(~0))) 413 | #define _mmXXX_not_siXXX(a) veorq_s32((a), vdupq_n_s32(~0)) 414 | #define _mmXXX_and_ps(a, b) vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b))) 415 | #define _mmXXX_and_siXXX vandq_s32 416 | #define _mmXXX_andnot_ps(a, b) vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(b), vreinterpretq_s32_f32(a))) 417 | #define _mmXXX_andnot_siXXX(a, b) vbicq_s32(b, a) 418 | #define _mmXXX_or_ps(a, b) vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b))) 419 | #define _mmXXX_or_siXXX vorrq_s32 420 | 421 | force_inline int _mmXXX_movemask_ps(float32x4_t a) { 422 | uint32x4_t input = vreinterpretq_u32_f32(a); 423 | #if defined(__aarch64__) || defined(_M_ARM64) 424 | #ifdef _MSC_VER 425 | static const int32x4_t shift = {.n128_i32 = {0, 1, 2, 3}}; 426 | #else 427 | static const int32x4_t shift = {0, 1, 2, 3}; 428 | 
#endif 429 | uint32x4_t tmp = vshrq_n_u32(input, 31); 430 | return vaddvq_u32(vshlq_u32(tmp, shift)); 431 | #else 432 | // Uses the exact same method as _mm_movemask_epi8, see that for details. 433 | // Shift out everything but the sign bits with a 32-bit unsigned shift 434 | // right. 435 | uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31)); 436 | // Merge the two pairs together with a 64-bit unsigned shift right + add. 437 | uint8x16_t paired = 438 | vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); 439 | // Extract the result. 440 | return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); 441 | #endif 442 | } 443 | 444 | #define _mmXXX_fmadd_ps(a, b, c) vaddq_f32(vmulq_f32(a, b), c) 445 | #define _mmXXX_fmsub_ps(a, b, c) vsubq_f32(vmulq_f32(a, b), c) 446 | 447 | #ifdef _MSC_VER 448 | #define _likely(x) x 449 | #define _unlikely(x) x 450 | #else 451 | #define _likely(x) __builtin_expect(!!(x), 1) 452 | #define _unlikely(x) __builtin_expect(!!(x), 0) 453 | #endif 454 | 455 | force_inline float32x4_t _mmXXX_slli_epi32(int32x4_t a, int imm) { 456 | if (_unlikely(imm & ~31)) { 457 | return vdupq_n_f32(0); 458 | } 459 | return vreinterpretq_f32_s32(vshlq_s32(a, vdupq_n_s32(imm))); 460 | } 461 | 462 | force_inline int32x4_t _mmXXX_srli_epi32(int32x4_t a, int imm) { 463 | int32x4_t ret; 464 | if (_unlikely((imm) & ~31)) { 465 | ret = vdupq_n_s32(0); 466 | } else { 467 | ret = vreinterpretq_s32_u32( 468 | vshlq_u32(vreinterpretq_u32_s32(a), vdupq_n_s32(-(imm)))); 469 | } 470 | return ret; 471 | } 472 | 473 | force_inline int32x4_t _mmXXX_srai_epi32(int32x4_t a, int imm) { 474 | int32x4_t ret; 475 | if (0 < (imm) && (imm) < 32) { 476 | ret = vshlq_s32(a, vdupq_n_s32(-imm)); 477 | } else { 478 | ret = vshrq_n_s32(a, 31); 479 | } 480 | return ret; 481 | } 482 | 483 | force_inline int32x4_t _mmXXX_sllv_ones(const int32x4_t ishift) { 484 | union { 485 | int32x4_t shift_128; 486 | uint32_t shift_32[4]; 487 | } shift; 488 | 489 | 
shift.shift_128 = _mmXXX_min_epi32(ishift, _mmXXX_set1_epi32(32)); 490 | 491 | // Uses scalar approach to perform _mm_sllv_epi32(~0, shift) 492 | static const unsigned int maskLUT[33] = { 493 | ~0U << 0, ~0U << 1, ~0U << 2, ~0U << 3, ~0U << 4, ~0U << 5, ~0U << 6, 494 | ~0U << 7, ~0U << 8, ~0U << 9, ~0U << 10, ~0U << 11, ~0U << 12, ~0U << 13, 495 | ~0U << 14, ~0U << 15, ~0U << 16, ~0U << 17, ~0U << 18, ~0U << 19, ~0U << 20, 496 | ~0U << 21, ~0U << 22, ~0U << 23, ~0U << 24, ~0U << 25, ~0U << 26, ~0U << 27, 497 | ~0U << 28, ~0U << 29, ~0U << 30, ~0U << 31, 0U}; 498 | 499 | int32x4_t retMask = 500 | _mmXXX_setr_epi32(maskLUT[shift.shift_32[0]], maskLUT[shift.shift_32[1]], 501 | maskLUT[shift.shift_32[2]], maskLUT[shift.shift_32[3]]); 502 | return retMask; 503 | } 504 | 505 | force_inline float32x4_t _mmXXX_blendv_ps(float32x4_t a, float32x4_t b, float32x4_t _mask) { 506 | // Use a signed shift right to create a mask with the sign bit 507 | uint32x4_t mask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(_mask), 31)); 508 | return vbslq_f32(mask, b, a); 509 | } 510 | 511 | force_inline int32x4_t _mm_shuffle_epi8(int32x4_t a, int32x4_t b) { 512 | int8x16_t tbl = vreinterpretq_s8_s32(a); // input a 513 | uint8x16_t idx = vreinterpretq_u8_s32(b); // input b 514 | uint8x16_t idx_masked = vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits 515 | #if defined(__aarch64__) || defined(_M_ARM64) 516 | return vreinterpretq_s32_s8(vqtbl1q_s8(tbl, idx_masked)); 517 | #elif defined(__GNUC__) 518 | int8x16_t ret; 519 | // %e and %f represent the even and odd D registers 520 | // respectively. 
521 | __asm__ __volatile__( 522 | "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" 523 | "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" 524 | : [ret] "=&w"(ret) 525 | : [tbl] "w"(tbl), [idx] "w"(idx_masked)); 526 | return vreinterpretq_m128i_s8(ret); 527 | #else 528 | // use this line if testing on aarch64 529 | int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; 530 | return vreinterpretq_m128i_s8( 531 | vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), 532 | vtbl2_s8(a_split, vget_high_u8(idx_masked)))); 533 | #endif 534 | } 535 | 536 | force_inline int32x4_t _mm_srli_epi16(int32x4_t a, int imm) { 537 | int32x4_t ret; 538 | if (_unlikely((imm) & ~15)) { 539 | ret = vdupq_n_s32(0); 540 | } else { 541 | ret = vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(a), vdupq_n_s16(-(imm)))); 542 | } 543 | return ret; 544 | } 545 | 546 | force_inline int32x4_t _mmXXX_transpose_epi8(int32x4_t v) { 547 | // Perform transpose through two 16->8 bit pack and byte shifts 548 | #ifdef _MSC_VER 549 | int8_t __declspec(align(16)) data[16] = {~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0}; 550 | #else 551 | int8_t __attribute__((aligned(16))) 552 | data[16] = {~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0, ~0, 0}; 553 | #endif 554 | const int32x4_t mask = vld1q_s8(data); 555 | 556 | #define _mm_packus_epi16(a, b) \ 557 | vreinterpretq_s32_u8( \ 558 | vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(a)), \ 559 | vqmovun_s16(vreinterpretq_s16_s32(b)))); 560 | 561 | v = _mm_packus_epi16(vandq_s32(v, mask), _mm_srli_epi16(v, 8)); 562 | v = _mm_packus_epi16(vandq_s32(v, mask), _mm_srli_epi16(v, 8)); 563 | 564 | #undef _mm_packus_epi16 565 | #undef _mm_srli_epi16 566 | 567 | return v; 568 | } 569 | 570 | force_inline int _mmXXX_testz_siXXX(int32x4_t a, int32x4_t b) { 571 | int64x2_t s64 = 572 | vandq_s64(vreinterpretq_s64_s32(a), vreinterpretq_s64_s32(b)); 573 | return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); 574 | } 575 | 576 | #endif 577 | 578 | #if 
!defined(__aarch64__) && !defined(_M_ARM) && !defined(_M_ARM64) 579 | #define _mm128_setzero_si128 _mm_setzero_si128 580 | #define _mm128_set1_ps _mm_set1_ps 581 | #define _mm128_setr_ps _mm_setr_ps 582 | #define _mm128_setr_epi32 _mm_setr_epi32 583 | 584 | #define _mm128_add_ps _mm_add_ps 585 | #define _mm128_sub_ps _mm_sub_ps 586 | #define _mm128_mul_ps _mm_mul_ps 587 | #define _mm128_div_ps _mm_div_ps 588 | 589 | #define _mm128_fmadd_ps(a, b, c) _mm_add_ps(_mm_mul_ps(a, b), c) 590 | 591 | #define _mm128_add_epi32 _mm_add_epi32 592 | 593 | #define _mm128_and_si128 _mm_and_si128 594 | 595 | #define _mm128_min_epi32 _mm_min_epi32 596 | #define _mm128_max_epi32 _mm_max_epi32 597 | 598 | #define _mm128_xor_ps _mm_xor_ps 599 | 600 | #define _mm128_movemask_ps _mm_movemask_ps 601 | 602 | #define _mm128_cvtps_epi32 _mm_cvtps_epi32 603 | 604 | static inline __m128 _mm128_dp4_ps(const __m128 a, const __m128 b) { 605 | __m128 prod = _mm_mul_ps(a, b); 606 | __m128 dp = _mm_add_ps(prod, _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(2, 3, 0, 1))); 607 | dp = _mm_add_ps(dp, _mm_shuffle_ps(dp, dp, _MM_SHUFFLE(0, 1, 2, 3))); 608 | return dp; 609 | } 610 | #else 611 | typedef float32x4_t __m128; 612 | typedef int32x4_t __m128i; 613 | 614 | #define _mm128_setzero_si128() vdupq_n_s32(0) 615 | #define _mm128_set1_ps vdupq_n_f32 616 | #define _mm128_setr_ps _mmXXX_setr_ps 617 | #define _mm128_setr_epi32 _mmXXX_setr_epi32 618 | 619 | #define _mm128_add_ps vaddq_f32 620 | #define _mm128_sub_ps vsubq_f32 621 | #define _mm128_mul_ps vmulq_f32 622 | #define _mm128_div_ps vdivq_f32 623 | 624 | #define _mm128_fmadd_ps(a, b, c) vaddq_f32(vmulq_f32(a, b), c) 625 | 626 | #define _mm128_add_epi32 vaddq_s32 627 | 628 | #define _mm128_and_si128 vandq_s32 629 | 630 | #define _mm128_min_epi32 vminq_s32 631 | #define _mm128_max_epi32 vmaxq_s32 632 | 633 | #define _mm128_xor_ps(a, b) vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a), vreinterpretq_s32_f32(b))) 634 | 635 | #define _mm128_movemask_ps 
_mmXXX_movemask_ps 636 | 637 | #define _mm128_cvtps_epi32 vcvtmq_s32_f32 638 | 639 | force_inline float32x4_t _mm_shuffle_ps_2301(float32x4_t a, float32x4_t b) { 640 | float32x2_t a01 = vrev64_f32(vget_low_f32(a)); 641 | float32x2_t b23 = vrev64_f32(vget_high_f32(b)); 642 | return vcombine_f32(a01, b23); 643 | } 644 | 645 | force_inline float32x4_t _mm128_dp4_ps(const float32x4_t a, const float32x4_t b) { 646 | float32x4_t prod = vmulq_f32(a, b); 647 | float32x4_t dp = vaddq_f32(prod, _mm_shuffle_ps_2301(prod, prod)); 648 | 649 | float32x4_t v = vrev64q_f32(dp); 650 | v = vextq_f32(v, v, 2); 651 | 652 | dp = vaddq_f32(dp, v); 653 | return dp; 654 | } 655 | #endif 656 | 657 | #undef force_inline 658 | -------------------------------------------------------------------------------- /SWpixels.h: -------------------------------------------------------------------------------- 1 | #ifndef SWPIXELS_H 2 | #define SWPIXELS_H 3 | 4 | #include "SWcore.h" 5 | #include "SWtypes.h" 6 | 7 | #define UBYTE_TO_FLOAT(x) ((x) * ((SWfloat)1.0 / 255)) 8 | #define FLOAT_TO_UBYTE(x) (SWubyte)((x) * ((SWfloat)255.999)) 9 | 10 | /* Read operations */ 11 | 12 | #define _swPx_RGB888_GetColor_RGBA8888(p, rgba) \ 13 | rgba[3] = 255; \ 14 | rgba[0] = ((SWubyte*)p)[0]; \ 15 | rgba[1] = ((SWubyte*)p)[1]; \ 16 | rgba[2] = ((SWubyte*)p)[2]; 17 | 18 | #define _swPx_RGBA8888_GetColor_RGBA8888(p, rgba) \ 19 | rgba[0] = ((SWubyte*)p)[0]; \ 20 | rgba[1] = ((SWubyte*)p)[1]; \ 21 | rgba[2] = ((SWubyte*)p)[2]; \ 22 | rgba[3] = ((SWubyte*)p)[3]; 23 | 24 | #define _swPx_RGB888_GetColor_FRGBA(p, rgba) \ 25 | rgba[0] = UBYTE_TO_FLOAT(((SWubyte*)p)[0]); \ 26 | rgba[1] = UBYTE_TO_FLOAT(((SWubyte*)p)[1]); \ 27 | rgba[2] = UBYTE_TO_FLOAT(((SWubyte*)p)[2]); \ 28 | rgba[3] = (SWfloat)1.0; 29 | 30 | #define _swPx_RGBA8888_GetColor_FRGBA(p, rgba) \ 31 | rgba[0] = UBYTE_TO_FLOAT(((SWubyte*)p)[0]); \ 32 | rgba[1] = UBYTE_TO_FLOAT(((SWubyte*)p)[1]); \ 33 | rgba[2] = UBYTE_TO_FLOAT(((SWubyte*)p)[2]); \ 34 | rgba[3] = 
UBYTE_TO_FLOAT(((SWubyte*)p)[3]); 35 | 36 | #define _swPx_BGRA8888_GetColor_FRGBA(p, rgba) \ 37 | rgba[0] = UBYTE_TO_FLOAT(((SWubyte*)p)[2]); \ 38 | rgba[1] = UBYTE_TO_FLOAT(((SWubyte*)p)[1]); \ 39 | rgba[2] = UBYTE_TO_FLOAT(((SWubyte*)p)[0]); \ 40 | rgba[3] = UBYTE_TO_FLOAT(((SWubyte*)p)[3]); \ 41 | 42 | #define _swPx_RGB888_GetColor_BGRA8888(p, bgra) \ 43 | bgra[3] = 255; \ 44 | bgra[0] = ((SWubyte*)p)[2]; \ 45 | bgra[1] = ((SWubyte*)p)[1]; \ 46 | bgra[2] = ((SWubyte*)p)[0]; 47 | 48 | #define _swPx_RGBA8888_GetColor_BGRA8888(p, bgra) \ 49 | bgra[0] = ((SWubyte*)p)[2]; \ 50 | bgra[1] = ((SWubyte*)p)[1]; \ 51 | bgra[2] = ((SWubyte*)p)[0]; \ 52 | bgra[3] = ((SWubyte*)p)[3]; 53 | 54 | #define _swPx_RGB888_GetColor_FBGRA(p, rgba) \ 55 | rgba[0] = UBYTE_TO_FLOAT(((SWubyte*)p)[2]); \ 56 | rgba[1] = UBYTE_TO_FLOAT(((SWubyte*)p)[1]); \ 57 | rgba[2] = UBYTE_TO_FLOAT(((SWubyte*)p)[0]); \ 58 | rgba[3] = (SWfloat)1.0; 59 | 60 | #define _swPx_RGBA8888_GetColor_FBGRA(p, rgba) \ 61 | rgba[0] = UBYTE_TO_FLOAT(((SWubyte*)p)[2]); \ 62 | rgba[1] = UBYTE_TO_FLOAT(((SWubyte*)p)[1]); \ 63 | rgba[2] = UBYTE_TO_FLOAT(((SWubyte*)p)[0]); \ 64 | rgba[3] = UBYTE_TO_FLOAT(((SWubyte*)p)[3]); 65 | 66 | 67 | #define swPx_RGB888_GetColor_RGBA8888(w, h, pixels, x, y, rgba) \ 68 | _swPx_RGB888_GetColor_RGBA8888((SWubyte*)(pixels) + 3 * ((y) * (w) + (x)), (rgba)) 69 | 70 | #define swPx_RGBA8888_GetColor_RGBA8888(w, h, pixels, x, y, rgba) \ 71 | _swPx_RGBA8888_GetColor_RGBA8888((SWubyte*)(pixels) + 4 * ((y) * (w) + (x)), (rgba)) 72 | 73 | #define swPx_RGB888_GetColor_FRGBA(w, h, pixels, x, y, rgba) \ 74 | _swPx_RGB888_GetColor_FRGBA((SWubyte*)(pixels) + 3 * ((y) * (w) + (x)), (rgba)) 75 | 76 | #define swPx_RGBA8888_GetColor_FRGBA(w, h, pixels, x, y, rgba) \ 77 | _swPx_RGBA8888_GetColor_FRGBA((SWubyte*)(pixels) + 4 * ((y) * (w) + (x)), (rgba)) 78 | 79 | #define swPx_BGRA8888_GetColor_FRGBA(w, h, pixels, x, y, rgba) \ 80 | _swPx_BGRA8888_GetColor_FRGBA((SWubyte*)(pixels) + 4 * ((y) * (w) + (x)), 
(rgba)) 81 | 82 | #define swPx_RGB888_GetColor_BGRA8888(w, h, pixels, x, y, rgba) \ 83 | _swPx_RGB888_GetColor_BGRA8888((SWubyte*)(pixels) + 3 * ((y) * (w) + (x)), (rgba)) 84 | 85 | #define swPx_RGBA8888_GetColor_BGRA8888(w, h, pixels, x, y, rgba) \ 86 | _swPx_RGBA8888_GetColor_BGRA8888((SWubyte*)(pixels) + 4 * ((y) * (w) + (x)), (rgba)) 87 | 88 | #define swPx_RGB888_GetColor_FBGRA(w, h, pixels, x, y, rgba) \ 89 | _swPx_RGB888_GetColor_FBGRA((SWubyte*)(pixels) + 3 * ((y) * (w) + (x)), (rgba)) 90 | 91 | #define swPx_RGBA8888_GetColor_FBGRA(w, h, pixels, x, y, rgba) \ 92 | _swPx_RGBA8888_GetColor_FBGRA((SWubyte*)(pixels) + 4 * ((y) * (w) + (x)), (rgba)) 93 | 94 | /* wrong negative rounding but ok */ 95 | #define swPx_RGB888_GetColor_RGBA8888_UV(w, h, pixels, u, v, rgba) \ 96 | _swPx_RGB888_GetColor_RGBA8888((SWubyte*)(pixels) + 3 * (((SWint)((v) * (h)) & ((h) - 1)) * (w) + ((SWint)((u) * (w)) & ((w) - 1))), (rgba)) 97 | 98 | #define swPx_RGBA8888_GetColor_RGBA8888_UV(w, h, pixels, u, v, rgba) \ 99 | _swPx_RGBA8888_GetColor_RGBA8888((SWubyte*)(pixels) + 4 * (((SWint)((v) * (h)) & ((h) - 1)) * (w) + ((SWint)((u) * (w)) & ((w) - 1))), (rgba)) 100 | 101 | #define swPx_RGB888_GetColor_FRGBA_UV(w, h, pixels, u, v, rgba) \ 102 | _swPx_RGB888_GetColor_FRGBA((SWubyte*)(pixels) + 3 * (((SWint)((v) * (h)) & ((h) - 1)) * (w) + ((SWint)((u) * (w)) & ((w) - 1))), (rgba)) 103 | 104 | #define swPx_RGBA8888_GetColor_FRGBA_UV(w, h, pixels, u, v, rgba) \ 105 | _swPx_RGBA8888_GetColor_FRGBA((SWubyte*)(pixels) + 4 * (((SWint)((v) * (h)) & ((h) - 1)) * (w) + ((SWint)((u) * (w)) & ((w) - 1))), (rgba)) 106 | 107 | #define swPx_RGB888_GetColor_BGRA8888_UV(w, h, pixels, u, v, bgra) \ 108 | _swPx_RGB888_GetColor_BGRA8888((SWubyte*)(pixels) + 3 * (((SWint)((v) * (h)) & ((h) - 1)) * (w) + ((SWint)((u) * (w)) & ((w) - 1))), (bgra)) 109 | 110 | #define swPx_RGB888_GetColor_BGRA8888_UV_norepeat_unsafe(w, h, pixels, u, v, bgra) \ 111 | _swPx_RGB888_GetColor_BGRA8888((SWubyte*)(pixels) + 3 * 
((SWint)((v) * (h)) * (w) + (SWint)((u) * (w))), (bgra)) 112 | 113 | #define swPx_RGBA8888_GetColor_BGRA8888_UV(w, h, pixels, u, v, bgra) \ 114 | _swPx_RGBA8888_GetColor_BGRA8888((SWubyte*)(pixels) + 4 * (((SWint)((v) * (h)) & ((h) - 1)) * (w) + ((SWint)((u) * (w)) & ((w) - 1))), (bgra)) 115 | 116 | #define swPx_RGBA8888_GetColor_BGRA8888_UV_norepeat_unsafe(w, h, pixels, u, v, bgra) \ 117 | _swPx_RGBA8888_GetColor_BGRA8888((SWubyte*)(pixels) + 4 * ((SWint)((v) * (h)) * (w) + (SWint)((u) * (w))), (bgra)) 118 | 119 | #define swPx_RGB888_GetColor_FBGRA_UV(w, h, pixels, u, v, rgba) \ 120 | _swPx_RGB888_GetColor_FBGRA((SWubyte*)(pixels) + 3 * (((SWint)((v) * (h)) & ((h) - 1)) * (w) + ((SWint)((u) * (w)) & ((w) - 1))), (rgba)) 121 | 122 | #define swPx_RGBA8888_GetColor_FBGRA_UV(w, h, pixels, u, v, rgba) \ 123 | _swPx_RGBA8888_GetColor_FBGRA((SWubyte*)(pixels) + 4 * (((SWint)((v) * (h)) & ((h) - 1)) * (w) + ((SWint)((u) * (w)) & ((w) - 1))), (rgba)) 124 | 125 | 126 | #define swPxGetColorUbyte_RGBA(type, mode, w, h, pixels, u, v, rgba) \ 127 | if (type == SW_UNSIGNED_BYTE) { \ 128 | if (mode == SW_RGB) { \ 129 | swPx_RGB888_GetColor_RGBA8888_UV(w, h, pixels, u, v, rgba); \ 130 | } else if (mode == SW_RGBA) { \ 131 | swPx_RGBA8888_GetColor_RGBA8888_UV(w, h, pixels, u, v, rgba); \ 132 | } \ 133 | } 134 | 135 | #define swPxGetColorUbyte_BGRA(type, mode, w, h, pixels, u, v, bgra) \ 136 | if (type == SW_UNSIGNED_BYTE) { \ 137 | if (mode == SW_RGB) { \ 138 | swPx_RGB888_GetColor_BGRA8888_UV(w, h, pixels, u, v, bgra); \ 139 | } else if (mode == SW_RGBA) { \ 140 | swPx_RGBA8888_GetColor_BGRA8888_UV(w, h, pixels, u, v, bgra); \ 141 | } \ 142 | } 143 | 144 | #define swPxGetColorFloat_RGBA(type, mode, w, h, pixels, u, v, rgba) \ 145 | if (type == SW_UNSIGNED_BYTE) { \ 146 | if (mode == SW_RGB) { \ 147 | swPx_RGB888_GetColor_FRGBA_UV(w, h, pixels, u, v, rgba); \ 148 | } else if (mode == SW_RGBA) { \ 149 | swPx_RGBA8888_GetColor_FRGBA_UV(w, h, pixels, u, v, rgba); \ 150 | } \ 151 | } 

/*
 * Run-time dispatch on (type, mode) for a float BGRA read at (u, v).
 * Only SW_UNSIGNED_BYTE data in SW_RGB or SW_RGBA layout is handled; for
 * anything else the output is left untouched. type and mode are
 * parenthesized so expression arguments compare as a whole.
 * Expands to a bare if-statement: avoid using it directly before an 'else'.
 */
#define swPxGetColorFloat_BGRA(type, mode, w, h, pixels, u, v, bgra) \
    if ((type) == SW_UNSIGNED_BYTE) { \
        if ((mode) == SW_RGB) { \
            swPx_RGB888_GetColor_FBGRA_UV(w, h, pixels, u, v, bgra); \
        } else if ((mode) == SW_RGBA) { \
            swPx_RGBA8888_GetColor_FBGRA_UV(w, h, pixels, u, v, bgra); \
        } \
    }

/* Write operations */

/*
 * _swPx_<DST>_SetColor_<SRC>(dst, src): store one pixel at dst in layout <DST>
 * from a four-channel colour given in layout <SRC> (F prefix = SWfloat
 * channels, converted with FLOAT_TO_UBYTE when the destination is bytes).
 * Each macro expands to several statements ending in ';': do not use one
 * as the body of an unbraced if/else/loop.
 */

/* Same as _swPx_RGBA8888_SetColor_FRGBA, with the channels passed separately. */
#define _swPx_RGBA8888_SetColor_FRGBA_(rgba, fr, fg, fb, fa) \
    ((SWubyte*)(rgba))[0] = FLOAT_TO_UBYTE(fr); \
    ((SWubyte*)(rgba))[1] = FLOAT_TO_UBYTE(fg); \
    ((SWubyte*)(rgba))[2] = FLOAT_TO_UBYTE(fb); \
    ((SWubyte*)(rgba))[3] = FLOAT_TO_UBYTE(fa);

#define _swPx_RGBA8888_SetColor_FRGBA(rgba, frgba) \
    ((SWubyte*)(rgba))[0] = FLOAT_TO_UBYTE((frgba)[0]); \
    ((SWubyte*)(rgba))[1] = FLOAT_TO_UBYTE((frgba)[1]); \
    ((SWubyte*)(rgba))[2] = FLOAT_TO_UBYTE((frgba)[2]); \
    ((SWubyte*)(rgba))[3] = FLOAT_TO_UBYTE((frgba)[3]);

/* Red and blue swap places; green and alpha keep their position. */
#define _swPx_BGRA8888_SetColor_RGBA8888(bgra, rgba) \
    ((SWubyte*)(bgra))[0] = (rgba)[2]; \
    ((SWubyte*)(bgra))[1] = (rgba)[1]; \
    ((SWubyte*)(bgra))[2] = (rgba)[0]; \
    ((SWubyte*)(bgra))[3] = (rgba)[3];

#define _swPx_BGRA8888_SetColor_FRGBA(bgra, frgba) \
    ((SWubyte*)(bgra))[0] = FLOAT_TO_UBYTE((frgba)[2]); \
    ((SWubyte*)(bgra))[1] = FLOAT_TO_UBYTE((frgba)[1]); \
    ((SWubyte*)(bgra))[2] = FLOAT_TO_UBYTE((frgba)[0]); \
    ((SWubyte*)(bgra))[3] = FLOAT_TO_UBYTE((frgba)[3]);

/* Straight four-byte copy (source already in destination channel order). */
#define _swPx_BGRA8888_SetColor_BGRA8888(bgra, rgba) \
    ((SWubyte*)(bgra))[0] = (rgba)[0]; \
    ((SWubyte*)(bgra))[1] = (rgba)[1]; \
    ((SWubyte*)(bgra))[2] = (rgba)[2]; \
    ((SWubyte*)(bgra))[3] = (rgba)[3];

/* Straight four-float copy. */
#define _swPx_FRGBA_SetColor_FRGBA(_frgba, frgba) \
    ((SWfloat*)(_frgba))[0] = (frgba)[0]; \
    ((SWfloat*)(_frgba))[1] = (frgba)[1]; \
    ((SWfloat*)(_frgba))[2] = (frgba)[2]; \
    ((SWfloat*)(_frgba))[3] = (frgba)[3];

#define swPx_BGRA8888_SetColor_RGBA8888(w, h, pixels, x, y, rgba) \ 201 | _swPx_BGRA8888_SetColor_RGBA8888((SWubyte*)(pixels) + 4 * ((y) * (w) + (x)), (rgba)) 202 | 203 | #define swPx_BGRA8888_SetColor_FRGBA(w, h, pixels, x, y, rgba) \ 204 | _swPx_BGRA8888_SetColor_FRGBA((SWubyte*)(pixels) + 4 * ((y) * (w) + (x)), (rgba)) 205 | 206 | #define swPx_BGRA8888_SetColor_BGRA8888(w, h, pixels, x, y, bgra) \ 207 | _swPx_BGRA8888_SetColor_BGRA8888((SWubyte*)(pixels) + 4 * ((y) * (w) + (x)), (bgra)) 208 | 209 | #define swPx_FRGBA_SetColor_FRGBA(w, h, pixels, x, y, rgba) \ 210 | _swPx_FRGBA_SetColor_FRGBA(((SWfloat*)(pixels)) + 4 * ((y) * (w) + (x)), (rgba)) 211 | 212 | #endif /* SWPIXELS_H */ 213 | -------------------------------------------------------------------------------- /SWprogram.h: -------------------------------------------------------------------------------- 1 | #ifndef SW_PROGRAM_H 2 | #define SW_PROGRAM_H 3 | 4 | #include "SWcore.h" 5 | 6 | #define SW_MAX_VTX_ATTRIBS 16 7 | #define SW_MAX_UNIFORMS 32 8 | 9 | typedef struct SWprogram { 10 | vtx_shader_proc v_proc; 11 | SWint v_out_size; 12 | frag_shader_proc f_proc; 13 | SWvtx_attribute vertex_attributes[SW_MAX_VTX_ATTRIBS]; 14 | SWuint num_attributes; 15 | SWuniform uniforms[SW_MAX_UNIFORMS]; 16 | SWubyte *uniform_buf; 17 | SWuint unifrom_buf_size; 18 | } SWprogram; 19 | 20 | void swProgInit(SWprogram *p, SWubyte *uniform_buf, vtx_shader_proc v_proc, 21 | frag_shader_proc f_proc, SWint v_out_floats); 22 | void swProgDestroy(SWprogram *p); 23 | 24 | void swProgSetVtxAttribPointer(SWprogram *p, SWcontext *ctx, SWuint index, SWint size, 25 | SWint stride, const void *pointer); 26 | void swProgDisableVtxAttrib(SWprogram *p, SWuint index); 27 | 28 | void swProgRegUniform(SWprogram *p, SWint index, SWenum type); 29 | void swProgRegUniformv(SWprogram *p, SWint index, SWenum type, SWint num); 30 | void swProgSetProgramUniform(SWprogram *p, SWint index, SWenum type, const void *data); 31 | void 
swProgSetProgramUniformv(SWprogram *p, SWint index, SWenum type, SWint num, 32 | const void *data); 33 | 34 | void swProgDrawLinesArray(SWprogram *p, SWcontext *ctx, SWuint first, SWuint count); 35 | void swProgDrawLineStripArray(SWprogram *p, SWcontext *ctx, SWuint first, SWuint count); 36 | void swProgDrawLinesIndexed(SWprogram *p, SWcontext *ctx, SWuint count, SWenum index_type, 37 | const void *indices); 38 | void swProgDrawLineStripIndexed(SWprogram *p, SWcontext *ctx, SWuint count, 39 | SWenum index_type, const void *indices); 40 | 41 | void swProgDrawCurvesArray(SWprogram *p, SWcontext *ctx, SWuint first, SWuint count); 42 | void swProgDrawCurveStripArray(SWprogram *p, SWcontext *ctx, SWuint first, SWuint count); 43 | void swProgDrawCurvesIndexed(SWprogram *p, SWcontext *ctx, SWuint count, 44 | SWenum index_type, const void *indices); 45 | void swProgDrawCurveStripIndexed(SWprogram *p, SWcontext *ctx, SWuint count, 46 | SWenum index_type, const void *indices); 47 | 48 | void swProgDrawTrianglesArray(SWprogram *p, SWcontext *ctx, SWuint first, SWuint count); 49 | void swProgDrawTriangleStripArray(SWprogram *p, SWcontext *ctx, SWuint first, 50 | SWuint count); 51 | void swProgDrawTrianglesIndexed(SWprogram *p, SWcontext *ctx, SWuint count, 52 | SWenum index_type, const void *indices); 53 | void swProgDrawTriangleStripIndexed(SWprogram *p, SWcontext *ctx, SWuint count, 54 | SWenum index_type, const void *indices); 55 | 56 | #endif /* SW_PROGRAM_H */ 57 | -------------------------------------------------------------------------------- /SWrasterize.c: -------------------------------------------------------------------------------- 1 | #include "SWrasterize.h" 2 | 3 | void _swProcessCurveRecursive(SWprogram *p, SWframebuffer *f, 4 | SWfloat p1[SW_MAX_VTX_ATTRIBS], 5 | SWfloat p2[SW_MAX_VTX_ATTRIBS], 6 | SWfloat p3[SW_MAX_VTX_ATTRIBS], 7 | SWfloat p4[SW_MAX_VTX_ATTRIBS], SWint b_depth_test, 8 | SWint b_depth_write, SWfloat tolerance) { 9 | SWfloat 
p12[SW_MAX_VTX_ATTRIBS], p23[SW_MAX_VTX_ATTRIBS], p34[SW_MAX_VTX_ATTRIBS], 10 | p123[SW_MAX_VTX_ATTRIBS], p234[SW_MAX_VTX_ATTRIBS], p1234[SW_MAX_VTX_ATTRIBS]; 11 | SWint i; 12 | 13 | for (i = 0; i < p->v_out_size; i++) { 14 | p12[i] = (p1[i] + p2[i]) * 0.5f; 15 | p23[i] = (p2[i] + p3[i]) * 0.5f; 16 | p34[i] = (p3[i] + p4[i]) * 0.5f; 17 | p123[i] = (p12[i] + p23[i]) * 0.5f; 18 | p234[i] = (p23[i] + p34[i]) * 0.5f; 19 | p1234[i] = (p123[i] + p234[i]) * 0.5f; 20 | } 21 | 22 | SWfloat dx = p4[0] - p1[0], dy = p4[1] - p1[1]; 23 | SWfloat d2 = sw_abs((p2[0] - p4[0]) * dy - (p2[1] - p4[1]) * dx), 24 | d3 = sw_abs((p3[0] - p4[0]) * dy - (p3[1] - p4[1]) * dx); 25 | 26 | if ((d2 + d3) * (d2 + d3) < tolerance * (dx * dx + dy * dy)) { 27 | _swProcessLine(p, f, p1, p4, b_depth_test, b_depth_write); 28 | } else { 29 | _swProcessCurveRecursive(p, f, p1, p12, p123, p1234, b_depth_test, b_depth_write, 30 | tolerance); 31 | _swProcessCurveRecursive(p, f, p1234, p234, p34, p4, b_depth_test, b_depth_write, 32 | tolerance); 33 | } 34 | } -------------------------------------------------------------------------------- /SWtexture.c: -------------------------------------------------------------------------------- 1 | #include "SWtexture.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "SWcontext.h" 8 | 9 | SWfloat _sw_ubyte_to_float_table[256]; 10 | 11 | sw_inline void _swGetSizes(SWenum mode, SWenum type, SWint *t_size, SWint *p_size, 12 | SWint *additional) { 13 | SWint type_size, pp_size; 14 | switch (type) { 15 | case SW_UNSIGNED_BYTE: 16 | type_size = 1; 17 | break; 18 | case SW_UNSIGNED_SHORT: 19 | type_size = 2; 20 | break; 21 | case SW_UNSIGNED_INT: 22 | type_size = 4; 23 | break; 24 | case SW_COMPRESSED: 25 | type_size = 0; 26 | if (additional) 27 | *additional = 256 * 4 * 4; 28 | break; 29 | default: 30 | type_size = 0; 31 | assert(0); 32 | } 33 | switch (mode) { 34 | case SW_RGB: 35 | pp_size = type_size * 3; 36 | break; 37 | case SW_RGBA: 38 | pp_size = type_size 
* 4; 39 | break; 40 | default: 41 | pp_size = 0; 42 | assert(0); 43 | } 44 | 45 | if (t_size) 46 | (*t_size) = type_size; 47 | if (p_size) 48 | (*p_size) = pp_size; 49 | } 50 | 51 | void swTexInit(SWtexture *t, SWenum mode, SWenum type, SWint w, SWint h, 52 | const void *pixels) { 53 | SWint pp_size, additional = 0; 54 | _swGetSizes(mode, type, NULL, &pp_size, &additional); 55 | size_t total_size = (size_t)pp_size * w * h + additional; 56 | if (type == SW_COMPRESSED) { 57 | total_size += w * h / 4; 58 | } 59 | void *p = malloc(total_size); 60 | memcpy(p, pixels, total_size); 61 | swTexInitMove_malloced(t, mode, type, w, h, p); 62 | } 63 | 64 | void swTexInitMove(SWtexture *t, SWenum mode, SWenum type, SWint w, SWint h, void *pixels, 65 | void (*free)(void *)) { 66 | assert(w == 1 || w % 2 == 0); 67 | assert(h == 1 || h % 2 == 0); 68 | t->mode = mode; 69 | t->type = type; 70 | t->pixels = pixels; 71 | t->w = w; 72 | t->h = h; 73 | t->free = free; 74 | 75 | if (type == SW_COMPRESSED) { 76 | assert(w > 1 && h > 1); 77 | } 78 | 79 | _swGetSizes(mode, type, &t->type_size, &t->pp_size, NULL); 80 | } 81 | 82 | void swTexInitMove_malloced(SWtexture *t, SWenum mode, SWenum type, SWint w, SWint h, 83 | void *pixels) { 84 | swTexInitMove(t, mode, type, w, h, pixels, &free); 85 | } 86 | 87 | void swTexDestroy(SWtexture *t) { 88 | if (t->free) { 89 | (*t->free)(t->pixels); 90 | } 91 | memset(t, 0, sizeof(SWtexture)); 92 | } 93 | 94 | /*void swTexGetColorFloat_RGBA(SWtexture *t, SWfloat u, SWfloat v, SWfloat *rgba) { 95 | SWint x = (SWint)(u * t->w) & (t->w - 1); 96 | SWint y = (SWint)(v * t->h) & (t->h - 1); 97 | 98 | switch (t->type) { 99 | case SW_UNSIGNED_BYTE:{ 100 | const SWfloat conv = (SWfloat)1.0 / 255; 101 | SWubyte *p = t->pixels; 102 | if (t->mode == SW_RGB) { 103 | p += 3 * (y * t->w + x); 104 | rgba[3] = (SWfloat)1.0; 105 | } else if (t->mode == SW_RGBA) { 106 | p += 4 * (y * t->w + x); 107 | rgba[3] = p[3] * conv; 108 | } 109 | rgba[0] = p[0] * conv; 110 | rgba[1] 
= p[1] * conv; 111 | rgba[2] = p[2] * conv; 112 | }break; 113 | } 114 | }*/ 115 | -------------------------------------------------------------------------------- /SWtexture.h: -------------------------------------------------------------------------------- 1 | #ifndef SW_TEXTURE_H 2 | #define SW_TEXTURE_H 3 | 4 | #include "SWcore.h" 5 | #include "SWpixels.h" 6 | 7 | typedef struct SWtexture { 8 | SWenum mode, type; 9 | SWint w, h, type_size, pp_size; 10 | void *pixels; 11 | void (*free)(void *); 12 | } SWtexture; 13 | 14 | extern SWfloat _sw_ubyte_to_float_table[256]; 15 | 16 | void swTexInit(SWtexture *t, SWenum mode, SWenum type, SWint w, SWint h, 17 | const void *pixels); 18 | void swTexInitMove(SWtexture *t, SWenum mode, SWenum type, SWint w, SWint h, void *pixels, 19 | void (*free)(void *)); 20 | void swTexInitMove_malloced(SWtexture *t, SWenum mode, SWenum type, SWint w, SWint h, 21 | void *pixels); 22 | void swTexDestroy(SWtexture *t); 23 | 24 | #define swTex_RGB888_GetColorFloat_RGBA(t, u, v, rgba) \ 25 | swPx_RGB888_GetColor_FRGBA_UV((t)->w, (t)->h, (t)->pixels, u, v, rgba) 26 | 27 | #define swTexGetColorFloat_RGBA(t, u, v, rgba) \ 28 | { \ 29 | SWint x = (SWint)(u * (t)->w) & ((t)->w - 1); \ 30 | SWint y = (SWint)(v * (t)->h) & ((t)->h - 1); \ 31 | \ 32 | switch ((t)->type) { \ 33 | case SW_UNSIGNED_BYTE: { \ 34 | SWint i; \ 35 | SWubyte *p = (SWubyte *)(t)->pixels; \ 36 | if ((t)->mode == SW_RGB) { \ 37 | p += 3 * (y * (t)->w + x); \ 38 | rgba[3] = (SWfloat)1.0; \ 39 | } else if ((t)->mode == SW_RGBA) { \ 40 | p += 4 * (y * (t)->w + x); \ 41 | rgba[3] = _sw_ubyte_to_float_table[p[3]]; \ 42 | } \ 43 | \ 44 | for (i = 0; i < 3; i++) { \ 45 | rgba[i] = _sw_ubyte_to_float_table[p[i]]; \ 46 | } \ 47 | } break; \ 48 | case SW_COMPRESSED: { \ 49 | const SWubyte *table = (const SWubyte *)(t)->pixels; \ 50 | const SWubyte *pixels = (const SWubyte *)(t)->pixels + 256 * 4 * 4; \ 51 | \ 52 | SWint i = 4 * (2 * (y & 1) + (x & 1)); \ 53 | x >>= 1; \ 54 | y >>= 1; \ 
55 | const SWubyte index = pixels[y * ((t)->w >> 1) + x]; \ 56 | const SWubyte *_col = &table[index * 16 + i]; \ 57 | \ 58 | for (i = 0; i < 4; i++) { \ 59 | rgba[i] = _sw_ubyte_to_float_table[_col[i]]; \ 60 | } \ 61 | } break; \ 62 | default: \ 63 | break; \ 64 | } \ 65 | } 66 | 67 | #define swTexGetColorUbyte_RGBA(t, u, v, rgba) \ 68 | swPxGetColorUbyte_RGBA(t->type, t->mode, t->w, t->h, t->pixels, u, v, rgba) 69 | #define swTexGetColorUbyte_BGRA(t, u, v, bgra) \ 70 | swPxGetColorUbyte_BGRA(t->type, t->mode, t->w, t->h, t->pixels, u, v, bgra) 71 | 72 | #endif /* SW_TEXTURE_H */ 73 | -------------------------------------------------------------------------------- /SWtypes.h: -------------------------------------------------------------------------------- 1 | #ifndef SW_TYPES_H 2 | #define SW_TYPES_H 3 | 4 | #include 5 | 6 | typedef uint8_t SWboolean; 7 | typedef int32_t SWint; 8 | typedef uint32_t SWuint; 9 | typedef uint8_t SWubyte; 10 | typedef uint16_t SWushort; 11 | typedef float SWfloat; 12 | 13 | typedef struct SWvtx_attribute { 14 | SWint size, stride; 15 | void *data; 16 | } SWvtx_attribute; 17 | 18 | typedef struct SWuniform { 19 | SWint type; 20 | void *data; 21 | } SWuniform; 22 | 23 | /* Check windows */ 24 | #if defined(_WIN32) || defined(_WIN64) 25 | #if defined(_WIN64) && _WIN64 26 | #define ENVIRONMENT64 27 | #else 28 | #define ENVIRONMENT32 29 | #endif 30 | #endif 31 | 32 | /* Check GCC */ 33 | #ifdef __GNUC__ 34 | #if __x86_64__ || __ppc64__ || __aarch64__ 35 | #define ENVIRONMENT64 36 | #else 37 | #define ENVIRONMENT32 38 | #endif 39 | #endif 40 | 41 | #ifdef __GNUC__ 42 | //#define sw_inline __attribute__((always_inline)) 43 | //#define sw_inline __attribute__((inline)) 44 | #if !defined(__ANDROID__) && !defined(__EMSCRIPTEN__) 45 | //#define sw_inline __always_inline 46 | #else 47 | #define sw_inline __inline__ 48 | #endif 49 | #define RESTRICT __restrict__ 50 | 51 | #define ALIGNED(x, N) x __attribute__((aligned(N))); 52 | #endif 53 | #ifdef 
_MSC_VER 54 | //#define sw_inline __forceinline 55 | 56 | #define RESTRICT __restrict 57 | 58 | #define ALIGNED(x, N) __declspec(align(N)) x 59 | #endif 60 | 61 | #ifndef sw_inline 62 | #define sw_inline static 63 | #endif 64 | 65 | #if defined(ENVIRONMENT32) && !defined(__ANDROID__) 66 | #ifdef __GNUC__ 67 | #define FASTCALL __attribute__((fastcall)) 68 | #else 69 | #define FASTCALL __fastcall 70 | #endif 71 | #else 72 | /* should be fastcall as it is */ 73 | #define FASTCALL 74 | #endif 75 | 76 | /*#ifdef __cplusplus 77 | extern "C" { 78 | #endif 79 | extern struct SWcontext *sw_cur_context; 80 | #ifdef __cplusplus 81 | } 82 | #endif*/ 83 | 84 | #define VS_IN SWvtx_attribute *RESTRICT attribs, SWuint index, SWuniform *uniforms 85 | #define VS_OUT SWfloat *RESTRICT out_data 86 | #define FS_IN SWfloat *RESTRICT f_in_data, SWuniform *RESTRICT uniforms 87 | #define FS_OUT SWfloat *RESTRICT f_out_data, SWint *RESTRICT b_discard 88 | 89 | #define V_FATTR(x) ((SWfloat *)((char *)attribs[x].data + index * attribs[x].stride)) 90 | #define V_POS_OUT out_data 91 | #define V_FVARYING(x) ((SWfloat *)(out_data + 4 + x)) 92 | 93 | #define F_COL_OUT f_out_data 94 | #define F_POS_IN f_in_data 95 | #define F_FVARYING_IN(x) ((const SWfloat *)(f_in_data + 4 + x)) 96 | 97 | #define I_UNIFORM(x) ((SWint *)uniforms[(x)].data) 98 | #define I_UNIFORM_S(x) (*(SWint *)uniforms[(x)].data) 99 | #define F_UNIFORM(x) ((SWfloat *)uniforms[(x)].data) 100 | #define F_UNIFORM_S(x) (*(SWfloat *)uniforms[(x)].data) 101 | 102 | #define VEC4_SIZE (4 * sizeof(SWfloat)) 103 | #define VEC3_SIZE (3 * sizeof(SWfloat)) 104 | #define VEC2_SIZE (2 * sizeof(SWfloat)) 105 | #define FLOAT_SIZE sizeof(SWfloat) 106 | 107 | #define INTERP_DATA1 void *varying1 108 | #define INTERP_DATA2 void *varying2 109 | #define INTERP_RES void *res 110 | 111 | #define DISCARD \ 112 | (*b_discard) = 1; \ 113 | return 114 | 115 | #define TEXTURE(slot, uv, col) swTexture(slot, uv, col) 116 | /*#define TESTURE_RGB888(slot, uv, col) 
{ \ 117 | SWtexture *t = &sw_cur_context->textures[sw_cur_context->binded_textures[slot]];\ 118 | swTex_RGB888_GetColorFloat_RGBA(t, uv[0], uv[1], col); \ 119 | }*/ 120 | /* Interpolation helpers: lerpff blends two values by t, lerpfff blends three values by the weights in uvw. Arguments are parenthesized so expression arguments expand correctly. */ 121 | #define lerpff(f1, f2, t) ((f1) + (t) * ((f2) - (f1))) 122 | #define lerpfff(f1, f2, f3, uvw) ((f1) * (uvw)[0] + (f2) * (uvw)[1] + (f3) * (uvw)[2]) 123 | 124 | typedef void(FASTCALL *vtx_shader_proc)(VS_IN, VS_OUT); 125 | typedef void(FASTCALL *frag_shader_proc)(FS_IN, FS_OUT); 126 | 127 | #define VSHADER void FASTCALL 128 | #define FSHADER void FASTCALL 129 | /* Swaps x and y through a temporary of type T. */ 130 | #define sw_swap(x, y, T) \ 131 | { \ 132 | T tmp = (x); \ 133 | (x) = (y); \ 134 | (y) = tmp; \ 135 | } 136 | #define sw_abs(x) (((x) > 0) ? (x) : -(x)) /* note: evaluates x twice */ 137 | #define sw_min(_1, _2) (((_1) < (_2)) ? (_1) : (_2)) 138 | #define sw_max(_1, _2) (((_1) > (_2)) ? (_1) : (_2)) 139 | #define sw_clamp(x, min, max) ((x) < (min) ? (min) : ((x) > (max) ? (max) : (x))) 140 | /*#define sw_rotatei(_1, _2, _3) { SWint tmp = _1; _1 = _3; _3 = _2; _2 = tmp; }*/ 141 | #define sw_swapi(x, y) \ 142 | { \ 143 | SWint tmp = (x); \ 144 | (x) = (y); \ 145 | (y) = tmp; \ 146 | } 147 | #define sw_rotate_lefti(_0, _1, _2) \ 148 | { \ 149 | SWint tmp = (_0); \ 150 | (_0) = (_1); \ 151 | (_1) = (_2); \ 152 | (_2) = tmp; \ 153 | } 154 | #define sw_rotate_leftf(_0, _1, _2) \ 155 | { \ 156 | SWfloat tmp = (_0); \ 157 | (_0) = (_1); \ 158 | (_1) = (_2); \ 159 | (_2) = tmp; \ 160 | } 161 | #define sw_rotate_righti(_0, _1, _2) \ 162 | { \ 163 | SWint tmp = (_0); \ 164 | (_0) = (_2); \ 165 | (_2) = (_1); \ 166 | (_1) = tmp; \ 167 | } 168 | #define sw_rotate_lefti4(_0, _1, _2, _3) \ 169 | { \ 170 | SWint tmp = (_0); \ 171 | (_0) = (_1); \ 172 | (_1) = (_2); \ 173 | (_2) = (_3); \ 174 | (_3) = tmp; \ 175 | } 176 | /* res = v1 x v2 for 3-component vectors; res must not alias v1 or v2 because components are written before all inputs are read. */ 177 | #define sw_cross(res, v1, v2) \ 178 | (res)[0] = (v1)[1] * (v2)[2] - (v1)[2] * (v2)[1]; \ 179 | (res)[1] = (v1)[2] * (v2)[0] - (v1)[0] * (v2)[2]; \ 180 | (res)[2] = (v1)[0] * (v2)[1] - (v1)[1] * (v2)[0]; 181 | 182 | #define sw_copy(dest, src, num) \ 183 | switch (num) { \ 184 | case 16: \ 185 |
(dest)[15] = (src)[15]; \ 186 | case 15: \ 187 | (dest)[14] = (src)[14]; \ 188 | case 14: \ 189 | (dest)[13] = (src)[13]; \ 190 | case 13: \ 191 | (dest)[12] = (src)[12]; \ 192 | case 12: \ 193 | (dest)[11] = (src)[11]; \ 194 | case 11: \ 195 | (dest)[10] = (src)[10]; \ 196 | case 10: \ 197 | (dest)[9] = (src)[9]; \ 198 | case 9: \ 199 | (dest)[8] = (src)[8]; \ 200 | case 8: \ 201 | (dest)[7] = (src)[7]; \ 202 | case 7: \ 203 | (dest)[6] = (src)[6]; \ 204 | case 6: \ 205 | (dest)[5] = (src)[5]; \ 206 | case 5: \ 207 | (dest)[4] = (src)[4]; \ 208 | case 4: \ 209 | (dest)[3] = (src)[3]; \ 210 | case 3: \ 211 | (dest)[2] = (src)[2]; \ 212 | case 2: \ 213 | (dest)[1] = (src)[1]; \ 214 | case 1: \ 215 | (dest)[0] = (src)[0]; \ 216 | default:; \ 217 | } 218 | 219 | #define sw_add(dest, src, num) \ 220 | switch (num) { \ 221 | case 16: \ 222 | (dest)[15] += (src)[15]; \ 223 | case 15: \ 224 | (dest)[14] += (src)[14]; \ 225 | case 14: \ 226 | (dest)[13] += (src)[13]; \ 227 | case 13: \ 228 | (dest)[12] += (src)[12]; \ 229 | case 12: \ 230 | (dest)[11] += (src)[11]; \ 231 | case 11: \ 232 | (dest)[10] += (src)[10]; \ 233 | case 10: \ 234 | (dest)[9] += (src)[9]; \ 235 | case 9: \ 236 | (dest)[8] += (src)[8]; \ 237 | case 8: \ 238 | (dest)[7] += (src)[7]; \ 239 | case 7: \ 240 | (dest)[6] += (src)[6]; \ 241 | case 6: \ 242 | (dest)[5] += (src)[5]; \ 243 | case 5: \ 244 | (dest)[4] += (src)[4]; \ 245 | case 4: \ 246 | (dest)[3] += (src)[3]; \ 247 | case 3: \ 248 | (dest)[2] += (src)[2]; \ 249 | case 2: \ 250 | (dest)[1] += (src)[1]; \ 251 | case 1: \ 252 | (dest)[0] += (src)[0]; \ 253 | default:; \ 254 | } 255 | 256 | #define sw_add_3(dest, src) \ 257 | (dest)[0] += (src)[0]; \ 258 | (dest)[1] += (src)[1]; \ 259 | (dest)[2] += (src)[2]; 260 | 261 | #define sw_add_q(dest, src, num) \ 262 | switch (num) { \ 263 | case 16: \ 264 | case 15: \ 265 | case 14: \ 266 | case 13: \ 267 | (dest)[12] += (src)[12]; \ 268 | (dest)[13] += (src)[13]; \ 269 | (dest)[14] += (src)[14]; \ 270 | 
(dest)[15] += (src)[15]; \ 271 | case 12: \ 272 | case 11: \ 273 | case 10: \ 274 | case 9: \ 275 | (dest)[8] += (src)[8]; \ 276 | (dest)[9] += (src)[9]; \ 277 | (dest)[10] += (src)[10]; \ 278 | (dest)[11] += (src)[11]; \ 279 | case 8: \ 280 | case 7: \ 281 | case 6: \ 282 | case 5: \ 283 | (dest)[4] += (src)[4]; \ 284 | (dest)[5] += (src)[5]; \ 285 | (dest)[6] += (src)[6]; \ 286 | (dest)[7] += (src)[7]; \ 287 | case 4: \ 288 | case 3: \ 289 | case 2: \ 290 | case 1: \ 291 | (dest)[3] += (src)[3]; \ 292 | (dest)[2] += (src)[2]; \ 293 | (dest)[1] += (src)[1]; \ 294 | (dest)[0] += (src)[0]; \ 295 | default:; \ 296 | } 297 | 298 | #endif /* SW_TYPES_H */ 299 | -------------------------------------------------------------------------------- /SWzbuffer.c: -------------------------------------------------------------------------------- 1 | #include "SWzbuffer.h" 2 | 3 | #include <stdlib.h> 4 | #include <string.h> 5 | /* Pointer to the tile range covering pixel (x, y); uses the same (y / SW_TILE_SIZE) * tile_w + (x / SW_TILE_SIZE) indexing as swZbufGetTileRange in SWzbuffer.h. */ 6 | #define _swGetTile(zb, x, y) \ 7 | (&(zb)->tiles[((y) / SW_TILE_SIZE) * (zb)->tile_w + ((x) / SW_TILE_SIZE)]) 8 | /* Zeroes *zb, stores the dimensions and zmax, allocates the w*h depth array and the tile_w*tile_h tile-range array, then clears depth to 1. If either allocation fails, *zb is released and left zeroed. */ 9 | void swZbufInit(SWzbuffer *zb, const SWint w, const SWint h, const SWfloat zmax) { 10 | memset(zb, 0, sizeof(SWzbuffer)); 11 | zb->w = w; 12 | zb->h = h; 13 | zb->tile_w = (w + (SW_TILE_SIZE - 1)) / SW_TILE_SIZE; 14 | zb->tile_h = (h + (SW_TILE_SIZE - 1)) / SW_TILE_SIZE; 15 | zb->depth = (SWfloat *)malloc(sizeof(SWfloat) * w * h); 16 | zb->tiles = (SWzrange *)malloc(sizeof(SWzrange) * zb->tile_w * zb->tile_h); 17 | if (!zb->depth || !zb->tiles) { swZbufDestroy(zb); return; } /* do not clear through a NULL buffer */ zb->zmax = zmax; 18 | swZbufClearDepth(zb, 1); 19 | } 20 | /* Frees both arrays (free(NULL) is a no-op) and zeroes *zb. */ 21 | void swZbufDestroy(SWzbuffer *zb) { 22 | free(zb->depth); 23 | free(zb->tiles); 24 | memset(zb, 0, sizeof(SWzbuffer)); 25 | } 26 | /* Fills the first row with val, then copies that row into every following row; the tile ranges are filled the same way. */ 27 | void swZbufClearDepth(SWzbuffer *zb, const SWfloat val) { 28 | SWint i; 29 | for (i = 0; i < zb->w; i++) { 30 | zb->depth[i] = val; 31 | } 32 | for (i = 1; i < zb->h; i++) { 33 | memcpy(&zb->depth[i * zb->w], zb->depth, sizeof(SWfloat) * zb->w); 34 | } 35 | 36 | for (i = 0; i < zb->tile_w; i++) { 37 | zb->tiles[i].min = zb->tiles[i].max = val; 38 |
} 39 | 40 | for (i = 1; i < zb->tile_h; i++) { 41 | memcpy(&zb->tiles[i * zb->tile_w], zb->tiles, sizeof(SWzrange) * zb->tile_w); 42 | } 43 | } 44 | 45 | /* Without tiles */ 46 | #if 1 47 | void swZbufSetDepth_(SWzbuffer *zb, const SWint x, const SWint y, const SWfloat val) { 48 | const SWint index = y * zb->w + x; 49 | zb->depth[index] = val; 50 | } 51 | SWint swZbufTestDepth_(SWzbuffer *zb, const SWint x, const SWint y, const SWfloat z) { 52 | return z <= zb->depth[y * zb->w + x]; 53 | } 54 | #else 55 | /* Tiled variant (compiled out): keeps a per-tile min/max and the texel index that produced each. NOTE(review): it reads min_index/max_index, which the SWzrange declared in SWzbuffer.h does not have. */ 56 | #include 57 | 58 | void swZbufSetDepth_(SWzbuffer *zb, SWint x, SWint y, SWfloat val) { 59 | SWzrange *r = _swGetTile(zb, x, y); 60 | SWint i, j; 61 | SWint index = y * zb->w + x; 62 | /*SWint iiii = y & ~(SW_TILE_SIZE - 1) * zb->tile_w + (x & ~(SW_TILE_SIZE - 1));*/ 63 | /*SWint tile_index1 = y & ~(SW_TILE_SIZE - 1) * zb->tile_w + (x & ~(SW_TILE_SIZE - 64 | 1)); SWint tile_index = index / (SW_TILE_SIZE * SW_TILE_SIZE); assert(tile_index1 == 65 | tile_index); SWzrange *r = &zb->tiles[tile_index];*/ 66 | zb->depth[index] = val; 67 | if (val < r->min) { 68 | r->min = val; 69 | r->min_index = index; 70 | } else if (index == r->min_index && val != r->min) { 71 | r->min = val; 72 | goto RECALC; 73 | } 74 | 75 | if (val > r->max) { 76 | r->max = val; 77 | r->max_index = index; 78 | } else if (index == r->max_index && val != r->max) { /* the texel that held the tile maximum changed */ 79 | r->max = val; 80 | goto RECALC; 81 | } 82 | return; 83 | 84 | RECALC: 85 | x &= ~(SW_TILE_SIZE - 1); 86 | y &= ~(SW_TILE_SIZE - 1); 87 | 88 | for (i = 0; i < SW_TILE_SIZE; i++) { 89 | for (j = 0; j < SW_TILE_SIZE; j++) { 90 | SWint ind = (y + i) * zb->w + (x + j); 91 | /*SWint tile_ind = ind & ~(SW_TILE_SIZE - 1);*/ 92 | if (zb->depth[ind] < r->min) { 93 | r->min = zb->depth[ind]; 94 | r->min_index = ind; 95 | } else if (zb->depth[ind] > r->max) { 96 | r->max = zb->depth[ind]; 97 | r->max_index = ind; 98 | } 99 | } 100 | } 101 | } 102 | 103 | SWint swZbufTestDepth_(SWzbuffer *zb, SWint x, SWint y, SWfloat z) { 104 | SWzrange *r =
_swGetTile(zb, x, y); 105 | if (z < r->max) { 106 | if (z < r->min) { 107 | r->min = z; 108 | return 1; 109 | } 110 | return z < zb->depth[y * zb->w + x]; 111 | } else { 112 | r->max = z; 113 | } 114 | return 0; 115 | } 116 | 117 | #endif 118 | 119 | SWoccresult swZbufTriTestDepth(SWzbuffer *zb, SWint min[2], SWint max[2], SWfloat *attrs1, 120 | SWfloat *attrs2, SWfloat *attrs3) { 121 | SWfloat /*tri_min_z, */ tri_max_z, buf_min_z = 1, buf_max_z = 0; 122 | SWint i, j; 123 | /*tri_min_z = sw_min(attrs1[2], sw_min(attrs2[2], attrs3[2]));*/ 124 | tri_max_z = sw_max(attrs1[2], sw_max(attrs2[2], attrs3[2])); 125 | 126 | for (j = min[1] / SW_TILE_SIZE; j < max[1] / SW_TILE_SIZE; j++) { 127 | for (i = min[0] / SW_TILE_SIZE; i < max[0] / SW_TILE_SIZE; i++) { 128 | SWzrange *r = &zb->tiles[j * zb->tile_w + i]; 129 | buf_min_z = sw_min(buf_min_z, r->min); 130 | buf_max_z = sw_max(buf_max_z, r->max); 131 | } 132 | } 133 | 134 | if (tri_max_z < buf_min_z) { 135 | return SW_NONOCCLUDED; 136 | } else { 137 | return SW_PARTIAL; 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /SWzbuffer.h: -------------------------------------------------------------------------------- 1 | #ifndef SW_ZBUFFER_H 2 | #define SW_ZBUFFER_H 3 | 4 | #include "SWcore.h" 5 | 6 | typedef struct SWzrange { 7 | SWfloat min, max; 8 | } SWzrange; 9 | 10 | typedef struct SWzbuffer { 11 | SWint w, h; 12 | SWfloat *depth; 13 | SWint tile_w, tile_h; 14 | SWzrange *tiles; 15 | SWfloat zmax; 16 | } SWzbuffer; 17 | 18 | typedef enum SWoccresult { SW_OCCLUDED = 0, SW_NONOCCLUDED, SW_PARTIAL } SWoccresult; 19 | 20 | void swZbufInit(SWzbuffer *zb, SWint w, SWint h, SWfloat zmax); 21 | void swZbufDestroy(SWzbuffer *zb); 22 | 23 | void swZbufClearDepth(SWzbuffer *zb, SWfloat val); 24 | 25 | #define swZbufSetDepth(zb, x, y, val) (zb)->depth[(y) * (zb)->w + (x)] = (val) 26 | #define swZbufGetDepth(zb, x, y) (zb)->depth[(y) * (zb)->w + (x)] 27 | #define 
swZbufTestDepth(zb, x, y, z) ((z) <= (zb)->depth[(y) * (zb)->w + (x)]) 28 | 29 | #define swZbufSetTileRange(zb, x, y, zmin, zmax) \ 30 | { \ 31 | SWzrange *_zr = \ 32 | &(zb)->tiles[((y) / SW_TILE_SIZE) * (zb)->tile_w + ((x) / SW_TILE_SIZE)]; \ 33 | _zr->min = (zmin); \ 34 | _zr->max = (zmax); \ 35 | } 36 | 37 | #define swZbufUpdateTileRange(zb, x, y, zmin, zmax) \ 38 | { \ 39 | SWzrange *_zr = \ 40 | &(zb)->tiles[((y) / SW_TILE_SIZE) * (zb)->tile_w + ((x) / SW_TILE_SIZE)]; \ 41 | _zr->min = sw_min((zmin), _zr->min); \ 42 | _zr->max = sw_max((zmax), _zr->max); \ 43 | } 44 | 45 | sw_inline SWoccresult swZbufTestTileRange(const SWzbuffer *zb, SWint x, SWint y, 46 | SWfloat min, SWfloat max) { 47 | SWzrange *zr = &zb->tiles[(y / SW_TILE_SIZE) * zb->tile_w + (x / SW_TILE_SIZE)]; 48 | if (max < zr->min) { 49 | return SW_NONOCCLUDED; 50 | } else if (min > zr->max) { 51 | return SW_OCCLUDED; 52 | } else { 53 | return SW_PARTIAL; 54 | } 55 | } 56 | 57 | sw_inline SWzrange *swZbufGetTileRange(const SWzbuffer *zb, SWint x, SWint y) { 58 | return &zb->tiles[(y / SW_TILE_SIZE) * zb->tile_w + (x / SW_TILE_SIZE)]; 59 | } 60 | 61 | void swZbufSetDepth_(SWzbuffer *zb, SWint x, SWint y, SWfloat val); 62 | SWint swZbufTestDepth_(SWzbuffer *zb, SWint x, SWint y, SWfloat z); 63 | 64 | SWoccresult swZbufTriTestDepth(SWzbuffer *zb, SWint min[2], SWint max[2], SWfloat *attrs1, 65 | SWfloat *attrs2, SWfloat *attrs3); 66 | 67 | #endif /* SW_ZBUFFER_H */ 68 | -------------------------------------------------------------------------------- /_SW.c: -------------------------------------------------------------------------------- 1 | 2 | #include "SWbuffer.c" 3 | #include "SWcompress.c" 4 | #include "SWcontext.c" 5 | #include "SWcore.c" 6 | #include "SWcpu.c" 7 | #include "SWdraw.c" 8 | #include "SWframebuffer.c" 9 | #include "SWprogram.c" 10 | #include "SWrasterize.c" 11 | #include "SWtexture.c" 12 | #include "SWzbuffer.c" 13 | 14 | #include "SWculling.c" 15 | #include "SWculling_Ref.c" 
-------------------------------------------------------------------------------- /_SW_AVX2.c: -------------------------------------------------------------------------------- 1 | #if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__) 2 | /* x86-only unit: _M_IX86/_M_X64 are the MSVC target macros, __i386__/__x86_64__ the GCC/Clang ones. */ 3 | #ifdef __GNUC__ 4 | #pragma GCC push_options 5 | #pragma GCC target ("avx2") 6 | #pragma GCC target ("fma") 7 | #pragma clang attribute push (__attribute__((target("avx2,fma"))), apply_to=function) 8 | #endif 9 | 10 | #include "SWculling_AVX2.c" 11 | /* Returns the value of extended control register 0 as read by _xgetbv(0). */ 12 | unsigned long long get_xcr_feature_mask(void) { 13 | return _xgetbv(0); 14 | } 15 | 16 | #ifdef __GNUC__ 17 | #pragma clang attribute pop 18 | #pragma GCC pop_options 19 | #endif 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /_SW_AVX512.c: -------------------------------------------------------------------------------- 1 | #if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)) && \ 2 | (!defined(_MSC_VER) || _MSC_VER > 1916) 3 | 4 | #ifdef __GNUC__ 5 | #pragma GCC push_options 6 | #pragma GCC target("avx512f") 7 | #pragma GCC target("avx512bw") 8 | #pragma GCC target("avx512dq") 9 | #pragma clang attribute push(__attribute__((target("avx512f,avx512bw,avx512dq"))), apply_to = function) 10 | #endif 11 | 12 | #include "SWculling_AVX512.c" 13 | 14 | #ifdef __GNUC__ 15 | #pragma clang attribute pop 16 | #pragma GCC pop_options 17 | #endif 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /_SW_NEON.c: -------------------------------------------------------------------------------- 1 | #if defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64) 2 | #include "SWculling_NEON.c" 3 | #endif 4 | -------------------------------------------------------------------------------- /_SW_SSE2.c: -------------------------------------------------------------------------------- 1 | #if defined(_M_IX86) || defined(_M_X64) ||
defined(__i386__) || defined(__x86_64__) 2 | 3 | #ifdef __GNUC__ 4 | #pragma GCC push_options 5 | #pragma GCC target ("sse4.1") 6 | #endif 7 | 8 | #include "SWculling_SSE2.c" 9 | 10 | #ifdef __GNUC__ 11 | #pragma GCC pop_options 12 | #endif 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sergcpp/SW/4681722d9b76a129965085a0767c897f3a8ea146/img1.jpg -------------------------------------------------------------------------------- /img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sergcpp/SW/4681722d9b76a129965085a0767c897f3a8ea146/img2.jpg -------------------------------------------------------------------------------- /img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sergcpp/SW/4681722d9b76a129965085a0767c897f3a8ea146/img3.jpg -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.1) 2 | project(test_SW) 3 | # Test runner executable; links the SW library target plus any extra libraries listed in LIBS. 4 | add_executable(test_SW main.c 5 | test_buffer.c 6 | test_common.h 7 | test_context.c 8 | test_framebuffer.c 9 | test_pixels.c 10 | test_program.c 11 | test_texture.c 12 | test_zbuffer.c) 13 | target_link_libraries(test_SW PRIVATE ${LIBS} SW) 14 | -------------------------------------------------------------------------------- /tests/main.c: -------------------------------------------------------------------------------- 1 | /* Runs every test group in sequence; prints OK when all of them return. */ 2 | #include <stdio.h> 3 | 4 | void test_buffer(void); 5 | void test_context(void); 6 | void test_framebuffer(void); 7 | void test_pixels(void); 8 | void test_program(void); 9 | void test_texture(void); 10 | void test_zbuffer(void); 11 | 12 | int main(void) { 13 | test_buffer(); 14 | test_context();
15 | test_framebuffer(); 16 | test_pixels(); 17 | test_program(); 18 | test_texture(); 19 | test_zbuffer(); 20 | puts("OK"); 21 | } -------------------------------------------------------------------------------- /tests/test_buffer.c: -------------------------------------------------------------------------------- 1 | #include "test_common.h" 2 | 3 | #include 4 | 5 | #include "../SWbuffer.h" 6 | 7 | void test_buffer() { 8 | { 9 | // Buffer init/destroy 10 | SWbuffer b; 11 | const char data1[] = "Data we put in buffer"; 12 | swBufInit(&b, sizeof(data1), data1); 13 | require(b.data); 14 | swBufDestroy(&b); 15 | require(b.data == NULL); 16 | } 17 | 18 | { 19 | // Buffer data 20 | SWbuffer b; 21 | const char data1[] = "Data we put in buffer"; 22 | swBufInit(&b, sizeof(data1), data1); 23 | char data1_chk[sizeof(data1)]; 24 | swBufGetData(&b, 0, sizeof(data1), data1_chk); 25 | require(strcmp(data1, data1_chk) == 0); 26 | swBufDestroy(&b); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /tests/test_common.h: -------------------------------------------------------------------------------- 1 | #ifndef TEST_COMMON_H 2 | #define TEST_COMMON_H 3 | 4 | #include 5 | #include 6 | 7 | static void handle_assert(int passed, const char* assert, const char* file, long line) { 8 | if (!passed) { 9 | printf("Assertion failed %s in %s at line %i\n", assert, file, (int)line); 10 | exit(-1); 11 | } 12 | } 13 | 14 | #define require(x) handle_assert((x) != 0, #x , __FILE__, __LINE__ ) 15 | 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /tests/test_context.c: -------------------------------------------------------------------------------- 1 | #include "test_common.h" 2 | 3 | #include 4 | #include 5 | 6 | #include "../SWcontext.h" 7 | 8 | void test_context() { 9 | SWcontext *ctx = malloc(sizeof(SWcontext)); 10 | swCtxInit(ctx, 100, 100); 11 | require(ctx->framebuffers[0].pixels != NULL); 
12 | 13 | { 14 | // Context buffer creation 15 | SWint buf1 = swCtxCreateBuffer(ctx); 16 | SWint buf2 = swCtxCreateBuffer(ctx); 17 | require(buf2 == buf1 + 1); 18 | const char data1[] = "Data we put in buffer1"; 19 | const char data2[] = "Data for buffer2"; 20 | swCtxBindBuffer(ctx, SW_ARRAY_BUFFER, buf1); 21 | swCtxBindBuffer(ctx, SW_INDEX_BUFFER, buf2); 22 | swCtxBufferData(ctx, SW_ARRAY_BUFFER, sizeof(data1), data1); 23 | swCtxBufferData(ctx, SW_INDEX_BUFFER, sizeof(data2), data2); 24 | char data1_chk[sizeof(data1)]; 25 | char data2_chk[sizeof(data2)]; 26 | swCtxGetBufferSubData(ctx, SW_ARRAY_BUFFER, 0, sizeof(data1), data1_chk); 27 | swCtxGetBufferSubData(ctx, SW_INDEX_BUFFER, 0, sizeof(data2), data2_chk); 28 | require(strcmp(data1, data1_chk) == 0); 29 | require(strcmp(data2, data2_chk) == 0); 30 | swCtxDeleteBuffer(ctx, buf1); 31 | SWint buf3 = swCtxCreateBuffer(ctx); 32 | require(buf3 == buf1); 33 | swCtxDeleteBuffer(ctx, buf2); 34 | swCtxDeleteBuffer(ctx, buf3); 35 | } 36 | 37 | swCtxDestroy(ctx); 38 | free(ctx); 39 | } 40 | -------------------------------------------------------------------------------- /tests/test_framebuffer.c: -------------------------------------------------------------------------------- 1 | #include "test_common.h" 2 | 3 | #include "../SWframebuffer.h" 4 | #include "../SWtexture.h" 5 | 6 | SWubyte tex[] = { 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 255, 7 | 1, 1, 0, 11, 13, 14, 190, 111, 20, 20, 20, 20, 8 | 10, 111, 12, 190, 111, 20, 0, 1, 0, 0, 0, 1, 9 | 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0 10 | }; 11 | 12 | void test_framebuffer() { 13 | 14 | #define TEST_BEGIN \ 15 | SWframebuffer f; \ 16 | swFbufInit(&f, SW_BGRA8888, 5, 20, 1); \ 17 | require(f.pixels != NULL); \ 18 | require(f.zbuf != NULL) 19 | 20 | #define TEST_END \ 21 | swFbufDestroy(&f); \ 22 | require(f.pixels == NULL); \ 23 | require(f.zbuf == NULL) 24 | 25 | { 26 | // Framebuffer swFbufClearColor_RGBA 27 | TEST_BEGIN; 28 | 29 | SWubyte col[4] = { 1, 2, 3, 4 }; 30 | 
swFbufClearColor_RGBA(&f, col); 31 | for (int y = 0; y < f.h; y++) { 32 | for (int x = 0; x < f.w; x++) { 33 | require(((SWubyte*)f.pixels)[4 * (y * f.w + x) + 0] == 3); 34 | require(((SWubyte*)f.pixels)[4 * (y * f.w + x) + 1] == 2); 35 | require(((SWubyte*)f.pixels)[4 * (y * f.w + x) + 2] == 1); 36 | require(((SWubyte*)f.pixels)[4 * (y * f.w + x) + 3] == 4); 37 | } 38 | } 39 | 40 | TEST_END; 41 | } 42 | 43 | { 44 | // Framebuffer swFbufClearColorFloat 45 | TEST_BEGIN; 46 | 47 | swFbufClearColorFloat(&f, 1, 0.5f, 0.25f, 0.75f); 48 | for (int y = 0; y < f.h; y++) { 49 | for (int x = 0; x < f.w; x++) { 50 | require(((SWubyte*)f.pixels)[4 * (y * f.w + x) + 0] == 63); 51 | require(((SWubyte*)f.pixels)[4 * (y * f.w + x) + 1] == 127); 52 | require(((SWubyte*)f.pixels)[4 * (y * f.w + x) + 2] == 255); 53 | require(((SWubyte*)f.pixels)[4 * (y * f.w + x) + 3] == 191); 54 | } 55 | } 56 | 57 | TEST_END; 58 | } 59 | 60 | { 61 | // Framebuffer swFbufSetPixelFloat_RGBA 62 | TEST_BEGIN; 63 | 64 | swFbufClearColorFloat(&f, 1, 0.5f, 0.25f, 1); 65 | SWfloat col[4] = { 1, 0, 0.5f, 0.5f }; 66 | swFbufSetPixel_FRGBA(&f, 4, 13, col); 67 | 68 | require(((SWubyte*)f.pixels)[4 * (13 * f.w + 4) + 0] == 127); 69 | require(((SWubyte*)f.pixels)[4 * (13 * f.w + 4) + 1] == 0); 70 | require(((SWubyte*)f.pixels)[4 * (13 * f.w + 4) + 2] == 255); 71 | require(((SWubyte*)f.pixels)[4 * (13 * f.w + 4) + 3] == 127); 72 | 73 | TEST_END; 74 | } 75 | 76 | { 77 | // Framebuffer swFbufBlitTexture 78 | TEST_BEGIN; 79 | 80 | swFbufClearColorFloat(&f, 1, 0.0f, 0.0f, 1); 81 | swFbufBlitPixels(&f, 1, 10, 0, SW_UNSIGNED_BYTE, SW_RGB, 4, 4, tex, 1); 82 | 83 | require(((SWubyte*)f.pixels)[4 * (11 * f.w + 2) + 0] == 14); 84 | require(((SWubyte*)f.pixels)[4 * (11 * f.w + 2) + 1] == 13); 85 | require(((SWubyte*)f.pixels)[4 * (11 * f.w + 2) + 2] == 11); 86 | require(((SWubyte*)f.pixels)[4 * (11 * f.w + 2) + 3] == 255); 87 | 88 | require(((SWubyte*)f.pixels)[4 * (12 * f.w + 2) + 0] == 20); 89 | 
require(((SWubyte*)f.pixels)[4 * (12 * f.w + 2) + 1] == 111); 90 | require(((SWubyte*)f.pixels)[4 * (12 * f.w + 2) + 2] == 190); 91 | require(((SWubyte*)f.pixels)[4 * (12 * f.w + 2) + 3] == 255); 92 | 93 | require(((SWubyte*)f.pixels)[4 * (5 * f.w + 3) + 0] == 0); 94 | require(((SWubyte*)f.pixels)[4 * (5 * f.w + 3) + 1] == 0); 95 | require(((SWubyte*)f.pixels)[4 * (5 * f.w + 3) + 2] == 255); 96 | require(((SWubyte*)f.pixels)[4 * (5 * f.w + 3) + 3] == 255); 97 | 98 | TEST_END; 99 | } 100 | } 101 | #undef TEST_BEGIN 102 | #undef TEST_END -------------------------------------------------------------------------------- /tests/test_pixels.c: -------------------------------------------------------------------------------- 1 | #include "test_common.h" 2 | 3 | #include 4 | #include 5 | 6 | #include "../SWtypes.h" 7 | #include "../SWpixels.h" 8 | 9 | static const SWubyte px_RGB888[] = { 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 255, 10 | 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 11 | 1, 1, 0, 11, 13, 14, 190, 111, 20, 20, 20, 20, 12 | 10, 111, 12, 190, 111, 20, 0, 1, 0, 0, 0, 1 13 | }; 14 | 15 | static const SWubyte px_RGBA8888[] = { 0, 0, 0, 1, 1, 0, 0, 2, 0, 1, 0, 3, 0, 0, 255, 4, 16 | 1, 0, 0, 13, 0, 1, 0, 14, 0, 0, 1, 15, 0, 0, 0, 16, 17 | 1, 1, 0, 5, 11, 13, 14, 6, 190, 111, 20, 7, 20, 20, 20, 8, 18 | 10, 111, 12, 9, 190, 111, 20, 10, 0, 1, 0, 11, 0, 0, 1, 12 19 | }; 20 | 21 | static const SWfloat px_FRGBA[] = { 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 22 | 0.5f, 0, 0, 0, 0, 0.5f, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 23 | 0, 0.5f, 0, 0, 0, 0.5f, 0.5f, 0, 0, 0, 0, 0, 1, 1, 1, 0, 24 | 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0 25 | }; 26 | 27 | #define REQUIRE_VEC4_EQ(vec, _0, _1, _2, _3) \ 28 | require((vec)[0] == (_0)); \ 29 | require((vec)[1] == (_1)); \ 30 | require((vec)[2] == (_2)); \ 31 | require((vec)[3] == (_3)); 32 | 33 | #define REQUIRE_FVEC4_EQ(vec, _0, _1, _2, _3) \ 34 | require(fabs((vec)[0] - (_0)) < 0.1f); \ 35 | require(fabs((vec)[1] - (_1)) < 0.1f); \ 36 | 
require(fabs((vec)[2] - (_2)) < 0.1f); \ 37 | require(fabs((vec)[3] - (_3)) < 0.1f); 38 | 39 | void test_pixels() { 40 | 41 | { 42 | // Get pixels RGB888 43 | SWubyte rgba[4], bgra[4]; 44 | SWfloat frgba[4], fbgra[4]; 45 | 46 | // First row 47 | swPx_RGB888_GetColor_RGBA8888(4, 4, px_RGB888, 0, 0, rgba); 48 | swPx_RGB888_GetColor_BGRA8888(4, 4, px_RGB888, 0, 0, bgra); 49 | swPx_RGB888_GetColor_FRGBA(4, 4, px_RGB888, 0, 0, frgba); 50 | swPx_RGB888_GetColor_FBGRA(4, 4, px_RGB888, 0, 0, fbgra); 51 | REQUIRE_VEC4_EQ(rgba, 0, 0, 0, 255); 52 | REQUIRE_VEC4_EQ(bgra, 0, 0, 0, 255); 53 | REQUIRE_FVEC4_EQ(frgba, 0, 0, 0, 1.0); 54 | REQUIRE_FVEC4_EQ(fbgra, 0, 0, 0, 1.0); 55 | 56 | swPx_RGB888_GetColor_RGBA8888(4, 4, px_RGB888, 3, 0, rgba); 57 | swPx_RGB888_GetColor_BGRA8888(4, 4, px_RGB888, 3, 0, bgra); 58 | swPx_RGB888_GetColor_FRGBA(4, 4, px_RGB888, 3, 0, frgba); 59 | swPx_RGB888_GetColor_FBGRA(4, 4, px_RGB888, 3, 0, fbgra); 60 | REQUIRE_VEC4_EQ(rgba, 0, 0, 255, 255); 61 | REQUIRE_VEC4_EQ(bgra, 255, 0, 0, 255); 62 | REQUIRE_FVEC4_EQ(frgba, 0, 0, 1.0, 1.0); 63 | REQUIRE_FVEC4_EQ(fbgra, 1.0, 0, 0, 1.0); 64 | 65 | // Last row 66 | swPx_RGB888_GetColor_RGBA8888(4, 4, px_RGB888, 0, 3, rgba); 67 | swPx_RGB888_GetColor_BGRA8888(4, 4, px_RGB888, 0, 3, bgra); 68 | swPx_RGB888_GetColor_FRGBA(4, 4, px_RGB888, 0, 3, frgba); 69 | swPx_RGB888_GetColor_FBGRA(4, 4, px_RGB888, 0, 3, fbgra); 70 | REQUIRE_VEC4_EQ(rgba, 10, 111, 12, 255); 71 | REQUIRE_VEC4_EQ(bgra, 12, 111, 10, 255); 72 | REQUIRE_FVEC4_EQ(frgba, 10.0 / 255, 111.0 / 255, 12.0 / 255, 1.0); 73 | REQUIRE_FVEC4_EQ(fbgra, 12.0 / 255, 111.0 / 255, 10.0 / 255, 1.0); 74 | 75 | swPx_RGB888_GetColor_RGBA8888(4, 4, px_RGB888, 3, 3, rgba); 76 | swPx_RGB888_GetColor_BGRA8888(4, 4, px_RGB888, 3, 3, bgra); 77 | swPx_RGB888_GetColor_FRGBA(4, 4, px_RGB888, 3, 3, frgba); 78 | swPx_RGB888_GetColor_FBGRA(4, 4, px_RGB888, 3, 3, fbgra); 79 | REQUIRE_VEC4_EQ(rgba, 0, 0, 1, 255); 80 | REQUIRE_VEC4_EQ(bgra, 1, 0, 0, 255); 81 | REQUIRE_FVEC4_EQ(frgba, 
0, 0, 1.0 / 255, 1.0); 82 | REQUIRE_FVEC4_EQ(fbgra, 1.0 / 255, 0, 0, 1.0); 83 | } 84 | 85 | { 86 | // Get pixels RGB888 87 | SWubyte rgba[4], bgra[4]; 88 | SWfloat frgba[4], fbgra[4]; 89 | 90 | // First row 91 | swPx_RGB888_GetColor_RGBA8888(4, 4, px_RGB888, 0, 0, rgba); 92 | swPx_RGB888_GetColor_BGRA8888(4, 4, px_RGB888, 0, 0, bgra); 93 | swPx_RGB888_GetColor_FRGBA(4, 4, px_RGB888, 0, 0, frgba); 94 | swPx_RGB888_GetColor_FBGRA(4, 4, px_RGB888, 0, 0, fbgra); 95 | REQUIRE_VEC4_EQ(rgba, 0, 0, 0, 255); 96 | REQUIRE_VEC4_EQ(bgra, 0, 0, 0, 255); 97 | REQUIRE_FVEC4_EQ(frgba, 0, 0, 0, 1.0); 98 | REQUIRE_FVEC4_EQ(fbgra, 0, 0, 0, 1.0); 99 | 100 | swPx_RGB888_GetColor_RGBA8888(4, 4, px_RGB888, 3, 0, rgba); 101 | swPx_RGB888_GetColor_BGRA8888(4, 4, px_RGB888, 3, 0, bgra); 102 | swPx_RGB888_GetColor_FRGBA(4, 4, px_RGB888, 3, 0, frgba); 103 | swPx_RGB888_GetColor_FBGRA(4, 4, px_RGB888, 3, 0, fbgra); 104 | REQUIRE_VEC4_EQ(rgba, 0, 0, 255, 255); 105 | REQUIRE_VEC4_EQ(bgra, 255, 0, 0, 255); 106 | REQUIRE_FVEC4_EQ(frgba, 0, 0, 1.0, 1.0); 107 | REQUIRE_FVEC4_EQ(fbgra, 1.0, 0, 0, 1.0); 108 | 109 | // Last row 110 | swPx_RGB888_GetColor_RGBA8888(4, 4, px_RGB888, 0, 3, rgba); 111 | swPx_RGB888_GetColor_BGRA8888(4, 4, px_RGB888, 0, 3, bgra); 112 | swPx_RGB888_GetColor_FRGBA(4, 4, px_RGB888, 0, 3, frgba); 113 | swPx_RGB888_GetColor_FBGRA(4, 4, px_RGB888, 0, 3, fbgra); 114 | REQUIRE_VEC4_EQ(rgba, 10, 111, 12, 255); 115 | REQUIRE_VEC4_EQ(bgra, 12, 111, 10, 255); 116 | REQUIRE_FVEC4_EQ(frgba, 10.0 / 255, 111.0 / 255, 12.0 / 255, 1.0); 117 | REQUIRE_FVEC4_EQ(fbgra, 12.0 / 255, 111.0 / 255, 10.0 / 255, 1.0); 118 | 119 | swPx_RGB888_GetColor_RGBA8888(4, 4, px_RGB888, 3, 3, rgba); 120 | swPx_RGB888_GetColor_BGRA8888(4, 4, px_RGB888, 3, 3, bgra); 121 | swPx_RGB888_GetColor_FRGBA(4, 4, px_RGB888, 3, 3, frgba); 122 | swPx_RGB888_GetColor_FBGRA(4, 4, px_RGB888, 3, 3, fbgra); 123 | REQUIRE_VEC4_EQ(rgba, 0, 0, 1, 255); 124 | REQUIRE_VEC4_EQ(bgra, 1, 0, 0, 255); 125 | REQUIRE_FVEC4_EQ(frgba, 0, 0, 1.0 
/ 255, 1.0); 126 | REQUIRE_FVEC4_EQ(fbgra, 1.0 / 255, 0, 0, 1.0); 127 | } 128 | 129 | { 130 | // Get pixels RGB888 by uv 131 | SWubyte rgba[4], bgra[4]; 132 | SWfloat frgba[4], fbgra[4]; 133 | 134 | // First row 135 | swPx_RGB888_GetColor_RGBA8888_UV(4, 4, px_RGB888, 0.1f, 0.1f, rgba); 136 | swPx_RGB888_GetColor_BGRA8888_UV(4, 4, px_RGB888, 0.1f, 0.1f, bgra); 137 | swPx_RGB888_GetColor_FRGBA_UV(4, 4, px_RGB888, 0.1f, 0.1f, frgba); 138 | swPx_RGB888_GetColor_FBGRA_UV(4, 4, px_RGB888, 0.1f, 0.1f, fbgra); 139 | REQUIRE_VEC4_EQ(rgba, 0, 0, 0, 255); 140 | REQUIRE_VEC4_EQ(bgra, 0, 0, 0, 255); 141 | REQUIRE_FVEC4_EQ(frgba, 0, 0, 0, 1.0); 142 | REQUIRE_FVEC4_EQ(fbgra, 0, 0, 0, 1.0); 143 | 144 | swPx_RGB888_GetColor_RGBA8888_UV(4, 4, px_RGB888, 0.7f, 0.1f, rgba); 145 | swPx_RGB888_GetColor_BGRA8888_UV(4, 4, px_RGB888, 0.7f, 0.1f, bgra); 146 | swPx_RGB888_GetColor_FRGBA_UV(4, 4, px_RGB888, 0.7f, 0.1f, frgba); 147 | swPx_RGB888_GetColor_FBGRA_UV(4, 4, px_RGB888, 0.7f, 0.1f, fbgra); 148 | REQUIRE_VEC4_EQ(rgba, 0, 1, 0, 255); 149 | REQUIRE_VEC4_EQ(bgra, 0, 1, 0, 255); 150 | REQUIRE_FVEC4_EQ(frgba, 0, 1.0 / 255, 0, 1.0); 151 | REQUIRE_FVEC4_EQ(fbgra, 0, 1.0 / 255, 0, 1.0); 152 | 153 | // Last row 154 | swPx_RGB888_GetColor_RGBA8888_UV(4, 4, px_RGB888, 1.1f, 0.9f, rgba); 155 | swPx_RGB888_GetColor_BGRA8888_UV(4, 4, px_RGB888, 1.1f, 0.9f, bgra); 156 | swPx_RGB888_GetColor_FRGBA_UV(4, 4, px_RGB888, 1.1f, 0.9f, frgba); 157 | swPx_RGB888_GetColor_FBGRA_UV(4, 4, px_RGB888, 1.1f, 0.9f, fbgra); 158 | REQUIRE_VEC4_EQ(rgba, 10, 111, 12, 255); 159 | REQUIRE_VEC4_EQ(bgra, 12, 111, 10, 255); 160 | REQUIRE_FVEC4_EQ(frgba, 10.0 / 255, 111.0 / 255, 12.0 / 255, 1.0); 161 | REQUIRE_FVEC4_EQ(fbgra, 12.0 / 255, 111.0 / 255, 10.0 / 255, 1.0); 162 | 163 | swPx_RGB888_GetColor_RGBA8888_UV(4, 4, px_RGB888, 1.9f, 1.9f, rgba); 164 | swPx_RGB888_GetColor_BGRA8888_UV(4, 4, px_RGB888, 1.9f, 1.9f, bgra); 165 | swPx_RGB888_GetColor_FRGBA_UV(4, 4, px_RGB888, 1.9f, 1.9f, frgba); 166 | 
swPx_RGB888_GetColor_FBGRA_UV(4, 4, px_RGB888, 1.9f, 1.9f, fbgra); 167 | REQUIRE_VEC4_EQ(rgba, 0, 0, 1, 255); 168 | REQUIRE_VEC4_EQ(bgra, 1, 0, 0, 255); 169 | REQUIRE_FVEC4_EQ(frgba, 0, 0, 1.0 / 255, 1.0); 170 | REQUIRE_FVEC4_EQ(fbgra, 1.0 / 255, 0, 0, 1.0); 171 | } 172 | 173 | { 174 | // Get pixels RGBA8888 175 | SWubyte rgba[4], bgra[4]; 176 | SWfloat frgba[4], fbgra[4]; 177 | 178 | // First row 179 | swPx_RGBA8888_GetColor_RGBA8888(4, 4, px_RGBA8888, 0, 0, rgba); 180 | swPx_RGBA8888_GetColor_BGRA8888(4, 4, px_RGBA8888, 0, 0, bgra); 181 | swPx_RGBA8888_GetColor_FRGBA(4, 4, px_RGBA8888, 0, 0, frgba); 182 | swPx_RGBA8888_GetColor_FBGRA(4, 4, px_RGBA8888, 0, 0, fbgra); 183 | REQUIRE_VEC4_EQ(rgba, 0, 0, 0, 1); 184 | REQUIRE_VEC4_EQ(bgra, 0, 0, 0, 1); 185 | REQUIRE_FVEC4_EQ(frgba, 0, 0, 0, 1.0 / 255); 186 | REQUIRE_FVEC4_EQ(fbgra, 0, 0, 0, 1.0 / 255); 187 | 188 | swPx_RGBA8888_GetColor_RGBA8888(4, 4, px_RGBA8888, 3, 0, rgba); 189 | swPx_RGBA8888_GetColor_BGRA8888(4, 4, px_RGBA8888, 3, 0, bgra); 190 | swPx_RGBA8888_GetColor_FRGBA(4, 4, px_RGBA8888, 3, 0, frgba); 191 | swPx_RGBA8888_GetColor_FBGRA(4, 4, px_RGBA8888, 3, 0, fbgra); 192 | REQUIRE_VEC4_EQ(rgba, 0, 0, 255, 4); 193 | REQUIRE_VEC4_EQ(bgra, 255, 0, 0, 4); 194 | REQUIRE_FVEC4_EQ(frgba, 0, 0, 1.0, 4.0 / 255); 195 | REQUIRE_FVEC4_EQ(fbgra, 1.0, 0, 0, 4.0 / 255); 196 | 197 | // Last row 198 | swPx_RGBA8888_GetColor_RGBA8888(4, 4, px_RGBA8888, 0, 3, rgba); 199 | swPx_RGBA8888_GetColor_BGRA8888(4, 4, px_RGBA8888, 0, 3, bgra); 200 | swPx_RGBA8888_GetColor_FRGBA(4, 4, px_RGBA8888, 0, 3, frgba); 201 | swPx_RGBA8888_GetColor_FBGRA(4, 4, px_RGBA8888, 0, 3, fbgra); 202 | REQUIRE_VEC4_EQ(rgba, 10, 111, 12, 9); 203 | REQUIRE_VEC4_EQ(bgra, 12, 111, 10, 9); 204 | REQUIRE_FVEC4_EQ(frgba, 10.0 / 255, 111.0 / 255, 12.0 / 255, 9.0 / 255); 205 | REQUIRE_FVEC4_EQ(fbgra, 12.0 / 255, 111.0 / 255, 10.0 / 255, 9.0 / 255); 206 | 207 | swPx_RGBA8888_GetColor_RGBA8888(4, 4, px_RGBA8888, 3, 3, rgba); 208 | swPx_RGBA8888_GetColor_BGRA8888(4, 
4, px_RGBA8888, 3, 3, bgra); 209 | swPx_RGBA8888_GetColor_FRGBA(4, 4, px_RGBA8888, 3, 3, frgba); 210 | swPx_RGBA8888_GetColor_FBGRA(4, 4, px_RGBA8888, 3, 3, fbgra); 211 | REQUIRE_VEC4_EQ(rgba, 0, 0, 1, 12); 212 | REQUIRE_VEC4_EQ(bgra, 1, 0, 0, 12); 213 | REQUIRE_FVEC4_EQ(frgba, 0, 0, 1.0 / 255, 12.0 / 255); 214 | REQUIRE_FVEC4_EQ(fbgra, 1.0 / 255, 0, 0, 12.0 / 255); 215 | } 216 | 217 | { 218 | // Get pixels RGBA8888 by uv 219 | SWubyte rgba[4], bgra[4]; 220 | SWfloat frgba[4], fbgra[4]; 221 | 222 | // First row 223 | swPx_RGBA8888_GetColor_RGBA8888_UV(4, 4, px_RGBA8888, 0.1f, 0.1f, rgba); 224 | swPx_RGBA8888_GetColor_BGRA8888_UV(4, 4, px_RGBA8888, 0.1f, 0.1f, bgra); 225 | swPx_RGBA8888_GetColor_FRGBA_UV(4, 4, px_RGBA8888, 0.1f, 0.1f, frgba); 226 | swPx_RGBA8888_GetColor_FBGRA_UV(4, 4, px_RGBA8888, 0.1f, 0.1f, fbgra); 227 | REQUIRE_VEC4_EQ(rgba, 0, 0, 0, 1); 228 | REQUIRE_VEC4_EQ(bgra, 0, 0, 0, 1); 229 | REQUIRE_FVEC4_EQ(frgba, 0, 0, 0, 1.0 / 255); 230 | REQUIRE_FVEC4_EQ(fbgra, 0, 0, 0, 1.0 / 255); 231 | 232 | swPx_RGBA8888_GetColor_RGBA8888_UV(4, 4, px_RGBA8888, 0.7f, 0.1f, rgba); 233 | swPx_RGBA8888_GetColor_BGRA8888_UV(4, 4, px_RGBA8888, 0.7f, 0.1f, bgra); 234 | swPx_RGBA8888_GetColor_FRGBA_UV(4, 4, px_RGBA8888, 0.7f, 0.1f, frgba); 235 | swPx_RGBA8888_GetColor_FBGRA_UV(4, 4, px_RGBA8888, 0.7f, 0.1f, fbgra); 236 | REQUIRE_VEC4_EQ(rgba, 0, 1, 0, 3); 237 | REQUIRE_VEC4_EQ(bgra, 0, 1, 0, 3); 238 | REQUIRE_FVEC4_EQ(frgba, 0, 1.0 / 255, 0, 3.0 / 255); 239 | REQUIRE_FVEC4_EQ(fbgra, 0, 1.0 / 255, 0, 3.0 / 255); 240 | 241 | // Last row 242 | swPx_RGBA8888_GetColor_RGBA8888_UV(4, 4, px_RGBA8888, -0.9f, 0.9f, rgba); // wrong but ok 243 | swPx_RGBA8888_GetColor_BGRA8888_UV(4, 4, px_RGBA8888, -0.9f, 0.9f, bgra); 244 | swPx_RGBA8888_GetColor_FRGBA_UV(4, 4, px_RGBA8888, -0.9f, 0.9f, frgba); 245 | swPx_RGBA8888_GetColor_FBGRA_UV(4, 4, px_RGBA8888, -0.9f, 0.9f, fbgra); 246 | REQUIRE_VEC4_EQ(rgba, 190, 111, 20, 10); 247 | REQUIRE_VEC4_EQ(bgra, 20, 111, 190, 10); 248 | 
REQUIRE_FVEC4_EQ(frgba, 190.0 / 255, 111.0 / 255, 20.0 / 255, 10.0 / 255); 249 | REQUIRE_FVEC4_EQ(fbgra, 20.0 / 255, 111.0 / 255, 190.0 / 255, 10.0 / 255); 250 | 251 | swPx_RGBA8888_GetColor_RGBA8888_UV(4, 4, px_RGBA8888, 1.9f, 1.9f, rgba); 252 | swPx_RGBA8888_GetColor_BGRA8888_UV(4, 4, px_RGBA8888, 1.9f, 1.9f, bgra); 253 | swPx_RGBA8888_GetColor_FRGBA_UV(4, 4, px_RGBA8888, 1.9f, 1.9f, frgba); 254 | swPx_RGBA8888_GetColor_FBGRA_UV(4, 4, px_RGBA8888, 1.9f, 1.9f, fbgra); 255 | REQUIRE_VEC4_EQ(rgba, 0, 0, 1, 12); 256 | REQUIRE_VEC4_EQ(bgra, 1, 0, 0, 12); 257 | REQUIRE_FVEC4_EQ(frgba, 0, 0, 1.0 / 255, 12.0 / 255); 258 | REQUIRE_FVEC4_EQ(fbgra, 1.0 / 255, 0, 0, 12.0 / 255); 259 | } 260 | 261 | { 262 | // Set pixels RGBA8888 263 | SWubyte bgra_buf[4 * 4 * 4]; 264 | for (int x = 0; x < 4; x++) { 265 | for (int y = 0; y < 4; y++) { 266 | swPx_BGRA8888_SetColor_RGBA8888(4, 4, bgra_buf, x, y, &px_RGBA8888[4 * (4 * y + x)]); 267 | } 268 | } 269 | 270 | REQUIRE_VEC4_EQ(&bgra_buf[4 * (4 * 0 + 0)], 0, 0, 0, 1); 271 | REQUIRE_VEC4_EQ(&bgra_buf[4 * (4 * 0 + 3)], 255, 0, 0, 4); 272 | REQUIRE_VEC4_EQ(&bgra_buf[4 * (4 * 3 + 0)], 12, 111, 10, 9); 273 | REQUIRE_VEC4_EQ(&bgra_buf[4 * (4 * 3 + 3)], 1, 0, 0, 12); 274 | 275 | for (int x = 0; x < 4; x++) { 276 | for (int y = 0; y < 4; y++) { 277 | swPx_BGRA8888_SetColor_FRGBA(4, 4, bgra_buf, x, y, &px_FRGBA[4 * (4 * y + x)]); 278 | } 279 | } 280 | 281 | REQUIRE_VEC4_EQ(&bgra_buf[4 * (4 * 0 + 0)], 0, 0, 0, 255); 282 | REQUIRE_VEC4_EQ(&bgra_buf[4 * (4 * 0 + 3)], 0, 0, 255, 255); 283 | REQUIRE_VEC4_EQ(&bgra_buf[4 * (4 * 3 + 0)], 255, 255, 255, 0); 284 | REQUIRE_VEC4_EQ(&bgra_buf[4 * (4 * 3 + 3)], 255, 0, 255, 0); 285 | } 286 | } 287 | 288 | #undef REQUIRE_VEC4_EQ 289 | #undef REQUIRE_FVEC4_EQ 290 | -------------------------------------------------------------------------------- /tests/test_program.c: -------------------------------------------------------------------------------- 1 | #include "test_common.h" 2 | 3 | #include "../SWprogram.h" 4 
| 5 | void test_program() { 6 | 7 | { 8 | // Program swProgRegUniform 9 | SWprogram p_; 10 | SWubyte uniform_buf[128]; 11 | swProgInit(&p_, uniform_buf, NULL, NULL, 0); 12 | 13 | swProgRegUniform(&p_, 0, SW_VEC2); 14 | swProgRegUniform(&p_, 1, SW_FLOAT); 15 | swProgRegUniform(&p_, 2, SW_VEC3); 16 | require((uintptr_t)p_.uniforms[0].data == (uintptr_t)p_.uniform_buf); 17 | require((uintptr_t)p_.uniforms[1].data == (uintptr_t)p_.uniform_buf + 2 * sizeof(SWfloat)); 18 | require((uintptr_t)p_.uniforms[2].data == (uintptr_t)p_.uniform_buf + 3 * sizeof(SWfloat)); 19 | require(p_.unifrom_buf_size == 6 * sizeof(SWfloat)); 20 | 21 | swProgDestroy(&p_); 22 | } 23 | 24 | { 25 | // Program swProgSetProgramUniform 26 | SWprogram p_; 27 | SWubyte uniform_buf[128]; 28 | swProgInit(&p_, uniform_buf, NULL, NULL, 0); 29 | 30 | swProgRegUniform(&p_, 0, SW_VEC2); 31 | swProgRegUniform(&p_, 1, SW_FLOAT); 32 | swProgRegUniform(&p_, 2, SW_VEC3); 33 | SWfloat uv_scale[] = { 2, 4, 1 }; 34 | swProgSetProgramUniform(&p_, 2, SW_VEC3, uv_scale); 35 | SWfloat *f = (SWfloat*)p_.uniforms[2].data; 36 | require(f[0] == 2); 37 | require(f[1] == 4); 38 | require(f[2] == 1); 39 | 40 | swProgDestroy(&p_); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /tests/test_texture.c: -------------------------------------------------------------------------------- 1 | #include "test_common.h" 2 | 3 | #include 4 | #include 5 | 6 | #include "../SWtexture.h" 7 | 8 | static SWubyte test_texture_tex[] = { 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 255, 9 | 1, 1, 0, 11, 13, 14, 190, 111, 20, 20, 20, 20, 10 | 10, 111, 12, 190, 111, 20, 0, 1, 0, 0, 0, 1, 11 | 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0 12 | }; 13 | 14 | void test_texture() { 15 | 16 | { 17 | // Texture init move 18 | SWtexture t; 19 | swTexInitMove(&t, SW_RGB, SW_UNSIGNED_BYTE, 4, 4, test_texture_tex, NULL); 20 | require(t.pixels == test_texture_tex); 21 | swTexDestroy(&t); 22 | } 23 | 24 | { 25 | // Texture init 
malloced 26 | SWtexture t; 27 | void *tex_data = malloc(sizeof(test_texture_tex)); 28 | memcpy(tex_data, test_texture_tex, sizeof(test_texture_tex)); 29 | swTexInitMove_malloced(&t, SW_RGB, SW_UNSIGNED_BYTE, 4, 4, tex_data); 30 | require(t.pixels == tex_data); 31 | swTexDestroy(&t); 32 | } 33 | 34 | { 35 | // Texture swTexGetColorFloat_RGBA 36 | SWtexture t_; 37 | swTexInit(&t_, SW_RGB, SW_UNSIGNED_BYTE, 4, 4, test_texture_tex); 38 | require(t_.pixels != NULL); 39 | require(((SWubyte*)t_.pixels)[3] == 1); 40 | 41 | SWfloat rgba[4]; 42 | swTexGetColorFloat_RGBA(&t_, 0.9f, 0.0f, rgba); 43 | require(rgba[0] == 0); 44 | require(rgba[1] == 0); 45 | require(rgba[2] == 1); 46 | require(rgba[3] == 1); 47 | 48 | swTexDestroy(&t_); 49 | require(t_.pixels == NULL); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /tests/test_zbuffer.c: -------------------------------------------------------------------------------- 1 | #include "test_common.h" 2 | 3 | #include 4 | 5 | #include "../SWzbuffer.h" 6 | 7 | static const int RES_W = 50; 8 | static const int RES_H = 16; 9 | 10 | void test_zbuffer() { 11 | const SWfloat eps = (SWfloat)0.0001; 12 | 13 | #define TEST_BEGIN \ 14 | SWzbuffer zb_; \ 15 | swZbufInit(&zb_, RES_W, RES_H, 1.0f); \ 16 | require(zb_.depth != NULL); \ 17 | require(zb_.depth[0] == 1); 18 | 19 | #define TEST_END \ 20 | swZbufDestroy(&zb_); \ 21 | require(zb_.depth == NULL); 22 | 23 | { 24 | // Zbuffer swZbufClearDepth 25 | TEST_BEGIN 26 | 27 | swZbufClearDepth(&zb_, 0.5f); 28 | for (int i = 0; i < zb_.w * zb_.h; i++) { 29 | require(zb_.depth[i] == 0.5f); 30 | } 31 | for (int i = 0; i < zb_.tile_w * zb_.tile_h; i++) { 32 | require(zb_.tiles[i].min == 0.5f); 33 | require(zb_.tiles[i].max == 0.5f); 34 | } 35 | 36 | TEST_END 37 | } 38 | 39 | { 40 | // Zbuffer swZbufTestDepth 41 | TEST_BEGIN 42 | 43 | for (int j = 0; j < RES_H; j++) { 44 | for (int i = 0; i < RES_W; i++) { 45 | swZbufSetDepth(&zb_, i, j, 100 * 0.01f * i 
+ j * 0.01f); 46 | } 47 | } 48 | 49 | 50 | for (int j = 0; j < RES_H; j++) { 51 | for (int i = 0; i < RES_W; i++) { 52 | SWfloat z = (100 * 0.01f * i + j * 0.01f); 53 | require(fabs(swZbufGetDepth(&zb_, i, j) - z) < eps); 54 | require(!swZbufTestDepth(&zb_, i, j, z + 0.01f)); 55 | require(swZbufTestDepth(&zb_, i, j, z - 0.01f)); 56 | } 57 | } 58 | 59 | TEST_END 60 | } 61 | 62 | { 63 | // Zbuffer swZbufTestTile 64 | TEST_BEGIN 65 | 66 | for (int j = 0; j < RES_H; j += SW_TILE_SIZE) { 67 | for (int i = 0; i < RES_W; i += SW_TILE_SIZE) { 68 | SWfloat min = i * 0.4f + j * 0.6f; 69 | SWfloat max = min + 0.15f; 70 | 71 | swZbufSetTileRange(&zb_, i, j, min, max); 72 | } 73 | } 74 | 75 | for (int j = 0; j < RES_H; j += SW_TILE_SIZE) { 76 | for (int i = 0; i < RES_W; i += SW_TILE_SIZE) { 77 | SWfloat min = i * 0.4f + j * 0.6f; 78 | SWfloat max = min + 0.15f; 79 | 80 | require(swZbufTestTileRange(&zb_, i, j, max + 0.1f, max + 0.2f) == SW_OCCLUDED); 81 | require(swZbufTestTileRange(&zb_, i, j, min - 0.5f, min - 0.2f) == SW_NONOCCLUDED); 82 | require(swZbufTestTileRange(&zb_, i, j, min - 0.2f, min + 0.2f) == SW_PARTIAL); 83 | require(swZbufTestTileRange(&zb_, i, j, max - 0.2f, max + 0.2f) == SW_PARTIAL); 84 | } 85 | } 86 | 87 | TEST_END 88 | } 89 | } 90 | 91 | #undef TEST_BEGIN 92 | #undef TEST_END 93 | --------------------------------------------------------------------------------