├── .gitattributes ├── gpu-scanline └── src │ ├── rvg_loader.cpp │ ├── mochimazui │ ├── stdext.cpp │ ├── stdext.h │ ├── vector_type.h │ ├── color.h │ ├── camera_3d.cpp │ ├── file.cpp │ ├── file.h │ ├── option.h │ ├── bitmap.h │ ├── camera_2d.h │ ├── camera_controller_2d.h │ ├── camera_controller_3d.h │ ├── stdio_ext.h │ ├── bitmap.cpp │ ├── camera_3d.h │ ├── config.cpp │ ├── camera_controller_3d.cpp │ └── cuda_array.h │ ├── text_util.h │ ├── text_util.cpp │ ├── rvg_loader.h │ ├── svg_load.cpp │ ├── rasterizer │ ├── shared │ │ ├── ras_cut.cu │ │ ├── ras_base.cu │ │ ├── ras_factory.h │ │ ├── ras_pipeline_mode.h │ │ ├── ras_scan.h │ │ ├── ras_define.h │ │ └── ras_qm_mask.cu │ ├── kernel │ │ └── animation.h │ └── R_cut_A_mask_comb_scanline │ │ └── ras_cut_mask_comb_scanline.h │ ├── cuda │ ├── cuda_sort.h │ ├── cuda_cached_allocator.cpp │ ├── cuda_sort.cu │ └── cuda_cached_allocator.h │ ├── svg_loader.cpp │ ├── svg_loader.h │ ├── bezier_curve_type.h │ ├── tiger │ └── tiger.h │ ├── thrust_impl.h │ ├── rvg.h │ ├── bounding_box.h │ ├── svg.h │ ├── modern_gpu │ └── include │ │ ├── kernels │ │ ├── cubradixsort.cuh │ │ ├── loadbalance.cuh │ │ ├── localitysort.cuh │ │ ├── reduce.cuh │ │ └── bulkremove.cuh │ │ ├── moderngpu.cuh │ │ ├── mgpuenums.h │ │ ├── util │ │ ├── mgpualloc.h │ │ ├── util.h │ │ ├── format.h │ │ └── static.h │ │ ├── kernels_ext │ │ └── search_ext.cuh │ │ ├── device │ │ ├── launchbox.cuh │ │ ├── deviceutil.cuh │ │ ├── ctasegscan.cuh │ │ └── ctaloadbalance.cuh │ │ ├── sparsematrix.h │ │ └── mmio.h │ ├── gradient.h │ ├── thrust_impl_scan.cu │ ├── timer.h │ ├── rapidxml_utils.hpp │ ├── vg_config.cpp │ ├── vg_config.h │ └── rapidxml_iterators.hpp ├── working_directory ├── shader │ ├── shared │ │ ├── curve.frag.glsl │ │ ├── output_scale.frag.glsl │ │ ├── fps.vert.glsl │ │ ├── curve.vert.glsl │ │ ├── integrate_samples.vert.glsl │ │ ├── output_scale.vert.glsl │ │ ├── integrate_samples.frag.glsl │ │ └── fps.frag.glsl │ └── R_cut_A_stencil │ │ ├── output_8.frag.glsl │ 
│ ├── ms_output_32.frag.glsl │ │ ├── ms_output_8.frag.glsl │ │ ├── output_32.frag.glsl │ │ ├── ms_output_32.frag.glsl.before_368.22 │ │ ├── ms_output_8.vert.glsl │ │ ├── output_8.vert.glsl │ │ ├── output_32.vert.glsl │ │ └── ms_output_32.vert.glsl ├── .gitignore ├── ui │ ├── minimal_ui.json │ └── ui.json └── vg_default.cfg ├── gpu-scanline-path-rendering-core.sln ├── LICENSE ├── README.md └── .gitignore /.gitattributes: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gpu-scanline/src/rvg_loader.cpp: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/stdext.cpp: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/stdext.h: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gpu-scanline/src/text_util.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | -------------------------------------------------------------------------------- /gpu-scanline/src/text_util.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "text_util.h" 3 | -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/vector_type.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | namespace Mochimazui { 5 | } 6 | -------------------------------------------------------------------------------- /gpu-scanline/src/rvg_loader.h: 
-------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | namespace Mochimazui { 5 | 6 | void load_rvg(); 7 | 8 | } 9 | -------------------------------------------------------------------------------- /gpu-scanline/src/svg_load.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mochimazui/gpu-scanline-path-rendering/HEAD/gpu-scanline/src/svg_load.cpp -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/color.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Mochimazui/gpu-scanline-path-rendering/HEAD/gpu-scanline/src/mochimazui/color.h -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/camera_3d.cpp: -------------------------------------------------------------------------------- 1 | #include "camera_3d.h" 2 | 3 | namespace Mochimazui { 4 | 5 | glm::mat4x4 Camera3D::matrix() { 6 | return _matrix; 7 | } 8 | 9 | } -------------------------------------------------------------------------------- /gpu-scanline/src/rasterizer/shared/ras_cut.cu: -------------------------------------------------------------------------------- 1 | 2 | namespace Mochimazui { 3 | 4 | namespace Rasterizer { 5 | 6 | } // end of namespace Rasterizers 7 | 8 | } // end of namespace Mochimazui 9 | -------------------------------------------------------------------------------- /gpu-scanline/src/cuda/cuda_sort.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include 5 | 6 | namespace Mochimazui { 7 | void cuda_seg_sort_int_by_int(int* key,int* data,int n,int* segs,int nsegs); 8 | } 9 | -------------------------------------------------------------------------------- /working_directory/shader/shared/curve.frag.glsl: 
-------------------------------------------------------------------------------- 1 | 2 | #version 450 3 | 4 | flat in vec4 fragColor; 5 | 6 | layout(location = 0) out vec4 color; 7 | 8 | void main() { 9 | color = fragColor; 10 | } 11 | -------------------------------------------------------------------------------- /working_directory/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | font/ 3 | output/ 4 | test/ 5 | result/ 6 | upload/ 7 | debug_dump/ 8 | 9 | *.cmd 10 | *.txt 11 | *.bmp 12 | *.png 13 | *.pdf 14 | *.lnk 15 | *.exe 16 | *.ttf 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /gpu-scanline/src/cuda/cuda_cached_allocator.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "cuda_cached_allocator.h" 3 | 4 | namespace Mochimazui { 5 | 6 | cuda_cached_allocator g_thrustCachedAllocator; 7 | cuda_cached_allocator &g_alloc = g_thrustCachedAllocator; 8 | 9 | } -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/file.cpp: -------------------------------------------------------------------------------- 1 | 2 | #define _CRT_SECURE_NO_WARNINGS 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | namespace Mochimazui { 12 | } -------------------------------------------------------------------------------- /gpu-scanline/src/svg_loader.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "svg_loader.h" 3 | 4 | #include "vg_container.h" 5 | 6 | namespace Mochimazui { 7 | 8 | std::shared_ptr 9 | load_svg(const std::string &file_name, bool stroke_to_fill) { 10 | return nullptr; 11 | } 12 | 13 | } 14 | -------------------------------------------------------------------------------- /gpu-scanline/src/svg_loader.h: -------------------------------------------------------------------------------- 1 | 
2 | #pragma once 3 | 4 | #include 5 | #include 6 | 7 | namespace Mochimazui { 8 | 9 | class VGContainer; 10 | 11 | std::shared_ptr 12 | load_svg(const std::string &file_name, bool stroke_to_fill); 13 | 14 | } 15 | -------------------------------------------------------------------------------- /working_directory/ui/minimal_ui.json: -------------------------------------------------------------------------------- 1 | { 2 | "window": { 3 | "id": "top", "type": "window", 4 | "layout": "horizontal", 5 | "width": 1200, "height": 1024, 6 | "title": "GPU Scanline VG", 7 | "subwindows": [ 8 | { "id": "display","type": "subwindow" } 9 | ] 10 | }, 11 | 12 | "subwindows": { 13 | } 14 | } 15 | 16 | -------------------------------------------------------------------------------- /working_directory/shader/shared/output_scale.frag.glsl: -------------------------------------------------------------------------------- 1 | 2 | #version 450 3 | 4 | layout(binding = 0) uniform sampler2D scale_tex; 5 | 6 | in vec2 texcoord; 7 | 8 | layout(location = 0) out vec4 color; 9 | 10 | void main(){ 11 | if (texcoord.x < 0 || texcoord.y < 0) { 12 | color = vec4(0, 0, 0, 1); 13 | } 14 | color = texture2D(scale_tex, texcoord); 15 | }; 16 | -------------------------------------------------------------------------------- /gpu-scanline/src/cuda/cuda_sort.cu: -------------------------------------------------------------------------------- 1 | 2 | #include "cuda_sort.h" 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include "../modern_gpu/include/kernels_ext/segmentedsort_ext.cuh" 9 | 10 | namespace Mochimazui { 11 | 12 | void cuda_seg_sort_int_by_int(int* key,int* data,int n,int* segs,int nsegs){ 13 | mgpu_ext::SegSortPairsFromIndices(key, data, n, segs,nsegs); 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /gpu-scanline/src/bezier_curve_type.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 
4 | namespace Mochimazui { 5 | 6 | enum BezierCurveType { 7 | BCT_Linear = 0x02, 8 | BCT_Quadratic = 0x03, 9 | BCT_Cubic = 0x04, 10 | BCT_Rational = 0x13, 11 | }; 12 | 13 | enum VGCurveType { 14 | CT_Linear = BCT_Linear, 15 | CT_Quadratic = BCT_Quadratic, 16 | CT_Cubic = BCT_Cubic, 17 | CT_Rational = BCT_Rational, 18 | }; 19 | 20 | } // end of namespace Mochimazui 21 | -------------------------------------------------------------------------------- /working_directory/shader/shared/fps.vert.glsl: -------------------------------------------------------------------------------- 1 | 2 | #version 450 3 | 4 | #define SIZE 10 5 | 6 | uniform ivec2 vp_size; 7 | 8 | vec2 vs[4] = { 9 | vec2(0, vp_size.y - 60), 10 | vec2(300, vp_size.y - 60), 11 | vec2(300, vp_size.y), 12 | vec2(0, vp_size.y) 13 | }; 14 | 15 | void main() { 16 | vec2 v = vs[gl_VertexID]; 17 | gl_Position = vec4( 18 | v.x / vp_size.x * 2 - 1, 19 | v.y / vp_size.y * 2 - 1, 20 | 0, 1 ); 21 | } 22 | -------------------------------------------------------------------------------- /working_directory/shader/shared/curve.vert.glsl: -------------------------------------------------------------------------------- 1 | 2 | #version 450 3 | 4 | uniform ivec2 vp_size; 5 | 6 | layout(binding = 0) uniform samplerBuffer tb_vertex; 7 | layout(binding = 1) uniform samplerBuffer tb_color; 8 | 9 | flat out vec4 fragColor; 10 | 11 | void main() { 12 | vec4 draw = texelFetch(tb_vertex, gl_VertexID); 13 | vec2 p = vec2(draw.x / vp_size.x, draw.y / vp_size.y) * 2 - vec2(1.0, 1.0); 14 | fragColor = texelFetch(tb_color, gl_VertexID); 15 | gl_Position = vec4(p.x, p.y, 0, 1); 16 | } 17 | -------------------------------------------------------------------------------- /gpu-scanline/src/rasterizer/kernel/animation.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include 5 | #include "../shared/ras_base.h" 6 | 7 | namespace Mochimazui { 8 | 9 | void vg_animation( 10 | int 
last_frame_timestamp, 11 | int next_frame_timestamp, 12 | RasterizerBase::VGInputCurveDataPack &_last_frame_curve_in, 13 | RasterizerBase::VGInputCurveDataPack &_next_frame_curve_in, 14 | RasterizerBase::VGInputPathDataPack &_last_frame_path_in, 15 | RasterizerBase::VGInputPathDataPack &_next_frame_path_in 16 | ); 17 | 18 | } 19 | -------------------------------------------------------------------------------- /gpu-scanline/src/rasterizer/shared/ras_base.cu: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | //#include 10 | 11 | #include "ras_define.h" 12 | #include "ras_cut.h" 13 | 14 | //#define LAUNCH(kernel,N,NT,args) {kernel <<< divup(N,NT),NT >>>args;DEBUG_CUDA_DEVICE_SYNC_AND_CHECK_ERROR(#kernel);} 15 | //#define GET_ID() (blockDim.x * blockIdx.x + threadIdx.x) 16 | 17 | #define DEV static __device__ inline 18 | #define BOTH __device__ __host__ inline 19 | 20 | typedef long long i64; 21 | -------------------------------------------------------------------------------- /gpu-scanline/src/tiger/tiger.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _MOCHIMAZUI_TIGER_H_ 3 | #define _MOCHIMAZUI_TIGER_H_ 4 | 5 | #include 6 | 7 | namespace Mochimazui { 8 | 9 | namespace Tiger { 10 | 11 | struct TigerStyle { 12 | GLuint fill_color; 13 | GLuint stroke_color; 14 | GLfloat stroke_width; 15 | }; 16 | 17 | extern const char *tiger_path[240]; 18 | extern const TigerStyle tiger_style[240]; 19 | 20 | extern const unsigned int tiger_path_count; 21 | extern GLuint tiger_path_base; 22 | 23 | void initTiger(); 24 | void drawTiger(int filling, int stroking); 25 | 26 | } 27 | } 28 | 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /working_directory/shader/shared/integrate_samples.vert.glsl: 
-------------------------------------------------------------------------------- 1 | 2 | #version 450 3 | 4 | // -------------------------------- 5 | //layout(location = 0) in vec2 vertex; 6 | 7 | uniform ivec2 vp_size; 8 | 9 | // -------------------------------- 10 | void main() { 11 | 12 | int width = vp_size.x; 13 | int height = vp_size.y; 14 | 15 | vec2 v; 16 | 17 | if (gl_VertexID == 0) { 18 | v = vec2(0.f, 0.f); 19 | } 20 | else if (gl_VertexID == 1) { 21 | v = vec2(0.f, height); 22 | } 23 | else if (gl_VertexID == 2) { 24 | v = vec2(width, height); 25 | } 26 | else { 27 | v = vec2(width, 0.f); 28 | } 29 | 30 | v.x = v.x / width * 2.0 - 1.0; 31 | v.y = v.y / height * 2.0 - 1.0; 32 | 33 | gl_Position = vec4(v.xy, 0.5, 1); 34 | } 35 | -------------------------------------------------------------------------------- /gpu-scanline/src/thrust_impl.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _MOCHIMAZUI_THRUST_IMPL_H_ 3 | #define _MOCHIMAZUI_THRUST_IMPL_H_ 4 | 5 | #include 6 | 7 | namespace Mochimazui { 8 | 9 | void thrust_exclusive_scan(int8_t *ibegin, uint32_t number, int8_t *obegin); 10 | void thrust_exclusive_scan(uint8_t *ibegin, uint32_t number, uint8_t *obegin); 11 | 12 | void thrust_exclusive_scan(int32_t *ibegin, uint32_t number, int32_t *obegin); 13 | void thrust_exclusive_scan(uint32_t *ibegin, uint32_t number, uint32_t *obegin); 14 | 15 | void thrust_exclusive_scan(float *ibegin, uint32_t number, float *obegin); 16 | 17 | void thrust_inclusive_scan(int32_t *ibegin, uint32_t number, int32_t *obegin); 18 | void thrust_inclusive_scan(uint32_t *ibegin, uint32_t number, uint32_t *obegin); 19 | 20 | } 21 | 22 | #endif -------------------------------------------------------------------------------- /working_directory/shader/shared/output_scale.vert.glsl: -------------------------------------------------------------------------------- 1 | 2 | #version 450 3 | 4 | uniform ivec2 vp_size; 5 | 6 | uniform ivec2 
vp_translate; 7 | uniform float vp_scale; 8 | 9 | uniform mat4x4 o_tmat; 10 | 11 | out vec2 texcoord; 12 | 13 | vec2 v[4] = { 14 | vec2(0, 0), 15 | vec2(0, 1), 16 | vec2(1, 1), 17 | vec2(1, 0) 18 | }; 19 | 20 | void calc_texcoord() { 21 | vec4 ov = vec4(v[gl_VertexID], 0, 1); 22 | 23 | ov.x *= vp_size.x; 24 | ov.y *= vp_size.y; 25 | 26 | ov = inverse(o_tmat) * ov; 27 | ov /= ov.w; 28 | 29 | ov.x /= vp_size.x; 30 | ov.y /= vp_size.y; 31 | 32 | texcoord = ov.xy; 33 | } 34 | 35 | void calc_position() { 36 | vec2 ov = v[gl_VertexID]; 37 | //ov.y = 1.0 - ov.y; 38 | gl_Position = vec4(ov * 2 - vec2(1, 1), 0.0, 1.0); 39 | } 40 | 41 | void main() { 42 | calc_texcoord(); 43 | calc_position(); 44 | } 45 | -------------------------------------------------------------------------------- /gpu-scanline/src/rvg.h: -------------------------------------------------------------------------------- 1 | #ifndef _MOCHIMAZUI_RVG_H_ 2 | #define _MOCHIMAZUI_RVG_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include "vg_container.h" 8 | 9 | namespace Mochimazui { 10 | 11 | using std::string; 12 | 13 | class RVG { 14 | 15 | public: 16 | void setA128(bool f) { _a128 = f; } 17 | void load(const string &fileName); 18 | 19 | int32_t width(){ return _viewport[1].x; } 20 | int32_t height() { return _viewport[1].y; } 21 | 22 | const std::shared_ptr &vgContainer() const { return _spVGContainer; } 23 | 24 | void saveSelectedPath(const std::vector &pids); 25 | 26 | private: 27 | 28 | bool _a128 = false; 29 | 30 | glm::ivec2 _viewport[2]; 31 | glm::ivec2 _window[2]; 32 | 33 | std::shared_ptr _spVGContainer; 34 | 35 | std::string _header; 36 | std::vector _lines; 37 | }; 38 | } 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/file.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _MOCHIMAZUI_FILE_H_ 3 | #define _MOCHIMAZUI_FILE_H_ 4 | 5 | #include 6 | #include 7 | #include 8 | 
9 | #include 10 | 11 | namespace Mochimazui { 12 | 13 | template 14 | void readAll(const charT *fn, std::basic_string &odata) { 15 | 16 | FILE *fin; 17 | fin = fopen(fn, "rb"); 18 | if (!fin) { 19 | auto msg = "Error in readAll: can not open file \"" + std::string(fn) + "\""; 20 | stdext::error_printf("%s", fn); 21 | throw std::runtime_error(msg); 22 | } 23 | 24 | fseek(fin, 0, SEEK_END); 25 | long size = ftell(fin); 26 | 27 | charT *data = new charT[size + 1]; 28 | if (!data) { printf("Error in readAll: new char returnd 0\n"); return; } 29 | 30 | fseek(fin, 0, SEEK_SET); 31 | size_t size_read = fread(data, 1, size, fin); 32 | fclose(fin); 33 | 34 | data[size] = '\0'; 35 | odata = data; 36 | } 37 | 38 | } 39 | 40 | #endif -------------------------------------------------------------------------------- /gpu-scanline-path-rendering-core.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.24720.0 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gpu-scanline", "gpu-scanline\gpu-scanline.vcxproj", "{FD594DB2-BFC4-4F3E-BDF4-0C80C702BBF9}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Release|x64 = Release|x64 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {FD594DB2-BFC4-4F3E-BDF4-0C80C702BBF9}.Debug|x64.ActiveCfg = Debug|x64 15 | {FD594DB2-BFC4-4F3E-BDF4-0C80C702BBF9}.Debug|x64.Build.0 = Debug|x64 16 | {FD594DB2-BFC4-4F3E-BDF4-0C80C702BBF9}.Release|x64.ActiveCfg = Release|x64 17 | {FD594DB2-BFC4-4F3E-BDF4-0C80C702BBF9}.Release|x64.Build.0 = Release|x64 18 | EndGlobalSection 19 | GlobalSection(SolutionProperties) = preSolution 20 | HideSolutionNode = FALSE 21 | EndGlobalSection 22 | EndGlobal 23 | 
-------------------------------------------------------------------------------- /gpu-scanline/src/rasterizer/shared/ras_factory.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "ras_base.h" 5 | 6 | #include "rasterizer/R_cut_A_mask_comb_scanline/ras_cut_mask_comb_scanline.h" 7 | 8 | #ifdef ENABLE_COMPARISON 9 | #include "rasterizer/R_cut_A_none/ras_cut_no_aa.h" 10 | #include "rasterizer/R_cut_A_mask_sample_scanline/ras_cut_mask_sample_scanline.h" 11 | #include "rasterizer/R_cut_A_mask_pixel_scanline/ras_cut_mask_pixel_scanline.h" 12 | #include "rasterizer/c_cs_cuda_cell_list/ras_c_cs_cuda_cell_list.h" 13 | #include "rasterizer/c_cs_gl_cell_list/ras_c_cs_gl_cell_list.h" 14 | #endif 15 | 16 | namespace Mochimazui { 17 | 18 | inline std::shared_ptr createRasterizer(RasterizerPipelineMode rpm) { 19 | 20 | std::shared_ptr p_ras; 21 | 22 | if (rpm == PM_Cut_Mask_Comb_Scanline) { 23 | p_ras.reset(new Rasterizer_R_Cut_A_Mask_Comb_Scanline::VGRasterizer); 24 | } 25 | #ifdef ENABLE_COMPARISON 26 | #endif 27 | else { 28 | throw std::runtime_error("unsupported pipeline mode"); 29 | } 30 | 31 | return p_ras; 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016-2017 Zhejiang University 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 
all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /gpu-scanline/src/bounding_box.h: -------------------------------------------------------------------------------- 1 | #ifndef _MOCHIMAZUI_BOUNDING_BOX_H_ 2 | #define _MOCHIMAZUI_BOUNDING_BOX_H_ 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | namespace Mochimazui { 10 | 11 | struct BoundingBoxFloat { 12 | 13 | public: 14 | __host__ __device__ BoundingBoxFloat() { 15 | v[0] = make_float2(1e32, 1e32); 16 | v[1] = -v[0]; 17 | } 18 | 19 | __host__ __device__ void update(const float2 &a) { 20 | v[0].x = min(v[0].x, a.x); 21 | v[0].y = min(v[0].y, a.y); 22 | v[1].x = max(v[1].x, a.x); 23 | v[1].y = max(v[1].y, a.y); 24 | } 25 | 26 | public: 27 | float2 v[2]; 28 | }; 29 | 30 | struct BoundingBoxInt { 31 | 32 | public: 33 | __host__ __device__ BoundingBoxInt() { 34 | v[0] = make_int2(0x7FFFFFFF, 0x7FFFFFFF); 35 | v[1] = -v[0]; 36 | } 37 | 38 | __host__ __device__ void update(const int2 &a) { 39 | v[0].x = min(v[0].x, a.x); 40 | v[0].y = min(v[0].y, a.y); 41 | v[1].x = max(v[1].x, a.x); 42 | v[1].y = max(v[1].y, a.y); 43 | } 44 | 45 | public: 46 | int2 v[2]; 47 | }; 48 | 49 | typedef BoundingBoxFloat BBoxF; 50 | typedef BoundingBoxInt BBoxI; 51 | 52 | } 53 | 54 | #endif -------------------------------------------------------------------------------- /gpu-scanline/src/rasterizer/shared/ras_pipeline_mode.h: 
-------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include 5 | 6 | namespace Mochimazui { 7 | 8 | // -------- -------- -------- -------- -------- -------- -------- -------- 9 | enum RasterizerPipelineMode { 10 | PM_Cut_No_AA, 11 | PM_Cut_Mask_Sample_Scanline, 12 | PM_Cut_Mask_Pixel_Scanline, 13 | PM_Cut_Mask_Comb_Scanline, 14 | }; 15 | 16 | // -------- -------- -------- -------- -------- -------- -------- -------- 17 | typedef RasterizerPipelineMode VGPipelineMode; 18 | 19 | // -------- -------- -------- -------- -------- -------- -------- -------- 20 | inline std::string ras_pipeline_mode_to_string(RasterizerPipelineMode rpm) { 21 | if (rpm == PM_Cut_No_AA) { 22 | return "cut fragment, no AA"; 23 | } 24 | else if (rpm == PM_Cut_Mask_Sample_Scanline) { 25 | return "cut fragment, per sample scanline"; 26 | } 27 | else if (rpm == PM_Cut_Mask_Pixel_Scanline) { 28 | return "cut fragment, per pixel scanline"; 29 | } 30 | else if (rpm == PM_Cut_Mask_Comb_Scanline) { 31 | return "cut fragment, comb scanline"; 32 | } 33 | else { 34 | throw std::runtime_error("ras_pipeline_mode_to_string: unsupported pipeline mode"); 35 | } 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /gpu-scanline/src/rasterizer/shared/ras_scan.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _MOCHIMAZUI_RASTERIZER_SHARED_SCAN_H_ 3 | #define _MOCHIMAZUI_RASTERIZER_SHARED_SCAN_H_ 4 | 5 | #include 6 | #include "thrust_impl.h" 7 | 8 | namespace Mochimazui { 9 | 10 | namespace Rasterizer { 11 | 12 | inline void escan_with_ret(int* p, int n, int *ret) { 13 | thrust_exclusive_scan((uint32_t*)p, n + 1, (uint32_t*)p); 14 | cudaMemcpy(ret, p + n, sizeof(int), cudaMemcpyDeviceToHost); 15 | } 16 | 17 | inline int escan(int* p, int n) { 18 | int ret = 0; 19 | thrust_exclusive_scan((uint32_t*)p, n + 1, (uint32_t*)p); 20 | cudaMemcpy(&ret, p + n, 
sizeof(int), cudaMemcpyDeviceToHost); 21 | return ret; 22 | } 23 | 24 | inline int escan(int* i, int *o, int n) { 25 | int ret = 0; 26 | thrust_exclusive_scan((uint32_t*)i, n + 1, (uint32_t*)o); 27 | cudaMemcpy(&ret, o + n, sizeof(int), cudaMemcpyDeviceToHost); 28 | return ret; 29 | } 30 | 31 | inline int iscan(int* i, int *o, int n) { 32 | int ret = 0; 33 | thrust_inclusive_scan((uint32_t*)i, n, (uint32_t*)o); 34 | cudaMemcpy(&ret, o + n - 1, sizeof(int), cudaMemcpyDeviceToHost); 35 | return ret; 36 | } 37 | 38 | } // end of namespace Rasterizers 39 | 40 | } // end of namespace Mochimazui 41 | 42 | #endif -------------------------------------------------------------------------------- /gpu-scanline/src/svg.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _MOCHIMAZUI_SVG_H_ 3 | #define _MOCHIMAZUI_SVG_H_ 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | #include 16 | 17 | #include "vg_container.h" 18 | 19 | #include "gradient.h" 20 | 21 | namespace Mochimazui { 22 | 23 | struct SVG { 24 | 25 | public: 26 | //SVG(); 27 | //~SVG(); 28 | 29 | public: 30 | 31 | void setA128(bool f) { _a128 = f; } 32 | 33 | void load(const stdext::string &fileName, bool gen_nvpr_path_commands = false); 34 | void save(const stdext::string &fileName); 35 | 36 | const std::shared_ptr &vgContainer() const { return _spVGContainer; } 37 | void setVg(std::shared_ptr &pVg); 38 | 39 | public: 40 | uint32_t width() { return _width; } 41 | uint32_t height() { return _height; } 42 | 43 | private: 44 | 45 | bool _a128 = false; 46 | 47 | uint32_t _width = 0, _height = 0; 48 | glm::vec2 _viewBox[2]; 49 | 50 | //std::vector _gradients; 51 | //std::map _gradientMap; 52 | 53 | std::shared_ptr _spVGContainer; 54 | }; 55 | 56 | } 57 | 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/option.h: 
-------------------------------------------------------------------------------- 1 | 2 | #ifndef _MOCHIMAZUI_OPTION_H_ 3 | #define _MOCHIMAZUI_OPTION_H_ 4 | 5 | #include 6 | #include 7 | #include "string.h" 8 | 9 | namespace Mochimazui { 10 | 11 | enum OptionType { 12 | Int, 13 | Float, 14 | String, 15 | 16 | IntArray, 17 | FloatArray, 18 | StringArray, 19 | }; 20 | 21 | class OptionInfo { 22 | std::string name; 23 | std::string shortcut; 24 | OptionType valueType; 25 | std::string value; 26 | }; 27 | 28 | template 29 | class OptionWithPointer { 30 | }; 31 | 32 | template 33 | class OptionWithReference { 34 | }; 35 | 36 | // -------- -------- -------- -------- -------- -------- -------- -------- 37 | class Option { 38 | 39 | public: 40 | Option &addOption(const std::string &name, OptionType type) { 41 | return *this; 42 | } 43 | 44 | template 45 | Option &addOption(const std::string &name, OptionType type, T*) { 46 | return *this; 47 | } 48 | 49 | template 50 | Option &addOption(const std::string &name, OptionType type, T&) { 51 | return *this; 52 | } 53 | 54 | //Option &addOption(const std::string &name, OptionType type, ) {} 55 | 56 | public: 57 | Option &operator()(int argc, char *argv[]) {} 58 | Option &operator()(const std::string &fileName) {} 59 | 60 | private: 61 | 62 | }; 63 | 64 | } 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/bitmap.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _MOCHIMAZUI_BITMAP_H_ 3 | #define _MOCHIMAZUI_BITMAP_H_ 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | #include "color.h" 12 | 13 | namespace Mochimazui { 14 | 15 | struct Bitmap { 16 | 17 | public: 18 | //Bitmap() {} 19 | //~Bitmap() {} 20 | 21 | public: 22 | //void load(const std::string &fileName); 23 | bool save(const std::string &fileName); 24 | 25 | void fill(const u8rgba &c) { 26 | for (uint32_t i = 0; i < 
_height; ++i) { 27 | for (uint32_t j = 0; j < _width; ++j) { 28 | pixel(j, i) = c; 29 | } 30 | } 31 | } 32 | 33 | void resize(int w, int h) { 34 | _width = w; 35 | _height = h; 36 | _pixel.resize(h*w); 37 | } 38 | 39 | uint32_t width() { return _width; } 40 | uint32_t height() { return _height; } 41 | 42 | const unsigned char * data() { return (const unsigned char*)_pixel.data(); } 43 | 44 | public: 45 | 46 | void setPixel(int x, int y, const u8rgba &color) { 47 | if (0 <= x && x < (int)_width && 0 <= y && y < (int)_height) { 48 | _pixel[y * _width + x] = color; 49 | } 50 | } 51 | 52 | u8rgba &pixel(int x, int y) { 53 | return _pixel[y * _width + x]; 54 | } 55 | 56 | const u8rgba &pixel(int x, int y) const { 57 | return _pixel[y * _width + x]; 58 | } 59 | 60 | private: 61 | uint32_t _width = 0; 62 | uint32_t _height = 0; 63 | std::vector _pixel; 64 | }; 65 | 66 | } 67 | 68 | #endif 69 | 70 | -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/camera_2d.h: -------------------------------------------------------------------------------- 1 | #ifndef _MOCHIMAZUI_CAMERA_2D_H_ 2 | #define _MOCHIMAZUI_CAMERA_2D_H_ 3 | 4 | #include 5 | #include 6 | 7 | namespace Mochimazui { 8 | 9 | using glm::vec2; 10 | using glm::mat3x3; 11 | 12 | class Camera2D { 13 | 14 | friend class CameraController2D; 15 | 16 | public: 17 | Camera2D() { 18 | } 19 | 20 | public: 21 | 22 | void reset() { 23 | _matrix = mat3x3(); 24 | } 25 | 26 | // 27 | void translate(const vec2 &t) { 28 | translate(t.x, t.y); 29 | } 30 | 31 | template 32 | void translate(const T &x, const T &y) { 33 | _matrix = mat3x3( 34 | 1, 0, 0, 35 | 0, 1, 0, 36 | x, y, 1 37 | ) * _matrix; 38 | } 39 | 40 | // 41 | template 42 | void scale(const T &sx, const T &sy, const vec2 &cp = vec2(0.f, 0.f)) { 43 | translate(-cp); 44 | _matrix = mat3x3( 45 | sx, 0, 0, 46 | 0, sy, 0, 47 | 0, 0, 1 48 | ) * _matrix; 49 | translate(cp); 50 | } 51 | 52 | template 53 | void scale(const T 
&s, const vec2 &cp = vec2(0.f, 0.f)) { 54 | scale(s, s, cp); 55 | } 56 | 57 | void scale(const vec2 &s, const vec2 &cp = vec2(0.f, 0.f)) { 58 | scale(s.x, s.y, cp); 59 | } 60 | 61 | // 62 | void rotate(const vec2 &c, float a) { 63 | } 64 | 65 | // 66 | glm::mat3x3 matrix() { 67 | return _matrix; 68 | } 69 | 70 | private: 71 | 72 | vec2 _scale; 73 | vec2 _translate; 74 | 75 | glm::mat3x3 _matrix; 76 | }; 77 | 78 | } 79 | 80 | #endif -------------------------------------------------------------------------------- /gpu-scanline/src/modern_gpu/include/kernels/cubradixsort.cuh: -------------------------------------------------------------------------------- 1 | #include "device/deviceutil.cuh" 2 | #include "cub/cub.cuh" 3 | 4 | namespace mgpu { 5 | 6 | template 7 | bool CubRadixSort(Key* keys_global, Key* keys2_global, int count, int beginBit, 8 | int endBit, CudaContext& context) { 9 | 10 | cub::DoubleBuffer keys(keys_global, keys2_global); 11 | 12 | size_t tempBytes = 0; 13 | cub::DeviceRadixSort::SortKeys(0, tempBytes, keys, count, beginBit, endBit, 14 | context.Stream()); 15 | 16 | MGPU_MEM(byte) tempDevice = context.Malloc(tempBytes); 17 | 18 | cub::DeviceRadixSort::SortKeys(tempDevice->get(), tempBytes, keys, count, 19 | beginBit, endBit, context.Stream()); 20 | MGPU_SYNC_CHECK("cub::DeviceRadixSort::SortKeys"); 21 | 22 | return 1 == keys.selector; 23 | } 24 | 25 | template 26 | bool CubRadixSort(Key* keys_global, Key* keys2_global, Value* values_global, 27 | Value* values2_global, int count, int beginBit, int endBit, 28 | CudaContext& context) { 29 | 30 | cub::DoubleBuffer keys(keys_global, keys2_global); 31 | cub::DoubleBuffer values(values_global, values2_global); 32 | 33 | size_t tempBytes = 0; 34 | cub::DeviceRadixSort::SortPairs(0, tempBytes, keys, values, count, 35 | beginBit, endBit, context.Stream()); 36 | 37 | MGPU_MEM(byte) tempDevice = context.Malloc(tempBytes); 38 | 39 | cub::DeviceRadixSort::SortPairs(tempDevice->get(), tempBytes, keys, values, 40 | 
count, beginBit, endBit, context.Stream()); 41 | MGPU_SYNC_CHECK("cub::DeviceRadixSort::SortPairs"); 42 | 43 | return 1 == keys.selector; 44 | } 45 | 46 | } // namespace mgpu 47 | -------------------------------------------------------------------------------- /working_directory/ui/ui.json: -------------------------------------------------------------------------------- 1 | { 2 | "window": { 3 | "id": "top", "type": "window", 4 | "layout": "horizontal", 5 | "width": 1200, "height": 1080, 6 | "title": "GLGUI - Test", 7 | "subwindows": [ 8 | { 9 | "id": "left-subwindow", 10 | "type": "frame", 11 | "width": 128, 12 | "size-policy-x": "fix", 13 | "border-width": 1, 14 | "layout": "vertical", 15 | "subwindows": [ 16 | { "id": "frag-size", "type": "label", "text": "Fragment size:" }, 17 | { "id": "frag-size-1", "type": "radio-button", "text": "1" }, 18 | { "id": "frag-size-2", "type": "radio-button", "text": "2" }, 19 | { "id": "frag-size-4", "type": "radio-button", "text": "4" }, 20 | { "id": "frag-size-8", "type": "radio-button", "text": "8" }, 21 | 22 | { "id": "h-line-1", "type": "horizontal-line","line-width": 2 }, 23 | { "id": "empty", "type": "label", "text": "Empty" }, 24 | { "id": "tb", "type": "push-button", "text": "Empty" }, 25 | { "id": "tcb", "type": "check-box", "text": "Empty" }, 26 | 27 | { "id": "h-line-2", "type": "horizontal-line","line-width": 2 }, 28 | { "id": "output-label", "type": "label", "text": "Output" }, 29 | { "id": "default", "type": "radio-button", "text": "Default" }, 30 | { "id": "per-sample-blend", "type": "radio-button", "text": "Per sample blend" } 31 | 32 | ] 33 | }, 34 | { "id": "vline", "type": "vertical-line","line-width": 2 }, 35 | { "id": "display","type": "subwindow" } 36 | ] 37 | }, 38 | 39 | "subwindows": { 40 | } 41 | } 42 | 43 | -------------------------------------------------------------------------------- /working_directory/vg_default.cfg: -------------------------------------------------------------------------------- 
1 | 2 | // -------- -------- -------- -------- -------- -------- -------- -------- 3 | // config 4 | 5 | --verbose 6 | --minimal-ui 7 | --show-fps 8 | //--gl-debug 9 | 10 | --reserve-ink 16384 11 | //--reserve-ink 0 12 | //--draw-curve 13 | 14 | --c-m-cs 15 | 16 | --samples 32 17 | //--samples 8 18 | 19 | --ms-output 20 | 21 | --srgb 22 | //--lrgb 23 | 24 | --window-width 1024 25 | --window-height 1024 26 | 27 | //--animation 28 | //--window-width 1920 29 | //--window-height 1080 30 | 31 | //--fit-to-window 32 | //--fit-to-vg 33 | 34 | //--stroke-to-fill 35 | 36 | //--save-output-file 37 | //--o ./output/hehe.png 38 | 39 | // -------- -------- -------- -------- -------- -------- -------- -------- 40 | // RVG input 41 | 42 | //-i ./input/rvg/embrace.rvg 43 | //-i ./input/rvg/tiger.rvg 44 | //-i ./input/rvg/reschart.rvg 45 | //-i ./input/rvg/hawaii.rvg 46 | //-i ./input/rvg/paper-1.rvg 47 | //-i ./input/rvg/chord.rvg 48 | //-i ./input/rvg/paris-30k.rvg 49 | //-i ./input/rvg/contour.rvg 50 | 51 | //-i ./input/rvg/boston.rvg 52 | //-i ./input/rvg/paper-2.rvg 53 | //-i ./input/rvg/car.rvg 54 | //-i ./input/rvg/paris-50k.rvg 55 | //-i ./input/rvg/paris-70k.rvg 56 | 57 | // -------- -------- -------- -------- -------- -------- -------- -------- 58 | // test input 59 | 60 | //-i ./input/quality_test/001.svg 61 | 62 | //-i ./input/quality_test/101.svg 63 | //-i ./input/quality_test/102.svg 64 | //-i ./input/quality_test/103.svg 65 | //-i ./input/quality_test/104.svg 66 | 67 | //-i ./input/quality_test/201.svg 68 | //-i ./input/quality_test/202.svg 69 | //-i ./input/quality_test/203.svg 70 | //-i ./input/quality_test/204.svg 71 | 72 | //-i ./input/quality_test/301.svg 73 | -------------------------------------------------------------------------------- /gpu-scanline/src/gradient.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _MOCHIMAZUI_GRADIENT_H_ 3 | #define _MOCHIMAZUI_GRADIENT_H_ 4 | 5 | #include 6 | #include 7 | 8 | 
#include 9 | 10 | #include 11 | 12 | namespace Mochimazui { 13 | 14 | enum GradientType { 15 | GT_Linear = 2, 16 | GT_Radial = 3 17 | }; 18 | 19 | enum GradientUnits { 20 | USER_SPACE_ON_USE, 21 | OBJECT_BOUNDING_BOX 22 | }; 23 | 24 | struct GradientStop { 25 | float offset; 26 | u8rgba color; 27 | float opacity = 1.f; 28 | }; 29 | 30 | // Gradient-related enumerations 31 | enum SpreadMethod { 32 | PAD, // clamp to edge 33 | REFLECT, // mirror 34 | REPEAT, // repeat 35 | NONE // clamp to border with (0,0,0,0) border 36 | }; 37 | 38 | inline bool operator < (const GradientStop &a, const GradientStop &b) { 39 | return a.offset < b.offset; 40 | } 41 | 42 | struct Gradient { 43 | 44 | GradientType gradient_type; 45 | GradientUnits gradient_units = USER_SPACE_ON_USE; 46 | glm::mat3x3 gradient_transform; // could be float4x4 47 | SpreadMethod spread_method = PAD; 48 | std::vector gradient_stops; 49 | 50 | std::string href; 51 | 52 | // Linear gradient attributes 53 | glm::vec2 v1, v2; 54 | 55 | // Radial gradient attributes 56 | glm::vec2 c; // center 57 | glm::vec2 f; // focal point 58 | float r; // radius 59 | 60 | bool f_set = false; 61 | 62 | void clear() { 63 | gradient_units = USER_SPACE_ON_USE; 64 | gradient_transform = glm::mat3x3(); 65 | spread_method = PAD; 66 | gradient_stops.clear(); 67 | href.clear(); 68 | 69 | f_set = false; 70 | v1 = v2 = c = f = glm::vec2(); 71 | r = 0.f; 72 | } 73 | 74 | }; 75 | 76 | } 77 | 78 | #endif 79 | -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/camera_controller_2d.h: -------------------------------------------------------------------------------- 1 | #ifndef _MOCHIMAZUI_CAMERA_CONTROLLER_2D_ 2 | #define _MOCHIMAZUI_CAMERA_CONTROLLER_2D_ 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include "camera_2d.h" 10 | 11 | namespace Mochimazui { 12 | 13 | using glm::vec2; 14 | using glm::ivec2; 15 | using std::map; 16 | 17 | class CameraController2D : public 
Camera2D { 18 | 19 | public: 20 | 21 | void leftButtonDown(int x, int y) { 22 | _leftButton = true; 23 | _lastPos = _leftButtonClickPos = ivec2(x, y); 24 | } 25 | 26 | void leftButtonUp(int x, int y) { 27 | _leftButton = false; 28 | } 29 | 30 | void rightButtonDown(int x, int y) { 31 | _rightButton = true; 32 | _lastPos = _rightButtonClickPos = ivec2(x, y); 33 | } 34 | 35 | void rightButtonUp(int x, int y) { 36 | _rightButton = false; 37 | } 38 | 39 | void wheel(float dy) { 40 | wheel(0.f, dy); 41 | } 42 | 43 | void wheel(float dx, float dy) { 44 | if (dy > 0) { 45 | scale(1.1f, _lastPos); 46 | } 47 | else { 48 | scale(0.9f, _lastPos); 49 | } 50 | } 51 | 52 | void move(int x, int y) { 53 | ivec2 cp(x, y); 54 | if (_leftButton || _rightButton) { 55 | auto delta = cp - _lastPos; 56 | Camera2D::translate(delta); 57 | } 58 | _lastPos = cp; 59 | } 60 | 61 | void keyDown(int key) { 62 | _keyMap[key] = true; 63 | } 64 | 65 | void keyUp(int key) { 66 | _keyMap[key] = false; 67 | } 68 | 69 | private: 70 | 71 | private: 72 | 73 | bool _leftButton = false; 74 | bool _rightButton = false; 75 | 76 | ivec2 _leftButtonClickPos; 77 | ivec2 _rightButtonClickPos; 78 | 79 | ivec2 _lastPos; 80 | 81 | map _keyMap; 82 | }; 83 | } 84 | 85 | #endif -------------------------------------------------------------------------------- /working_directory/shader/shared/integrate_samples.frag.glsl: -------------------------------------------------------------------------------- 1 | 2 | #version 450 3 | 4 | layout(binding = 0) uniform sampler2DMS tex_msaa; 5 | 6 | uniform bool enable_srgb_correction; 7 | uniform int samples; 8 | 9 | layout(location = 0) out vec4 out_color; 10 | 11 | // -------- -------- -------- -------- -------- -------- -------- -------- 12 | float lrgb_to_srgb_f(float f) { 13 | if (f <= 0.0031308f) { return 12.92f*f; } 14 | else { return (1.f + 0.055f)*pow(f, 1.f / 2.4f) - 0.055f; } 15 | } 16 | 17 | vec4 lrgb_to_srgb(const vec4 c) { 18 | return vec4( 19 | lrgb_to_srgb_f(c.r), 20 
| lrgb_to_srgb_f(c.g), 21 | lrgb_to_srgb_f(c.b), 22 | c.a 23 | ); 24 | } 25 | 26 | // -------- -------- -------- -------- -------- -------- -------- -------- 27 | float srgb_to_lrgb_f(float f) { 28 | if (f <= 0.04045f) { return f / 12.92f; } 29 | else { return pow((f + 0.055f) / (1.f + 0.055f), 2.4f); } 30 | } 31 | 32 | vec4 srgb_to_lrgb(vec4 c) { 33 | return vec4( 34 | srgb_to_lrgb_f(c.r), 35 | srgb_to_lrgb_f(c.g), 36 | srgb_to_lrgb_f(c.b), 37 | c.a 38 | ); 39 | } 40 | 41 | // -------- -------- -------- -------- -------- -------- -------- -------- 42 | void main() { 43 | 44 | ivec2 xy = ivec2( gl_FragCoord.x, gl_FragCoord.y); 45 | vec4 acc = vec4(0, 0, 0, 0); 46 | 47 | //if (enable_srgb_correction) { 48 | if (true) { 49 | //if (false) { 50 | for (int i = 0; i 6 | #include 7 | #include 8 | #include 9 | #pragma warning( pop ) 10 | 11 | #include "cuda/cuda_cached_allocator.h" 12 | 13 | namespace Mochimazui { 14 | 15 | // -------- -------- -------- -------- -------- -------- -------- -------- 16 | void thrust_exclusive_scan(int8_t *ibegin, uint32_t number, int8_t *obegin) { 17 | thrust::exclusive_scan(thrust::cuda::par(g_alloc), 18 | ibegin, ibegin + number, obegin, 0); 19 | } 20 | 21 | void thrust_exclusive_scan(uint8_t *ibegin, uint32_t number, uint8_t *obegin) { 22 | thrust::exclusive_scan(thrust::cuda::par(g_alloc), 23 | ibegin, ibegin + number, obegin, 0); 24 | } 25 | 26 | // -------- -------- -------- -------- -------- -------- -------- -------- 27 | void thrust_exclusive_scan(int32_t *ibegin, uint32_t number, int32_t *obegin) { 28 | thrust::exclusive_scan(thrust::cuda::par(g_alloc), 29 | ibegin, ibegin + number, obegin, 0); 30 | } 31 | 32 | void thrust_exclusive_scan(uint32_t *ibegin, uint32_t number, uint32_t *obegin) { 33 | thrust::exclusive_scan(thrust::cuda::par(g_alloc), 34 | ibegin, ibegin + number, obegin, 0); 35 | } 36 | 37 | // -------- -------- -------- -------- -------- -------- -------- -------- 38 | void thrust_exclusive_scan(float *ibegin, 
uint32_t number, float *obegin) { 39 | thrust::exclusive_scan(thrust::cuda::par(g_alloc), 40 | ibegin, ibegin + number, obegin, 0); 41 | } 42 | 43 | // -------- -------- -------- -------- -------- -------- -------- -------- 44 | void thrust_inclusive_scan(int32_t *ibegin, uint32_t number, int32_t *obegin) { 45 | thrust::inclusive_scan(thrust::cuda::par(g_alloc), 46 | ibegin, ibegin + number, obegin); 47 | } 48 | 49 | void thrust_inclusive_scan(uint32_t *ibegin, uint32_t number, uint32_t *obegin) { 50 | thrust::inclusive_scan(thrust::cuda::par(g_alloc), 51 | ibegin, ibegin + number, obegin); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /working_directory/shader/R_cut_A_stencil/output_8.frag.glsl: -------------------------------------------------------------------------------- 1 | 2 | #version 430 3 | 4 | // -------- -------- -------- -------- 5 | layout(binding = 4) uniform samplerBuffer tex_table; 6 | layout(binding = 5) uniform sampler2D tex_ramp; 7 | layout(binding = 6) uniform sampler2DMS path_frag_tex; 8 | 9 | uniform bool enable_srgb_correction; 10 | 11 | // -------- -------- -------- -------- 12 | flat in vec4 fragment_color; 13 | 14 | flat in ivec2 path_frag_pos; 15 | 16 | flat in int pixel_mask; 17 | 18 | // -------- -------- -------- -------- -------- -------- -------- -------- 19 | in vec3 gradient_coord_0; 20 | in vec3 gradient_coord_1; 21 | flat in vec3 gradient_ramp_coord; 22 | flat in vec3 gradient_focal_point; 23 | 24 | // -------- -------- -------- -------- 25 | layout(location = 0) out vec4 out_color; 26 | 27 | // -------- -------- -------- -------- 28 | float safeRcpP(float a) { return a > 1e-6 ? 
1 / a : 0.0; } 29 | 30 | // -------- -------- -------- -------- -------- -------- -------- -------- 31 | float srgb_to_lrgb_f(float f) { 32 | if (f <= 0.04045f) { return f / 12.92f; } 33 | else { return pow((f + 0.055f) / (1.f + 0.055f), 2.4f); } 34 | } 35 | 36 | vec4 srgb_to_lrgb(vec4 c) { 37 | return vec4(srgb_to_lrgb_f(c.r), srgb_to_lrgb_f(c.g), srgb_to_lrgb_f(c.b), c.a); 38 | } 39 | 40 | // -------- -------- -------- -------- 41 | void main() { 42 | 43 | ivec2 in_frag_pos = ivec2(gl_FragCoord.xy) - path_frag_pos; 44 | 45 | if (gradient_ramp_coord.z > 0.0) { 46 | vec3 gradient_coord = in_frag_pos.y == 0 ? gradient_coord_0 : gradient_coord_1; 47 | vec2 d = gradient_coord.xy - gradient_focal_point.xy; 48 | float A = dot(d, d), B = dot(d, gradient_focal_point.xy); 49 | float c = min(A*safeRcpP(sqrt(B*B + A*gradient_focal_point.z) - B), 1.0); 50 | out_color = textureLod(tex_ramp, gradient_ramp_coord.xy + vec2(c*gradient_ramp_coord.z, 0.0), 0); 51 | } 52 | else { 53 | out_color = fragment_color; 54 | } 55 | 56 | int mask_shift = (in_frag_pos.x * 2 + in_frag_pos.y) * 8; 57 | int count = bitCount((pixel_mask >> mask_shift) & 0xFF); 58 | out_color.a *= count / 8.0; 59 | } 60 | -------------------------------------------------------------------------------- /working_directory/shader/R_cut_A_stencil/ms_output_32.frag.glsl: -------------------------------------------------------------------------------- 1 | 2 | #version 430 3 | 4 | // -------- -------- -------- -------- 5 | layout(binding = 4) uniform samplerBuffer tex_table; 6 | layout(binding = 5) uniform sampler2D tex_ramp; 7 | layout(binding = 6) uniform sampler2DMS path_frag_tex; 8 | 9 | uniform bool enable_srgb_correction; 10 | 11 | // -------- -------- -------- -------- 12 | flat in vec4 fragment_color; 13 | 14 | flat in ivec2 path_frag_pos; 15 | 16 | flat in ivec4 pixel_mask; 17 | 18 | // -------- -------- -------- -------- -------- -------- -------- -------- 19 | in vec3 gradient_coord_0; 20 | in vec3 
gradient_coord_1; 21 | flat in vec3 gradient_ramp_coord; 22 | flat in vec3 gradient_focal_point; 23 | 24 | // -------- -------- -------- -------- 25 | layout(location = 0) out vec4 out_color; 26 | 27 | // -------- -------- -------- -------- 28 | float safeRcpP(float a) { return a > 1e-6 ? 1 / a : 0.0; } 29 | 30 | // -------- -------- -------- -------- -------- -------- -------- -------- 31 | float srgb_to_lrgb_f(float f) { 32 | if (f <= 0.04045f) { return f / 12.92f; } 33 | else { return pow((f + 0.055f) / (1.f + 0.055f), 2.4f); } 34 | } 35 | 36 | vec4 srgb_to_lrgb(vec4 c) { 37 | return vec4( srgb_to_lrgb_f(c.r), srgb_to_lrgb_f(c.g), srgb_to_lrgb_f(c.b), c.a ); 38 | } 39 | 40 | // -------- -------- -------- -------- 41 | void main() { 42 | 43 | ivec2 in_frag_pos = ivec2(gl_FragCoord.xy) - path_frag_pos; 44 | 45 | if (gradient_ramp_coord.z > 0.0) { 46 | vec3 gradient_coord = in_frag_pos.y == 0 ? gradient_coord_0 : gradient_coord_1; 47 | vec2 d = gradient_coord.xy - gradient_focal_point.xy; 48 | float A = dot(d, d), B = dot(d, gradient_focal_point.xy); 49 | float c = min(A*safeRcpP(sqrt(B*B + A*gradient_focal_point.z) - B), 1.0); 50 | out_color = textureLod(tex_ramp, gradient_ramp_coord.xy + vec2(c*gradient_ramp_coord.z, 0.0), 0); 51 | } 52 | else { 53 | out_color = fragment_color; 54 | } 55 | 56 | ivec2 sub_pixel_index = ivec2(mod(gl_FragCoord, 1) * 2); 57 | 58 | int mask_index = in_frag_pos.x * 2 + in_frag_pos.y; 59 | gl_SampleMask[0] = pixel_mask[mask_index]; 60 | } 61 | -------------------------------------------------------------------------------- /working_directory/shader/R_cut_A_stencil/ms_output_8.frag.glsl: -------------------------------------------------------------------------------- 1 | 2 | #version 430 3 | 4 | // -------- -------- -------- -------- 5 | layout(binding = 4) uniform samplerBuffer tex_table; 6 | layout(binding = 5) uniform sampler2D tex_ramp; 7 | layout(binding = 6) uniform sampler2DMS path_frag_tex; 8 | 9 | uniform bool 
enable_srgb_correction; 10 | 11 | // -------- -------- -------- -------- 12 | flat in vec4 fragment_color; 13 | 14 | flat in ivec2 path_frag_pos; 15 | 16 | flat in int pixel_mask; 17 | 18 | // -------- -------- -------- -------- -------- -------- -------- -------- 19 | in vec3 gradient_coord_0; 20 | in vec3 gradient_coord_1; 21 | flat in vec3 gradient_ramp_coord; 22 | flat in vec3 gradient_focal_point; 23 | 24 | // -------- -------- -------- -------- 25 | layout(location = 0) out vec4 out_color; 26 | 27 | // -------- -------- -------- -------- 28 | float safeRcpP(float a) { return a > 1e-6 ? 1 / a : 0.0; } 29 | 30 | // -------- -------- -------- -------- -------- -------- -------- -------- 31 | float srgb_to_lrgb_f(float f) { 32 | if (f <= 0.04045f) { return f / 12.92f; } 33 | else { return pow((f + 0.055f) / (1.f + 0.055f), 2.4f); } 34 | } 35 | 36 | vec4 srgb_to_lrgb(vec4 c) { 37 | return vec4( srgb_to_lrgb_f(c.r), srgb_to_lrgb_f(c.g), srgb_to_lrgb_f(c.b), c.a ); 38 | } 39 | 40 | // -------- -------- -------- -------- 41 | void main() { 42 | 43 | ivec2 in_frag_pos = ivec2(gl_FragCoord.xy) - path_frag_pos; 44 | 45 | if (gradient_ramp_coord.z > 0.0) { 46 | vec3 gradient_coord = in_frag_pos.y == 0 ? 
gradient_coord_0 : gradient_coord_1; 47 | vec2 d = gradient_coord.xy - gradient_focal_point.xy; 48 | float A = dot(d, d), B = dot(d, gradient_focal_point.xy); 49 | float c = min(A*safeRcpP(sqrt(B*B + A*gradient_focal_point.z) - B), 1.0); 50 | out_color = textureLod(tex_ramp, gradient_ramp_coord.xy + vec2(c*gradient_ramp_coord.z, 0.0), 0); 51 | } 52 | else { 53 | out_color = fragment_color; 54 | } 55 | 56 | ivec2 sub_pixel_index = ivec2(mod(gl_FragCoord, 1) * 2); 57 | 58 | int mask_shift = (in_frag_pos.x * 2 + in_frag_pos.y) * 8; 59 | 60 | gl_SampleMask[0] = (pixel_mask >> mask_shift) & 0xFF; 61 | } 62 | -------------------------------------------------------------------------------- /working_directory/shader/R_cut_A_stencil/output_32.frag.glsl: -------------------------------------------------------------------------------- 1 | 2 | #version 430 3 | 4 | // -------- -------- -------- -------- 5 | layout(binding = 4) uniform samplerBuffer tex_table; 6 | layout(binding = 5) uniform sampler2D tex_ramp; 7 | layout(binding = 6) uniform sampler2DMS path_frag_tex; 8 | 9 | uniform bool enable_srgb_correction; 10 | 11 | // -------- -------- -------- -------- 12 | flat in vec4 fragment_color; 13 | 14 | flat in ivec2 path_frag_pos; 15 | 16 | flat in ivec4 pixel_mask; 17 | 18 | // -------- -------- -------- -------- -------- -------- -------- -------- 19 | in vec3 gradient_coord_0; 20 | in vec3 gradient_coord_1; 21 | flat in vec3 gradient_ramp_coord; 22 | flat in vec3 gradient_focal_point; 23 | 24 | // -------- -------- -------- -------- 25 | layout(location = 0) out vec4 out_color; 26 | 27 | // -------- -------- -------- -------- 28 | float safeRcpP(float a) { return a > 1e-6 ? 
1 / a : 0.0; } 29 | 30 | // -------- -------- -------- -------- -------- -------- -------- -------- 31 | float srgb_to_lrgb_f(float f) { 32 | if (f <= 0.04045f) { return f / 12.92f; } 33 | else { return pow((f + 0.055f) / (1.f + 0.055f), 2.4f); } 34 | } 35 | 36 | vec4 srgb_to_lrgb(vec4 c) { 37 | return vec4(srgb_to_lrgb_f(c.r), srgb_to_lrgb_f(c.g), srgb_to_lrgb_f(c.b), c.a); 38 | } 39 | 40 | // -------- -------- -------- -------- 41 | void main() { 42 | 43 | ivec2 in_frag_pos = ivec2(gl_FragCoord.xy) - path_frag_pos; 44 | 45 | if (gradient_ramp_coord.z > 0.0) { 46 | vec3 gradient_coord = in_frag_pos.y == 0 ? gradient_coord_0 : gradient_coord_1; 47 | vec2 d = gradient_coord.xy - gradient_focal_point.xy; 48 | float A = dot(d, d), B = dot(d, gradient_focal_point.xy); 49 | float c = min(A*safeRcpP(sqrt(B*B + A*gradient_focal_point.z) - B), 1.0); 50 | out_color = textureLod(tex_ramp, gradient_ramp_coord.xy + vec2(c*gradient_ramp_coord.z, 0.0), 0); 51 | } 52 | else { 53 | out_color = fragment_color; 54 | } 55 | 56 | ivec2 sub_pixel_index = ivec2(mod(gl_FragCoord, 1) * 2); 57 | 58 | int mask_index = in_frag_pos.x * 2 + in_frag_pos.y; 59 | 60 | int count = bitCount(pixel_mask[mask_index]); 61 | 62 | out_color.a *= count / 32.0; 63 | } 64 | -------------------------------------------------------------------------------- /working_directory/shader/shared/fps.frag.glsl: -------------------------------------------------------------------------------- 1 | 2 | #version 450 3 | 4 | uniform ivec2 vp_size; 5 | uniform int fps; 6 | 7 | #define SIZE 8 8 | 9 | layout(location = 0) out vec4 color; 10 | 11 | int fps_mask[3] = { 12 | 0x13CF, // 001 0011 1100 1111 13 | 0x13EF, // 001 0011 1110 1111 14 | 0x79CF // 111 1001 1100 1111 15 | }; 16 | 17 | int number_mask[10] = { 18 | 0x7B6F, // 111 1011 0110 1111 19 | 0x4924, // 100 1001 0010 0100 20 | 0x73E7, // 111 0011 1110 0111 21 | 0x79E7, // 111 1001 1110 0111 22 | 0x49ED, // 100 1001 1110 1101 23 | 0x79CF, // 111 1001 1100 1111 24 | 
0x7BCF, // 111 1011 1100 1111 25 | 0x4927, // 100 1001 0010 0111 26 | 0x7BEF, // 111 1011 1110 1111 27 | 0x79EF // 111 1001 1110 1111 28 | }; 29 | 30 | int mask = 0; 31 | 32 | bool check(int x, int y) { 33 | if (y < 0 || y >=(SIZE * 5) || x < 0 || x >=( SIZE * 3)) { 34 | return false; 35 | } 36 | x /= SIZE; 37 | y /= SIZE; 38 | return (mask >> (y * 3 + x) & 1) == 1; 39 | } 40 | 41 | void main(){ 42 | 43 | vec2 pos; 44 | 45 | if (gl_FragCoord.x < SIZE * 2) { discard; } 46 | 47 | pos.x = gl_FragCoord.x - SIZE * 2; 48 | pos.y = vp_size.y - gl_FragCoord.y - SIZE * 2; 49 | 50 | int char_index = int((pos.x / (SIZE * 4)) + 1); 51 | pos.x = mod(pos.x, (SIZE * 4)); 52 | 53 | if (pos.y < 0 || pos.y > (SIZE * 5) || pos.x < 0 || pos.x > SIZE * 3) { discard; } 54 | 55 | color = vec4(1, 1, 1, 1); 56 | 57 | if (char_index > 4 && char_index <=7) { 58 | mask = fps_mask[char_index - 5]; 59 | } 60 | else if(char_index <=3) { 61 | if (char_index == 1 && fps >= 100) { 62 | mask = number_mask[fps / 100]; 63 | } 64 | if (char_index == 2 && fps >= 10) { 65 | mask = number_mask[(fps / 10) % 10]; 66 | } 67 | if(char_index == 3) { 68 | mask = number_mask[fps % 10]; 69 | } 70 | } 71 | else { 72 | mask = 0; 73 | } 74 | 75 | if (check(int(pos.x), int(pos.y))) { 76 | 77 | bool flag = true; 78 | 79 | for (int dx = -2; dx <= 2; ++dx) { 80 | for (int dy = -2; dy <= 2; ++dy) { 81 | flag = flag && check( 82 | int(pos.x) + dx, int(pos.y) + dy); 83 | } 84 | } 85 | 86 | if (!flag) { 87 | color = vec4(0, 0, 0, 1); 88 | } 89 | } 90 | else { 91 | discard; 92 | } 93 | }; 94 | -------------------------------------------------------------------------------- /working_directory/shader/R_cut_A_stencil/ms_output_32.frag.glsl.before_368.22: -------------------------------------------------------------------------------- 1 | 2 | #version 430 3 | 4 | // -------- -------- -------- -------- 5 | layout(binding = 4) uniform samplerBuffer tex_table; 6 | layout(binding = 5) uniform sampler2D tex_ramp; 7 | layout(binding = 
6) uniform sampler2DMS path_frag_tex; 8 | 9 | uniform bool enable_srgb_correction; 10 | 11 | // -------- -------- -------- -------- 12 | flat in vec4 fragment_color; 13 | 14 | flat in ivec2 path_frag_pos; 15 | 16 | flat in ivec4 pixel_mask; 17 | 18 | // -------- -------- -------- -------- -------- -------- -------- -------- 19 | in vec3 gradient_coord_0; 20 | in vec3 gradient_coord_1; 21 | flat in vec3 gradient_ramp_coord; 22 | flat in vec3 gradient_focal_point; 23 | 24 | // -------- -------- -------- -------- 25 | layout(location = 0) out vec4 out_color; 26 | 27 | // -------- -------- -------- -------- 28 | float safeRcpP(float a) { return a > 1e-6 ? 1 / a : 0.0; } 29 | 30 | // -------- -------- -------- -------- -------- -------- -------- -------- 31 | float srgb_to_lrgb_f(float f) { 32 | if (f <= 0.04045f) { return f / 12.92f; } 33 | else { return pow((f + 0.055f) / (1.f + 0.055f), 2.4f); } 34 | } 35 | 36 | vec4 srgb_to_lrgb(vec4 c) { 37 | return vec4( srgb_to_lrgb_f(c.r), srgb_to_lrgb_f(c.g), srgb_to_lrgb_f(c.b), c.a ); 38 | } 39 | 40 | // -------- -------- -------- -------- 41 | void main() { 42 | 43 | ivec2 in_frag_pos = ivec2(gl_FragCoord.xy) - path_frag_pos; 44 | 45 | if (gradient_ramp_coord.z > 0.0) { 46 | vec3 gradient_coord = in_frag_pos.y == 0 ? 
gradient_coord_0 : gradient_coord_1; 47 | vec2 d = gradient_coord.xy - gradient_focal_point.xy; 48 | float A = dot(d, d), B = dot(d, gradient_focal_point.xy); 49 | float c = min(A*safeRcpP(sqrt(B*B + A*gradient_focal_point.z) - B), 1.0); 50 | out_color = textureLod(tex_ramp, gradient_ramp_coord.xy + vec2(c*gradient_ramp_coord.z, 0.0), 0); 51 | } 52 | else { 53 | out_color = fragment_color; 54 | } 55 | 56 | ivec2 sub_pixel_index = ivec2(mod(gl_FragCoord, 1) * 2); 57 | 58 | int mask_index = in_frag_pos.x * 2 + in_frag_pos.y; 59 | int mask_shift = (sub_pixel_index.x * 2 + sub_pixel_index.y) * 8; 60 | 61 | gl_SampleMask[0] = pixel_mask[mask_index] >> mask_shift; 62 | } 63 | -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/camera_controller_3d.h: -------------------------------------------------------------------------------- 1 | #ifndef _MOCHIMAZUI_CAMERA_CONTROLLER_3D_ 2 | #define _MOCHIMAZUI_CAMERA_CONTROLLER_3D_ 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include "camera_3d.h" 10 | 11 | namespace Mochimazui { 12 | 13 | using glm::vec2; 14 | using glm::ivec2; 15 | using std::map; 16 | 17 | enum CameraControllerMode { 18 | CCM_NULL, 19 | CCM_MOVE, 20 | CCM_TURN, 21 | CCM_ROTATE, 22 | CCM_WALK, 23 | }; 24 | 25 | class CameraController3D : public Camera3D { 26 | 27 | public: 28 | void setControllerMode(int nm) { 29 | _controllerMode = nm; 30 | } 31 | 32 | public: 33 | 34 | void init(int width, int height); 35 | void fitToView(int vWidth, int vHeight); 36 | 37 | public: 38 | void leftButtonDown(int x, int y); 39 | void leftButtonUp(int x, int y); 40 | 41 | void rightButtonDown(int x, int y); 42 | void rightButtonUp(int x, int y); 43 | 44 | void move(int x, int y); 45 | 46 | void wheel(float dy) { 47 | wheel(0.f, dy); 48 | } 49 | 50 | void wheel(float dx, float dy) { 51 | if (dy > 0) { 52 | scale(1.05f, glm::vec3(_lastPos.x, _lastPos.y, 0.0f)); 53 | } 54 | else { 55 | scale(0.95f, 
glm::vec3(_lastPos.x, _lastPos.y, 0.0f)); 56 | } 57 | } 58 | 59 | void keyDown(int key) { 60 | _keyMap[key] = true; 61 | } 62 | 63 | void keyUp(int key) { 64 | _keyMap[key] = false; 65 | } 66 | 67 | glm::mat4x4 modelViewMatrix(); 68 | glm::mat4x4 projectionMatrix(); 69 | 70 | private: 71 | 72 | void handleMove(int x, int y); 73 | void handleTurn(int x, int y); 74 | void handleRotate(int x, int y); 75 | 76 | public: 77 | 78 | int _controllerMode = CCM_MOVE; 79 | 80 | bool _leftButton = false; 81 | bool _rightButton = false; 82 | 83 | ivec2 _leftButtonClickPos; 84 | ivec2 _rightButtonClickPos; 85 | 86 | ivec2 _lastPos; 87 | 88 | int _sceneWidth; 89 | int _sceneHeight; 90 | 91 | map _keyMap; 92 | 93 | glm::vec3 _rotateCenter; 94 | glm::vec3 _rotateNormal; 95 | glm::vec3 _walky; 96 | glm::vec3 _walkx; 97 | 98 | }; 99 | } 100 | 101 | #endif -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/stdio_ext.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #ifndef _MOCHIMAZUI_STDIO_EXT_ 5 | #define _MOCHIMAZUI_STDIO_EXT_ 6 | 7 | #if defined _WIN32 || defined _WIN64 8 | #define MOCHIMAZUI_WINDOWS 9 | #include 10 | #undef min 11 | #undef max 12 | #endif 13 | 14 | namespace Mochimazui { 15 | namespace stdext { 16 | 17 | namespace stdext_private { 18 | #ifdef MOCHIMAZUI_WINDOWS 19 | inline HANDLE console_handle() { 20 | static HANDLE s_h_console = GetStdHandle(STD_OUTPUT_HANDLE); 21 | return s_h_console; 22 | } 23 | inline void set_console_text_attribute(WORD a) { 24 | //#define FOREGROUND_BLUE 0x0001 // text color contains blue. 25 | //#define FOREGROUND_GREEN 0x0002 // text color contains green. 26 | //#define FOREGROUND_RED 0x0004 // text color contains red. 27 | //#define FOREGROUND_INTENSITY 0x0008 // text color is intensified. 28 | //#define BACKGROUND_BLUE 0x0010 // background color contains blue. 
29 | //#define BACKGROUND_GREEN 0x0020 // background color contains green. 30 | //#define BACKGROUND_RED 0x0040 // background color contains red. 31 | //#define BACKGROUND_INTENSITY 0x0080 // background color is intensified. 32 | SetConsoleTextAttribute(console_handle(), a); 33 | } 34 | #else 35 | inline void set_console_text_attribute(uint32_t) { 36 | } 37 | #endif 38 | } 39 | 40 | template 41 | inline void color_printf(int text_color, int background_color, const char *fmt_str, Ts... args) { 42 | stdext_private::set_console_text_attribute((background_color << 4) | text_color); 43 | printf(fmt_str, args...); 44 | stdext_private::set_console_text_attribute(7); 45 | } 46 | 47 | template 48 | inline void error_printf(const char *fmt_str, Ts... args) { 49 | stdext_private::set_console_text_attribute((12 << 4) | 15); 50 | fprintf(stderr, fmt_str, args...); 51 | stdext_private::set_console_text_attribute(7); 52 | } 53 | 54 | template 55 | inline void warning_printf(const char *fmt_str, Ts... args) { 56 | stdext_private::set_console_text_attribute((6 << 4) | 15); 57 | fprintf(stderr, fmt_str, args...); 58 | stdext_private::set_console_text_attribute(7); 59 | } 60 | 61 | template 62 | inline void info_printf(const char *fmt_str, Ts... 
args) { 63 | stdext_private::set_console_text_attribute((10 << 4) | 15); 64 | fprintf(stderr, fmt_str, args...); 65 | stdext_private::set_console_text_attribute(7); 66 | } 67 | 68 | } 69 | } 70 | 71 | #endif -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/bitmap.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "bitmap.h" 3 | 4 | #include 5 | 6 | #if defined _WIN32 || defined _WIN64 7 | #include 8 | #endif 9 | 10 | namespace Mochimazui { 11 | 12 | using std::string; 13 | 14 | #if defined _WIN32 || defined _WIN64 15 | #else 16 | typedef int32_t WORD; 17 | typedef int64_t DWORD; 18 | typedef int64_t LONG; 19 | 20 | typedef struct tagBITMAPFILEHEADER { 21 | WORD bfType; 22 | DWORD bfSize; 23 | WORD bfReserved1; 24 | WORD bfReserved2; 25 | DWORD bfOffBits; 26 | } BITMAPFILEHEADER, *PBITMAPFILEHEADER; 27 | 28 | typedef struct tagBITMAPINFOHEADER { 29 | DWORD biSize; 30 | LONG biWidth; 31 | LONG biHeight; 32 | WORD biPlanes; 33 | WORD biBitCount; 34 | DWORD biCompression; 35 | DWORD biSizeImage; 36 | LONG biXPelsPerMeter; 37 | LONG biYPelsPerMeter; 38 | DWORD biClrUsed; 39 | DWORD biClrImportant; 40 | } BITMAPINFOHEADER, *PBITMAPINFOHEADER; 41 | #endif 42 | 43 | //void Bitmap::load(const std::string &fileName) { 44 | //} 45 | 46 | bool Bitmap::save(const std::string &fileName) { 47 | 48 | BITMAPFILEHEADER bfh; 49 | BITMAPINFOHEADER bih; 50 | 51 | // fill info header 52 | bih.biSize = sizeof(BITMAPINFOHEADER); 53 | 54 | bih.biWidth = _width; 55 | bih.biHeight = _height; 56 | bih.biPlanes = 1; 57 | 58 | bih.biBitCount = 32; 59 | bih.biCompression = 0; 60 | bih.biSizeImage = _width*_height * 4; 61 | 62 | bih.biXPelsPerMeter = 1; 63 | bih.biYPelsPerMeter = 1; 64 | 65 | bih.biClrUsed = 0; 66 | bih.biClrImportant = 0; 67 | 68 | // fill file header 69 | bfh.bfType = 0x4D42; 70 | bfh.bfReserved1 = 0; 71 | bfh.bfReserved2 = 0; 72 | bfh.bfOffBits = 
sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER); 73 | bfh.bfSize = bfh.bfOffBits + bih.biSizeImage; 74 | 75 | FILE* hFile; 76 | #ifdef _MSC_VER 77 | fopen_s(&hFile, fileName.c_str(), "wb"); 78 | #else 79 | hFile = fopen(fileName.c_str(), "wb"); 80 | #endif 81 | if (!hFile) { return false; } 82 | 83 | fwrite(&bfh, sizeof(BITMAPFILEHEADER), 1, hFile); 84 | fwrite(&bih, sizeof(BITMAPINFOHEADER), 1, hFile); 85 | 86 | auto outputPixel = _pixel; 87 | //for (uint32_t y = 0; y < _height; ++y) { 88 | // auto y0 = y; 89 | // auto y1 = _height - y0 - 1; 90 | // for (uint32_t x = 0; x < _width; ++x) { 91 | // outputPixel[y0 * _width + x] = _pixel[y1 * _width + x]; 92 | // } 93 | //} 94 | 95 | fwrite(outputPixel.data(), bih.biSizeImage, 1, hFile); 96 | fclose(hFile); 97 | 98 | return true; 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Efficient GPU Path Rendering Using Scanline Rasterization 2 | 3 | SIGGRAPH Asia 2016 project. 4 | 5 | Please download code together with dependency and test data from http://gaps-zju.org/pathrendering/ 6 | 7 | ## Pre-built Binary 8 | 9 | ./x64/release/gpu-scanline.exe. Tested on 64 bit Windows 10. Copy to ./working_directory to run.
10 | Built with Visual Studio 2013 and CUDA 7.5.
11 | Requires an NVIDIA graphics card with CUDA sm_50 and OpenGL 4.5 support. 12 | 13 | Right button: move.
14 | Mouse wheel: scale.
15 | Left button: draw. 16 | 17 | ## Build Dependency 18 | 19 | * Visual Studio 2013/2015 20 | * CUDA 7.5/8.0, lower versions may work as well. 21 | * Thrust 22 | 23 | ---- 24 | Open source code and pre-built binaries included in /3rd 25 | 26 | * [SDL 2.0.3](https://www.libsdl.org/) for basic window system and UI. 27 | * [Boost 1.60.0](http://www.boost.org/) for command line options. 28 | 29 | Libraries are built on a 64-bit Windows 10 system with Visual Studio 2013. 30 | You may need to download or build these libraries on your own system. 31 | 32 | ---- 33 | Other included open source code 34 | 35 | * [Modern GPU](https://nvlabs.github.io/moderngpu/) for segmented sort. 36 | * [glm 0.9.6.3](http://www.g-truc.net/) for vector and matrix. 37 | * [stb](https://github.com/nothings/stb) for image and font. 38 | * [rapidxml](https://github.com/dwd/rapidxml) for SVG parsing. 39 | 40 | ---- 41 | Code generator used 42 | 43 | * [glLoadGen](https://bitbucket.org/alfonse/glloadgen/wiki/Home) for OpenGL functions. 44 | 45 | ## Build 46 | 47 | Open in Visual Studio.
48 | Check if "Properties -> CUDA C/C++ -> Device -> Code Generation" matches your device.
49 | Then build. 50 | 51 | ## Run 52 | 53 | * Start in Visual Studio: set "Debugging -> Working Directory" to $(SolutionDir)working_directory.
54 | * Start in Explorer or the command line: copy the exe file to working_directory, or create a shortcut.
55 | 56 | The program loads ./vg_default.cfg by default. Run with --help or check the cmd files in the working directory for more details. 57 | 58 | ## Data 59 | 60 | RVG files in ./input/rvg are from [MPVG](http://w3.impa.br/~diego/projects/GanEtAl14/).
61 | Works on SVG files with a subset of features (see the paper for details). 62 | 63 | ## Driver Issue 64 | 65 | At the time we release the code, we are using driver 368.81, and everything runs well. 66 | 67 | We found that the behavior of gl_SampleMask in GLSL has changed since NVIDIA driver version 368.22. 68 | Since the old behavior was inconsistent with the OpenGL standard, we assume it was a driver bug, 69 | or a result of incorrect graphics card configuration. 70 | Using drivers earlier than this version may produce incorrect rendering results. 71 | -------------------------------------------------------------------------------- /gpu-scanline/src/modern_gpu/include/moderngpu.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "kernels/reduce.cuh" 38 | #include "kernels/scan.cuh" 39 | #include "kernels/bulkremove.cuh" 40 | #include "kernels/bulkinsert.cuh" 41 | #include "kernels/merge.cuh" 42 | #include "kernels/mergesort.cuh" 43 | #include "kernels/segmentedsort.cuh" 44 | #include "kernels/localitysort.cuh" 45 | #include "kernels/sortedsearch.cuh" 46 | #include "kernels/loadbalance.cuh" 47 | #include "kernels/intervalmove.cuh" 48 | #include "kernels/join.cuh" 49 | #include "kernels/sets.cuh" 50 | #include "kernels/segreducecsr.cuh" 51 | #include "kernels/reducebykey.cuh" 52 | #include "kernels/spmvcsr.cuh" 53 | -------------------------------------------------------------------------------- /gpu-scanline/src/modern_gpu/include/mgpuenums.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | namespace mgpu { 38 | 39 | enum MgpuBounds { 40 | MgpuBoundsLower, 41 | MgpuBoundsUpper 42 | }; 43 | 44 | enum MgpuScanType { 45 | MgpuScanTypeExc, 46 | MgpuScanTypeInc 47 | }; 48 | 49 | enum MgpuSearchType { 50 | MgpuSearchTypeNone, 51 | MgpuSearchTypeIndex, 52 | MgpuSearchTypeMatch, 53 | MgpuSearchTypeIndexMatch 54 | }; 55 | 56 | enum MgpuJoinKind { 57 | MgpuJoinKindInner, 58 | MgpuJoinKindLeft, 59 | MgpuJoinKindRight, 60 | MgpuJoinKindOuter 61 | }; 62 | 63 | enum MgpuSetOp { 64 | MgpuSetOpIntersection, 65 | MgpuSetOpUnion, 66 | MgpuSetOpDiff, 67 | MgpuSetOpSymDiff 68 | }; 69 | 70 | } // namespace mgpu 71 | -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/camera_3d.h: -------------------------------------------------------------------------------- 1 | #ifndef _MOCHIMAZUI_CAMERA_3D_H_ 2 | #define _MOCHIMAZUI_CAMERA_3D_H_ 3 | 4 | #include 5 | #include 6 | 7 | namespace Mochimazui { 8 | 9 | class Camera3D { 10 | 11 | friend class CameraController2D; 12 | 13 | public: 14 | Camera3D() { 15 | } 16 | 17 | public: 18 | 19 | void reset() { 20 | _matrix = glm::mat4x4(); 21 | } 22 | 23 | // 24 | void translate(const glm::vec3 &t) { 25 | translate(t.x, t.y, t.z); 26 | } 27 | 28 | template 29 | void translate(const T &x, const T &y, const T &z) { 30 | _matrix = glm::mat4x4( 31 | 1, 0, 0, 0, 32 | 0, 1, 0, 0, 33 | 0, 0, 1, 0, 34 | x, y, z, 1 35 | ) * _matrix; 36 | } 37 | 38 | // 39 | template 40 | void scale(const T &sx, const T &sy, const glm::vec3 &cp = glm::vec3(0.f, 0.f, 0.f)) { 41 | translate(-cp); 42 | _matrix = glm::mat4x4( 43 | sx, 0, 0, 0, 44 | 0, sy, 0, 0, 45 | 0, 0, 1, 0, 46 | 0, 0, 0, 1 47 | ) * _matrix; 48 | translate(cp); 49 | } 50 | 51 | template 52 | void scale(const T &s, const glm::vec3 &cp = glm::vec3(0.f, 0.f, 0.f)) { 53 | scale(s, s, cp); 54 | } 55 | 56 | void scale(const 
glm::vec3 &s, const glm::vec3 &cp = glm::vec3(0.f, 0.f, 0.f)) { 57 | scale(s.x, s.y, cp); 58 | } 59 | 60 | // 61 | void rotate(const glm::vec3 &c, float a) { 62 | } 63 | 64 | // 65 | glm::mat4x4 matrix(); 66 | 67 | // 68 | //void walk(const float delta, bool fixvr = true); 69 | 70 | void turn_y(const float d, const float y) { 71 | _matrix = 72 | glm::translate(glm::vec3(0, y, 0)) * 73 | glm::rotate(d, glm::vec3(1, 0, 0)) * 74 | glm::translate(glm::vec3(0, -y, 0)) * 75 | _matrix; 76 | } 77 | 78 | 79 | void rotate_cn(const float d, const glm::vec3 ¢er, const glm::vec3 &normal) { 80 | _matrix = 81 | glm::translate(center) * 82 | glm::rotate(d, normal) * 83 | glm::translate(-center) * 84 | _matrix; 85 | } 86 | 87 | //void pan(const float dx, const float dy); 88 | 89 | void rotate(const float dx, const float dy) { 90 | // 91 | glm::vec4 dir = glm::vec4(_eye - _center, 1.f); 92 | dir = glm::rotate(dir, -dx, glm::vec3(0, 1, 0)); 93 | //dir /= dir.w; 94 | //_eye = _center + vec3(dir); 95 | 96 | // 97 | auto cd = glm::cross(_up, glm::vec3(dir)); 98 | dir = glm::rotate(dir, -dy, cd); 99 | dir /= dir.w; 100 | _eye = _center + glm::vec3(dir); 101 | 102 | //_up = glm::cross(vec3(dir), cd); 103 | } 104 | 105 | private: 106 | 107 | glm::vec3 _eye; 108 | glm::vec3 _center; 109 | glm::vec3 _up; 110 | 111 | glm::vec3 _scale; 112 | glm::vec3 _translate; 113 | 114 | glm::mat4x4 _matrix; 115 | }; 116 | 117 | } 118 | 119 | #endif -------------------------------------------------------------------------------- /gpu-scanline/src/modern_gpu/include/util/mgpualloc.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "util.h" 5 | #include 6 | 7 | namespace mgpu { 8 | 9 | class CudaDevice; 10 | 11 | class CudaContext; 12 | typedef intrusive_ptr ContextPtr; 13 | 14 | //////////////////////////////////////////////////////////////////////////////// 15 | // Customizable allocator. 
16 | 17 | // CudaAlloc is the interface class all allocator accesses. Users may derive 18 | // this, implement custom allocators, and set it to the device with 19 | // CudaDevice::SetAllocator. 20 | 21 | class CudaAlloc : public CudaBase { 22 | public: 23 | virtual cudaError_t Malloc(size_t size, void** p) = 0; 24 | virtual bool Free(void* p) = 0; 25 | virtual void Clear() = 0; 26 | 27 | virtual ~CudaAlloc() { } 28 | 29 | CudaDevice& Device() { return _device; } 30 | 31 | protected: 32 | CudaAlloc(CudaDevice& device) : _device(device) { } 33 | CudaDevice& _device; 34 | }; 35 | 36 | // A concrete class allocator that simply calls cudaMalloc and cudaFree. 37 | class CudaAllocSimple : public CudaAlloc { 38 | public: 39 | CudaAllocSimple(CudaDevice& device) : CudaAlloc(device) { } 40 | 41 | virtual cudaError_t Malloc(size_t size, void** p); 42 | virtual bool Free(void* p); 43 | virtual void Clear() { } 44 | virtual ~CudaAllocSimple() { } 45 | }; 46 | 47 | // A concrete class allocator that uses exponentially-spaced buckets and an LRU 48 | // to reuse allocations. This is the default allocator. It is shared between 49 | // all contexts on the device. 
50 | class CudaAllocBuckets : public CudaAlloc { 51 | public: 52 | CudaAllocBuckets(CudaDevice& device); 53 | virtual ~CudaAllocBuckets(); 54 | 55 | virtual cudaError_t Malloc(size_t size, void** p); 56 | virtual bool Free(void* p); 57 | virtual void Clear(); 58 | 59 | size_t Allocated() const { return _allocated; } 60 | size_t Committed() const { return _committed; } 61 | size_t Capacity() const { return _capacity; } 62 | 63 | bool SanityCheck() const; 64 | 65 | void SetCapacity(size_t capacity, size_t maxObjectSize) { 66 | _capacity = capacity; 67 | _maxObjectSize = maxObjectSize; 68 | Clear(); 69 | } 70 | 71 | private: 72 | static const int NumBuckets = 84; 73 | static const size_t BucketSizes[NumBuckets]; 74 | 75 | struct MemNode; 76 | typedef std::list MemList; 77 | typedef std::map AddressMap; 78 | typedef std::multimap PriorityMap; 79 | 80 | struct MemNode { 81 | AddressMap::iterator address; 82 | PriorityMap::iterator priority; 83 | int bucket; 84 | }; 85 | 86 | void Compact(size_t extra); 87 | void FreeNode(MemList::iterator memIt); 88 | int LocateBucket(size_t size) const; 89 | 90 | AddressMap _addressMap; 91 | PriorityMap _priorityMap; 92 | MemList _memLists[NumBuckets + 1]; 93 | 94 | size_t _maxObjectSize, _capacity, _allocated, _committed; 95 | int _counter; 96 | }; 97 | 98 | } // namespace mgpu 99 | -------------------------------------------------------------------------------- /gpu-scanline/src/cuda/cuda_cached_allocator.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _MOCHIMAZUI_THRUST_CACHED_ALLOCATOR_H_ 3 | #define _MOCHIMAZUI_THRUST_CACHED_ALLOCATOR_H_ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | //#include 13 | //#include 14 | //#include 15 | //#include 16 | 17 | namespace Mochimazui { 18 | 19 | class cuda_cached_allocator_bad_alloc : public std::runtime_error { 20 | public: 21 | cuda_cached_allocator_bad_alloc(const char *msg) 
:runtime_error(msg) {} 22 | cuda_cached_allocator_bad_alloc(const std::string &msg) :runtime_error(msg) {} 23 | }; 24 | 25 | // Example by Nathan Bell and Jared Hoberock 26 | // (modified by Mihail Ivakhnenko) 27 | // 28 | // This example demonstrates how to intercept calls to get_temporary_buffer 29 | // and return_temporary_buffer to control how Thrust allocates temporary storage 30 | // during algorithms such as thrust::reduce. The idea will be to create a simple 31 | // cache of allocations to search when temporary storage is requested. If a hit 32 | // is found in the cache, we quickly return the cached allocation instead of 33 | // resorting to the more expensive thrust::cuda::malloc. 34 | // 35 | // Note: this implementation cached_allocator is not thread-safe. If multiple 36 | // (host) threads use the same cached_allocator then they should gain exclusive 37 | // access to the allocator before accessing its methods. 38 | 39 | // cached_allocator: a simple allocator for caching allocation requests 40 | class cuda_cached_allocator { 41 | public: 42 | // just allocate bytes 43 | typedef char value_type; 44 | 45 | cuda_cached_allocator() {} 46 | 47 | ~cuda_cached_allocator() { 48 | // free all allocations when cached_allocator goes out of scope 49 | free_all(); 50 | } 51 | 52 | public: 53 | void reserver(size_t s) { 54 | if (_ptr) { 55 | cudaFree(_ptr); 56 | } 57 | cudaMalloc(&_ptr, s); 58 | _reservedSize = s; 59 | } 60 | 61 | size_t reserved() { 62 | return _reservedSize; 63 | } 64 | 65 | void reset() { 66 | _unallocatedPtr = 0; 67 | } 68 | 69 | void fill_zero() { 70 | cudaMemsetAsync(_ptr, 0, _reservedSize); 71 | } 72 | 73 | size_t allocated() { 74 | return _unallocatedPtr; 75 | } 76 | 77 | char* allocate(std::ptrdiff_t num_bytes) { 78 | size_t newPtr = _unallocatedPtr + num_bytes; 79 | if (newPtr > _reservedSize) { 80 | printf("cuda_cached_allocator: reserved memory exhausted."); 81 | throw std::runtime_error("cuda_cached_allocator: reserved memory 
exhausted."); 82 | } 83 | char *a = _ptr + _unallocatedPtr; 84 | _unallocatedPtr = newPtr; 85 | 86 | // 256 bit align 87 | if (_unallocatedPtr & 0x1F) { 88 | _unallocatedPtr += (32 - _unallocatedPtr & 0x1F); 89 | } 90 | return a; 91 | } 92 | 93 | template 94 | T *allocate(size_t num) { 95 | return (T*)this->allocate(num *sizeof(T)); 96 | } 97 | 98 | template 99 | void allocate(T **ptr, size_t num) { 100 | *ptr = (T*)this->allocate(num *sizeof(T)); 101 | } 102 | 103 | void deallocate(char* ptr, size_t n) {} 104 | 105 | private: 106 | size_t _reservedSize = 0; 107 | size_t _unallocatedPtr = 0; 108 | char *_ptr = nullptr; 109 | 110 | private: 111 | void free_all() { 112 | cudaFree(_ptr); 113 | } 114 | 115 | }; 116 | 117 | extern cuda_cached_allocator g_thrustCachedAllocator; 118 | extern cuda_cached_allocator &g_alloc; 119 | 120 | } 121 | 122 | #endif 123 | -------------------------------------------------------------------------------- /gpu-scanline/src/timer.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _SVDAG_CUDA_HEADER_H_ 3 | #define _SVDAG_CUDA_HEADER_H_ 4 | 5 | #pragma warning( push ) 6 | #pragma warning( disable : 4819 ) 7 | #include 8 | #include 9 | #include 10 | #include 11 | #pragma warning( pop ) 12 | 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | namespace Mochimazui { 23 | class Timer { 24 | 25 | public: 26 | void start() { 27 | _totalTime = std::chrono::system_clock::duration::zero(); 28 | resume(); 29 | } 30 | 31 | void pause() { 32 | auto end = std::chrono::system_clock::now(); 33 | _totalTime += end - _start; 34 | } 35 | 36 | void resume() { 37 | _start = std::chrono::system_clock::now(); 38 | } 39 | 40 | void end() { 41 | pause(); 42 | } 43 | 44 | public: 45 | 46 | void start(const std::string &msg) { 47 | _msg = msg; 48 | printf("%s START.\n", msg.c_str()); 49 | start(); 50 | } 51 | 52 | void pause(const std::string &msg) { 53 | pause(); 
54 | printf("%s PAUSE.\n", msg.c_str()); 55 | } 56 | 57 | void resume(const std::string &msg) { 58 | resume(); 59 | printf("%s RESUME.\n", msg.c_str()); 60 | } 61 | 62 | void end(const std::string& msg) { 63 | end(); 64 | auto omsg = msg == "" ? _msg : msg; 65 | std::cout << omsg << " END" << std::endl; 66 | std::cout << omsg << " duration = " 67 | << std::chrono::duration_cast(_totalTime).count() 68 | << "ms." << std::endl; 69 | } 70 | 71 | public: 72 | std::chrono::system_clock::duration time() { 73 | return _totalTime; 74 | } 75 | 76 | float time_in_ms() { 77 | return std::chrono::duration_cast(_totalTime).count() / 1000.f; 78 | } 79 | 80 | private: 81 | 82 | std::string _msg; 83 | std::chrono::system_clock::time_point _start; 84 | std::chrono::system_clock::duration _totalTime; 85 | }; 86 | 87 | 88 | class CUDATimer { 89 | 90 | public: 91 | CUDATimer() { 92 | cudaEventCreate(&_start); 93 | cudaEventCreate(&_stop); 94 | } 95 | 96 | void start() { 97 | cudaEventRecord(_start); 98 | } 99 | 100 | void start(const std::string &msg) { 101 | _msg = msg; 102 | start(); 103 | } 104 | 105 | float stop() { 106 | cudaEventRecord(_stop); 107 | cudaEventSynchronize(_stop); 108 | _ms = 0; 109 | cudaEventElapsedTime(&_ms, _start, _stop); 110 | return _ms; 111 | } 112 | 113 | float stop(const std::string &msg) { 114 | stop(); 115 | if (msg.length() != 0) { _msg = msg; } 116 | std::cout << _msg << " END." << std::endl; 117 | std::cout << _msg << " duration = " << _ms << " ms." 
<< std::endl; 118 | return _ms; 119 | } 120 | 121 | private: 122 | std::string _msg; 123 | cudaEvent_t _start, _stop; 124 | float _ms; 125 | }; 126 | } 127 | 128 | static Mochimazui::CUDATimer g_timer_zero; 129 | static int g_timer_enabled = 0; 130 | static inline void timer_reset() { 131 | //clock_gettime(CLOCK_MONOTONIC,&g_timer_zero); 132 | g_timer_zero.start(); 133 | g_timer_enabled = 1; 134 | } 135 | static inline void timer_print(const char* msg, const char* file, int line) { 136 | if (!g_timer_enabled) { return; } 137 | //timespec cur_time; 138 | cudaDeviceSynchronize(); 139 | __getLastCudaError(msg, file, line); 140 | //clock_gettime(CLOCK_MONOTONIC,&cur_time); 141 | //double dt=(double)(cur_time.tv_sec-g_timer_zero.tv_sec)+(double)(cur_time.tv_nsec-g_timer_zero.tv_nsec)*1e-9; 142 | float dt = g_timer_zero.stop(); 143 | //g_timer_zero=cur_time; 144 | printf(">>> %s %5.2f ms\n", msg, dt); 145 | g_timer_zero.start(); 146 | } 147 | static inline void timer_done() { 148 | g_timer_enabled = 0; 149 | } 150 | 151 | #endif -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/config.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | 6 | namespace Mochimazui { 7 | 8 | using namespace ConfigPrivate; 9 | 10 | using std::vector; 11 | using std::string; 12 | using std::basic_string; 13 | using std::runtime_error; 14 | 15 | namespace program_options = boost::program_options; 16 | 17 | // -------- -------- -------- -------- -------- -------- -------- -------- 18 | void Config::addValue(const std::string &iv) { 19 | stdext::string v = iv; 20 | auto l = v.split(':'); 21 | if (l.size() != 2 && l.size() != 3) { 22 | throw std::runtime_error("Config::addValue: invalid ConfigValue format " + iv); 23 | } 24 | ConfigValue cv; 25 | cv.setName(l[0]); 26 | cv.setType(l[1]); 27 | if (l.size() == 3) { 28 | cv.setValue(l[2].c_str()); 29 | } 30 | 
_value_map[l[0]] = cv; 31 | } 32 | 33 | void Config::addValue(const std::vector &vv) { 34 | for (const auto &v : vv) { 35 | addValue(v); 36 | } 37 | } 38 | 39 | // -------- -------- -------- -------- -------- -------- -------- -------- 40 | void Config::addOption(const std::string &o) { 41 | 42 | ConfigOption co; 43 | 44 | auto i_space = o.find(' '); 45 | 46 | co.name = o.substr(0, i_space); 47 | 48 | auto i_lb = o.find('{'); 49 | auto i_rb = o.find('}'); 50 | 51 | // help info, ignore. 52 | 53 | stdext::string values = o.substr(i_rb + 2); 54 | auto value_list = values.split('#'); 55 | 56 | if (value_list[0] == "0") { co.type = ConfigOption_Void; } 57 | else if (value_list[0] == "1") { co.type = ConfigOption_Value; } 58 | else if (value_list[0] == "*") { co.type = ConfigOption_Array; } 59 | else { throw std::runtime_error("Config::addOption: invalid option format."); } 60 | 61 | for (int i = 1; i < value_list.size(); ++i) { 62 | auto l = value_list[i].split(':'); 63 | if (l.size() != 1 && l.size() != 2) { 64 | throw std::runtime_error("Config::addOption: invalid option format."); 65 | } 66 | ConfigOptionSetValue sv; 67 | sv.name = l[0]; 68 | if (l.size() == 2) { sv.value = l[1]; } 69 | co.values.push_back(sv); 70 | } 71 | 72 | _option_map["-" + co.name] = co; 73 | } 74 | 75 | void Config::addOption(const std::vector &vo) { 76 | for (const auto &o : vo) { 77 | addOption(o); 78 | } 79 | } 80 | 81 | // -------- -------- -------- -------- -------- -------- -------- -------- 82 | void help() { 83 | } 84 | 85 | // -------- -------- -------- -------- -------- -------- -------- -------- 86 | void Config::load(const std::string &file) { 87 | 88 | vector args; 89 | command_line_file_to_args(file, args); 90 | 91 | vector args_ptr; 92 | for (string &s : args) { args_ptr.push_back(s.data()); } 93 | 94 | parse((int)args_ptr.size(), args_ptr.data()); 95 | } 96 | 97 | void Config::parse(int argc, const char *argv[]) { 98 | 99 | for (int i = 1; i < argc;) { 100 | std::string arg = 
argv[i]; 101 | auto ioption = _option_map.find(arg); 102 | if (ioption == _option_map.end()) { 103 | throw std::runtime_error("Config::parse: unsupported option " + arg); 104 | } 105 | ++i; 106 | 107 | const auto &co = ioption->second; 108 | 109 | std::string value; 110 | if (co.type != ConfigOption_Void) { 111 | if (i >= argc) { throw std::runtime_error(arg + " requires more input."); } 112 | value = argv[i]; 113 | ++i; 114 | } 115 | 116 | for (const auto &v : co.values) { 117 | auto iv = _value_map.find(v.name); 118 | if (iv == _value_map.end()) { 119 | ConfigValue new_value; 120 | new_value.setName(v.name); 121 | new_value.setType("any"); 122 | new_value.setValue(v.value.empty() ? value : v.value); 123 | } 124 | else { 125 | iv->second.setValue(v.value.empty() ? value : v.value); 126 | } 127 | } 128 | 129 | } 130 | 131 | } 132 | 133 | 134 | } 135 | -------------------------------------------------------------------------------- /gpu-scanline/src/modern_gpu/include/kernels_ext/search_ext.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "../kernels/search.cuh" 38 | 39 | #include "../../../cuda/cuda_cached_allocator.h" 40 | 41 | namespace mgpu_ext { 42 | 43 | using Mochimazui::g_alloc; 44 | 45 | using namespace mgpu; 46 | 47 | template 48 | int *BinarySearchPartitions(int count, It1 data_global, int numItems, 49 | int nv, Comp comp) { 50 | 51 | const int NT = 64; 52 | int numBlocks = MGPU_DIV_UP(count, nv); 53 | int numPartitionBlocks = MGPU_DIV_UP(numBlocks + 1, NT); 54 | //MGPU_MEM(int) partitionsDevice = context.Malloc(numBlocks + 1); 55 | int *partitionsDevice = (int*)g_alloc.allocate(sizeof(int) * (numBlocks + 1)); 56 | 57 | KernelBinarySearch 58 | << > >(count, data_global, 59 | numItems, nv, partitionsDevice, numBlocks + 1, comp); 60 | MGPU_SYNC_CHECK("KernelBinarySearch"); 61 | 62 | return partitionsDevice; 63 | } 64 | 65 | template 66 | int *MergePathPartitions_ext(It1 a_global, int aCount, It2 b_global, 67 | int bCount, int nv, int coop, Comp comp) { 68 | 69 | const int NT = 64; 70 | int numPartitions = MGPU_DIV_UP(aCount + bCount, nv); 71 | int numPartitionBlocks = MGPU_DIV_UP(numPartitions + 1, NT); 72 | 73 | //MGPU_MEM(int) partitionsDevice = context.Malloc(numPartitions + 1); 74 | int *partitionsDevice = g_alloc.allocate(numPartitions + 1); 75 | 76 | KernelMergePartition 77 | << > >(a_global, aCount, 78 | b_global, bCount, nv, coop, partitionsDevice, numPartitions + 1, 79 | comp); 80 | MGPU_SYNC_CHECK("KernelMergePartition"); 81 | 82 | return partitionsDevice; 83 | } 84 | 85 | } 86 | 87 | -------------------------------------------------------------------------------- /gpu-scanline/src/rapidxml_utils.hpp: -------------------------------------------------------------------------------- 1 | #ifndef RAPIDXML_UTILS_HPP_INCLUDED 2 | #define RAPIDXML_UTILS_HPP_INCLUDED 3 | 4 | // Copyright (C) 2006, 2009 Marcin Kalicinski 5 | // Version 1.13 6 | // Revision 
$DateTime: 2009/05/13 01:46:17 $ 7 | //! \file rapidxml_utils.hpp This file contains high-level rapidxml utilities that can be useful 8 | //! in certain simple scenarios. They should probably not be used if maximizing performance is the main objective. 9 | 10 | #include "rapidxml.hpp" 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace rapidxml 17 | { 18 | 19 | //! Represents data loaded from a file 20 | template 21 | class file 22 | { 23 | 24 | public: 25 | 26 | //! Loads file into the memory. Data will be automatically destroyed by the destructor. 27 | //! \param filename Filename to load. 28 | file(const char *filename) 29 | { 30 | using namespace std; 31 | 32 | // Open stream 33 | basic_ifstream stream(filename, ios::binary); 34 | if (!stream) 35 | throw runtime_error(string("cannot open file ") + filename); 36 | stream.unsetf(ios::skipws); 37 | 38 | // Determine stream size 39 | stream.seekg(0, ios::end); 40 | size_t size = stream.tellg(); 41 | stream.seekg(0); 42 | 43 | // Load data and add terminating 0 44 | m_data.resize(size + 1); 45 | stream.read(&m_data.front(), static_cast(size)); 46 | m_data[size] = 0; 47 | } 48 | 49 | //! Loads file into the memory. Data will be automatically destroyed by the destructor 50 | //! \param stream Stream to load from 51 | file(std::basic_istream &stream) 52 | { 53 | using namespace std; 54 | 55 | // Load data and add terminating 0 56 | stream.unsetf(ios::skipws); 57 | m_data.assign(istreambuf_iterator(stream), istreambuf_iterator()); 58 | if (stream.fail() || stream.bad()) 59 | throw runtime_error("error reading stream"); 60 | m_data.push_back(0); 61 | } 62 | 63 | //! Gets file data. 64 | //! \return Pointer to data of file. 65 | Ch *data() 66 | { 67 | return &m_data.front(); 68 | } 69 | 70 | //! Gets file data. 71 | //! \return Pointer to data of file. 72 | const Ch *data() const 73 | { 74 | return &m_data.front(); 75 | } 76 | 77 | //! Gets file data size. 78 | //! 
\return Size of file data, in characters. 79 | std::size_t size() const 80 | { 81 | return m_data.size(); 82 | } 83 | 84 | private: 85 | 86 | std::vector m_data; // File data 87 | 88 | }; 89 | 90 | //! Counts children of node. Time complexity is O(n). 91 | //! \return Number of children of node 92 | template 93 | inline std::size_t count_children(xml_node *node) 94 | { 95 | xml_node *child = node->first_node(); 96 | std::size_t count = 0; 97 | while (child) 98 | { 99 | ++count; 100 | child = child->next_sibling(); 101 | } 102 | return count; 103 | } 104 | 105 | //! Counts attributes of node. Time complexity is O(n). 106 | //! \return Number of attributes of node 107 | template 108 | inline std::size_t count_attributes(xml_node *node) 109 | { 110 | xml_attribute *attr = node->first_attribute(); 111 | std::size_t count = 0; 112 | while (attr) 113 | { 114 | ++count; 115 | attr = attr->next_attribute(); 116 | } 117 | return count; 118 | } 119 | 120 | } 121 | 122 | #endif 123 | -------------------------------------------------------------------------------- /gpu-scanline/src/modern_gpu/include/kernels/loadbalance.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// KernelLoadBalance
//
// Per-CTA kernel: calls CTALoadBalance for this block's slice of the merge
// path (located through mp_global), which leaves its results in
// indices_shared, then copies those results to indices_global.
//
// NOTE(review): in this copy the template parameter list after `template` and
// any explicit template arguments on CTALoadBalance / DeviceSharedToGlobal are
// missing (they look stripped by text extraction). The body needs at least
// `Tuning` (used by MGPU_LAUNCH_PARAMS) and `InputIt` -- confirm against the
// upstream moderngpu sources before building from this text.

template
MGPU_LAUNCH_BOUNDS void KernelLoadBalance(int aCount, InputIt b_global,
	int bCount, const int* mp_global, int* indices_global) {

	// NT / VT come from the tuning parameters selected for the target
	// architecture.
	typedef MGPU_LAUNCH_PARAMS Params;
	const int NT = Params::NT;
	const int VT = Params::VT;
	// Block-shared staging buffer of NT * (VT + 1) ints.
	__shared__ int indices_shared[NT * (VT + 1)];

	int tid = threadIdx.x;
	int block = blockIdx.x;
	// range.x / range.y bound this block's portion of the output; the meaning
	// of the trailing `false` is defined by CTALoadBalance (not visible here).
	int4 range = CTALoadBalance(aCount, b_global, bCount, block, tid,
		mp_global, indices_shared, false);
	// From here on aCount is the number of outputs this block writes.
	aCount = range.y - range.x;

	// Copy this block's results from shared memory to their place in the
	// global output, starting at offset range.x.
	DeviceSharedToGlobal(aCount, indices_shared, tid,
		indices_global + range.x, false);
}

////////////////////////////////////////////////////////////////////////////////
// LoadBalanceSearch
//
// Host entry point: computes merge-path partitions for the (counting
// iterator, b_global) pair, then launches KernelLoadBalance with one block
// per NV items of aCount + bCount.
//
// NOTE(review): the template parameter list, the template arguments of
// LaunchBoxVT / MergePathPartitions / counting_iterator / less, and the kernel
// launch configuration between the chevrons are missing in this copy --
// restore them from the upstream moderngpu sources.
// NOTE(review): when aCount + bCount == 0 the computed numBlocks is 0;
// confirm callers never pass an empty problem or that the launch tolerates it.

template
MGPU_HOST void LoadBalanceSearch(int aCount, InputIt b_global, int bCount,
	int* indices_global, CudaContext& context) {

	const int NT = 128;
	const int VT = 7;
	typedef LaunchBoxVT Tuning;
	// launch.x is NT and launch.y is VT for the context's PTX version (see
	// LaunchBoxRuntime::GetLaunchParams); NV is the number of items per block.
	int2 launch = Tuning::GetLaunchParams(context);
	const int NV = launch.x * launch.y;

	// One partition point per NV-sized tile of the merge path.
	MGPU_MEM(int) partitionsDevice = MergePathPartitions(
		mgpu::counting_iterator(0), aCount, b_global, bCount, NV, 0,
		mgpu::less(), context);

	// Number of blocks computed from the aCount + bCount items and NV via MGPU_DIV_UP.
	int numBlocks = MGPU_DIV_UP(aCount + bCount, NV);
	KernelLoadBalance<<>>(
		aCount, b_global, bCount, partitionsDevice->get(), indices_global);
	MGPU_SYNC_CHECK("KernelLoadBalance");
}

} // namespace mgpu
// -------- -------- -------- -------- -------- -------- -------- --------
namespace Mochimazui {

namespace PRIVATE {
	// Storage for every parsed option; read through get_config() in vg_config.h.
	boost::program_options::variables_map g_config_variables;
}

// Declares all supported options and fills PRIVATE::g_config_variables.
//
// With no command-line arguments (argc == 1) the options are read from
// "vg_default.cfg" through parse_command_line_file; otherwise they are parsed
// from argv and the config file is not consulted.
//
// Returns -1 after printing the usage text when "help" was given, 0 otherwise.
//
// NOTE(review): the value types inside po::value() appear to have been lost in
// this copy of the file (text between angle brackets is missing); restore them
// from the original source before compiling.
int init_config(int argc, char *argv[]) {

	namespace po = boost::program_options;

	using PRIVATE::g_config_variables;
	using Mochimazui::parse_command_line_file;

	po::options_description general_options("General options");
	general_options.add_options()
		// "help" carries no typed value; only its presence is checked below.
		("help", "print help")

		("verbose", po::bool_switch(), "verbose output to console")
		("gl-debug", po::bool_switch(), "enable GL_DEBUG")
		("draw-curve", po::bool_switch(), "draw curve")
		("show-fps", po::bool_switch(), "show fps")

		("benchmark", po::bool_switch(), "benchmark")
		("step-timing", po::bool_switch(), "step timing")
		("attach-timing-to", po::value()->default_value(""), "")

		("merge-path", po::bool_switch(), "")
		// NOTE(review): this description looks copy-pasted from "help".
		("minimal-ui", po::bool_switch(), "produce help message")

		("v-flip", po::bool_switch(), "")

		("count-pixel", po::bool_switch(), "")
		("attach-pixel-count-to", po::value()->default_value(""), "")

		// NOTE(review): "chrod" is presumably a typo for "chord".
		("animation", po::bool_switch(), "run chrod animation")
		;

	po::options_description io_options("Input/output options");
	io_options.add_options()
		("file-index", po::value()->multitoken(), "file index")

		("input-name", po::value()->default_value(""), "")
		// No default value: the key is absent unless the user supplies it.
		("input-file,i", po::value(), "input file")

		("input-width", po::value()->default_value(0), "")
		("input-height", po::value()->default_value(0), "")

		("window-width", po::value()->default_value(1200), "")
		("window-height", po::value()->default_value(1024), "")

		("fit-to-vg", po::bool_switch(), "")
		("fit-to-window", po::bool_switch(), "")

		("save-output-file", po::bool_switch(), "")
		// NOTE(review): description says "input file" for the output-file option.
		("output-file,o", po::value()->default_value(""), "input file")

		("output-width", po::value()->default_value(0), "")
		("output-height", po::value()->default_value(0), "")
		("fix-output-size", po::bool_switch(), "")
		;

	po::options_description rasterizer_options("Rasterizer options");
	rasterizer_options.add_options()
		("c-m-cs", po::bool_switch(), "cut, mask table, comb-like scanline")

		("lrgb", po::bool_switch(), "")
		("srgb", po::bool_switch(), "")

		("samples", po::value()->default_value(32), "")
		("ms-output", po::bool_switch(), "")

		("reserve-ink", po::value()->default_value(0), "reserve ink")
		("tiger-clip", po::bool_switch(), "")

		("break-before-gl", po::bool_switch(), "break before gl step")

		("a128", po::bool_switch(), "align alpha value to 1/128")
		;

	// Single description holding all three groups; used for parsing and for
	// the usage text.
	po::options_description all_options;
	all_options.add(general_options).add(io_options).add(rasterizer_options);

	if (argc == 1) {
		// No arguments: take the options from the default config file.
		po::store(parse_command_line_file("vg_default.cfg", all_options), g_config_variables);
	}
	else {
		po::store(po::parse_command_line(argc, argv, all_options), g_config_variables);
	}

	po::notify(g_config_variables);

	if (g_config_variables.count("help")) {
		printf("\nLoads \"vg_default.cfg\" by default.\n");
		printf("Using command line argument will skip config file loading.\n");
		std::cout << all_options << "\n";
		return -1;
	}

	return 0;

}

}
-------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
namespace mgpu {

// MGPU_SM_TAG names the parameter set (Sm20 / Sm30 / Sm35) that matches the
// architecture the current compilation pass targets.
#if __CUDA_ARCH__ >= 350
	#define MGPU_SM_TAG Sm35
#elif __CUDA_ARCH__ >= 300
	#define MGPU_SM_TAG Sm30
#else
	// Everything below sm_30 uses the Sm20 parameters.
	#define MGPU_SM_TAG Sm20
#endif

// Inside a kernel templated on Tuning: the parameter struct for this
// architecture, and the matching __global__ / __launch_bounds__ qualifiers.
#define MGPU_LAUNCH_PARAMS typename Tuning::MGPU_SM_TAG
#define MGPU_LAUNCH_BOUNDS __global__ \
	__launch_bounds__(Tuning::MGPU_SM_TAG::NT, Tuning::MGPU_SM_TAG::OCC)

// Host-side lookup of (NT, VT) for a given sm version. Derived supplies the
// nested Sm20 / Sm30 / Sm35 parameter types.
template<typename Derived>
struct LaunchBoxRuntime {
	// Uses the PTX version reported by the context.
	static int2 GetLaunchParams(CudaContext& context) {
		return GetLaunchParams(context.PTXVersion());
	}

	// Returns (NT, VT) of the parameter set for sm: Sm20 below 300, Sm30
	// below 350, Sm35 otherwise.
	static int2 GetLaunchParams(int sm) {
		if(sm < 300)
			return make_int2(Derived::Sm20::NT, Derived::Sm20::VT);
		if(sm < 350)
			return make_int2(Derived::Sm30::NT, Derived::Sm30::VT);
		return make_int2(Derived::Sm35::NT, Derived::Sm35::VT);
	}
};

// LaunchBox accepting arbitrary parameter types. A newer architecture falls
// back to the previous one's parameters when none are given.
template<
	typename Sm20_,
	typename Sm30_ = Sm20_,
	typename Sm35_ = Sm30_>
struct LaunchBox : LaunchBoxRuntime<LaunchBox<Sm20_, Sm30_, Sm35_> > {
	typedef Sm20_ Sm20;
	typedef Sm30_ Sm30;
	typedef Sm35_ Sm35;
};

// Parameter struct holding the three launch constants NT, VT and OCC.
template<int NT_, int VT_, int OCC_>
struct LaunchParamsVT {
	enum { NT = NT_, VT = VT_, OCC = OCC_ };
};

// LaunchBox built directly from (NT, VT, OCC) triples. OCC defaults to 0 and
// every later architecture defaults to the preceding one's values.
template<
	int NT_SM20, int VT_SM20, int OCC_SM20 = 0,
	int NT_SM30 = NT_SM20, int VT_SM30 = VT_SM20, int OCC_SM30 = OCC_SM20,
	int NT_SM35 = NT_SM30, int VT_SM35 = VT_SM30, int OCC_SM35 = OCC_SM30>
struct LaunchBoxVT : LaunchBox<
	LaunchParamsVT<NT_SM20, VT_SM20, OCC_SM20>,
	LaunchParamsVT<NT_SM30, VT_SM30, OCC_SM30>,
	LaunchParamsVT<NT_SM35, VT_SM35, OCC_SM35> > { };

} // namespace mgpu
// -------- -------- -------- -------- -------- -------- -------- --------
// Unpacks a 32-bit colour (first channel in the low byte, fourth in the high
// byte) into a vec4 with every channel divided by 255.
vec4 u8rgba2frgba(int c) {
	ivec4 channels = (ivec4(c) >> ivec4(0, 8, 16, 24)) & 0xFF;
	return vec4(channels) / 255.0;
}

// -------- -------- -------- -------- -------- -------- -------- --------
// sRGB -> linear transfer function for a single channel.
float srgb_to_lrgb_f(float f) {
	return (f <= 0.04045f)
		? f / 12.92f
		: pow((f + 0.055f) / (1.f + 0.055f), 2.4f);
}

// Applies the transfer function to r, g and b; alpha passes through unchanged.
vec4 srgb_to_lrgb(vec4 c) {
	vec3 linear_rgb = vec3(
		srgb_to_lrgb_f(c.r), srgb_to_lrgb_f(c.g), srgb_to_lrgb_f(c.b));
	return vec4(linear_rgb, c.a);
}

// -------- -------- -------- --------
// Reciprocal of a for a > 1e-6, otherwise 0.
float safeRcpP(float a) {
	if (a > 1e-6) { return 1.0 / a; }
	return 0.0;
}

// -------- -------- -------- --------
// Maps a window-space point back to object space with the inv_proj_* uniforms.
vec3 screen_to_object(vec2 p) {
	vec3 rd = inv_proj_rx*p.x + inv_proj_ry*p.y + inv_proj_rw;
	return inv_proj_rp + (inv_proj_a / rd.z)*rd;
}

// Applies the 2x3 gradient transform stored in word0 / word1 to an
// object-space point.
vec2 object_to_gradient(vec3 obj, vec4 word0, vec4 word1) {
	return obj.x*word0.xw +
		obj.y*vec2(word0.y, word1.x) +
		vec2(word0.z, word1.y);
}

// -------- -------- -------- --------
// Writes the colour-related outputs for one vertex.
// colori in 1 .. 0x01000000 selects gradient record (colori - 1) in tex_table;
// any other value is treated as a packed solid colour.
void calc_color(int colori, vec2 vertex) {

	bool is_gradient = uint(colori - 1) < uint(0x01000000);

	if (!is_gradient) {
		// Solid colour; z == 0 is the value written for non-gradient fills.
		vec4 solid = u8rgba2frgba(colori);
		gradient_ramp_coord.z = 0.0;
		fragment_color = enable_srgb_correction ? srgb_to_lrgb(solid) : solid;
		return;
	}

	// 1. fetch gradient transform & focal point (three texels per record).
	int table_base = (colori - 1) * 3;
	vec4 word0 = texelFetch(tex_table, table_base);
	vec4 word1 = texelFetch(tex_table, table_base + 1);
	gradient_focal_point = texelFetch(tex_table, table_base + 2).xyz;

	// 2. transform back to object space, at +0.5 and +1.5 in y.
	vec3 obj_space_0 = screen_to_object(vertex + vec2(0.0, 0.5));
	vec3 obj_space_1 = screen_to_object(vertex + vec2(0.0, 1.5));

	// 3. transform to gradient space.
	gradient_coord_0 = vec3(object_to_gradient(obj_space_0, word0, word1), 1.0);
	gradient_coord_1 = vec3(object_to_gradient(obj_space_1, word0, word1), 1.0);

	// 4. ramp: word1.z carries a packed integer, low 10 bits = column,
	//    remaining bits = row.
	int packed_ramp = floatBitsToInt(word1.z);
	float ramp_u = (float(packed_ramp & 1023) + 0.5)*(1.0 / 1024.0);
	float ramp_v = (float(packed_ramp >> 10) + 0.5)*pid2depth_irampheight.z;
	gradient_ramp_coord = vec3(ramp_u, ramp_v, word1.w);
}

// -------- -------- -------- --------
// Every pair of consecutive vertex IDs shares one tb_index record; the odd
// vertex is displaced in x by the record's y component.
void main() {

	int record = gl_VertexID >> 1;
	int end_flag = gl_VertexID & 1;

	ivec4 draw = texelFetch(tb_index, record);

	// draw.x packs the position: x in the low 16 bits, y above them.
	path_frag_pos = ivec2(draw.x & 0xFFFF, draw.x >> 16);

	ivec2 pixel = path_frag_pos + ivec2(end_flag * draw.y, 0);
	vec2 pos = vec2(pixel);

	calc_color(draw.z, pos);

	pos.y += 1.0;

	// Pixel coordinates -> normalized device coordinates.
	pos = pos / vec2(vp_size) * 2.0 - 1.0;

	gl_Position = vec4(pos, 0.0, 1.0);

	// draw.w == 0 means all mask bits set; otherwise draw.w - 1 indexes
	// tb_stencil_mask.
	if (draw.w == 0) {
		pixel_mask = 0xFFFFFFFF;
	}
	else {
		pixel_mask = texelFetch(tb_stencil_mask, draw.w - 1).r;
	}
}
// Applies the sRGB -> linear transfer function to r, g and b; alpha passes
// through unchanged.
vec4 srgb_to_lrgb(vec4 c) {
	vec3 linear_rgb = vec3(
		srgb_to_lrgb_f(c.r), srgb_to_lrgb_f(c.g), srgb_to_lrgb_f(c.b));
	return vec4(linear_rgb, c.a);
}

// -------- -------- -------- --------
// Reciprocal of a for a > 1e-6, otherwise 0.
float safeRcpP(float a) {
	if (a > 1e-6) { return 1.0 / a; }
	return 0.0;
}

// -------- -------- -------- --------
// Maps a window-space point back to object space with the inv_proj_* uniforms.
vec3 screen_to_object(vec2 p) {
	vec3 rd = inv_proj_rx*p.x + inv_proj_ry*p.y + inv_proj_rw;
	return inv_proj_rp + (inv_proj_a / rd.z)*rd;
}

// Applies the 2x3 gradient transform stored in word0 / word1 to an
// object-space point.
vec2 object_to_gradient(vec3 obj, vec4 word0, vec4 word1) {
	return obj.x*word0.xw +
		obj.y*vec2(word0.y, word1.x) +
		vec2(word0.z, word1.y);
}

// -------- -------- -------- --------
// Writes the colour-related outputs for one vertex.
// colori in 1 .. 0x01000000 selects gradient record (colori - 1) in tex_table;
// any other value is treated as a packed solid colour.
void calc_color(int colori, vec2 vertex) {

	bool is_gradient = uint(colori - 1) < uint(0x01000000);

	if (!is_gradient) {
		// Solid colour; z == 0 is the value written for non-gradient fills.
		vec4 solid = u8rgba2frgba(colori);
		gradient_ramp_coord.z = 0.0;
		fragment_color = enable_srgb_correction ? srgb_to_lrgb(solid) : solid;
		return;
	}

	// 1. fetch gradient transform & focal point (three texels per record).
	int table_base = (colori - 1) * 3;
	vec4 word0 = texelFetch(tex_table, table_base);
	vec4 word1 = texelFetch(tex_table, table_base + 1);
	gradient_focal_point = texelFetch(tex_table, table_base + 2).xyz;

	// 2. transform back to object space; this variant samples one pixel to
	//    the right, at +0.5 and +1.5 in y.
	vec3 obj_space_0 = screen_to_object(vertex + vec2(1.0, 0.5));
	vec3 obj_space_1 = screen_to_object(vertex + vec2(1.0, 1.5));

	// 3. transform to gradient space.
	gradient_coord_0 = vec3(object_to_gradient(obj_space_0, word0, word1), 1.0);
	gradient_coord_1 = vec3(object_to_gradient(obj_space_1, word0, word1), 1.0);

	// 4. ramp: word1.z carries a packed integer, low 10 bits = column,
	//    remaining bits = row.
	int packed_ramp = floatBitsToInt(word1.z);
	float ramp_u = (float(packed_ramp & 1023) + 0.5)*(1.0 / 1024.0);
	float ramp_v = (float(packed_ramp >> 10) + 0.5)*pid2depth_irampheight.z;
	gradient_ramp_coord = vec3(ramp_u, ramp_v, word1.w);
}

// -------- -------- -------- --------
// Every pair of consecutive vertex IDs shares one tb_index record; the odd
// vertex is displaced in x by the record's y component.
void main() {

	int record = gl_VertexID >> 1;
	int end_flag = gl_VertexID & 1;

	ivec4 draw = texelFetch(tb_index, record);

	// draw.x packs the position: x in the low 16 bits, y above them.
	path_frag_pos = ivec2(draw.x & 0xFFFF, draw.x >> 16);

	ivec2 pixel = path_frag_pos + ivec2(end_flag * draw.y, 0);
	vec2 pos = vec2(pixel);

	calc_color(draw.z, pos);

	pos.y += 1.0;

	// Pixel coordinates -> normalized device coordinates.
	pos = pos / vec2(vp_size) * 2.0 - 1.0;

	gl_Position = vec4(pos, 0.0, 1.0);

	// draw.w == 0 means all mask bits set; otherwise draw.w - 1 indexes
	// tb_stencil_mask.
	if (draw.w == 0) {
		pixel_mask = 0xFFFFFFFF;
	}
	else {
		pixel_mask = texelFetch(tb_stencil_mask, draw.w - 1).r;
	}
}
// -------- -------- -------- -------- -------- -------- -------- --------
// Unpacks a 32-bit colour (first channel in the low byte, fourth in the high
// byte) into a vec4 with every channel divided by 255.
vec4 u8rgba2frgba(int c) {
	ivec4 channels = (ivec4(c) >> ivec4(0, 8, 16, 24)) & 0xFF;
	return vec4(channels) / 255.0;
}

// -------- -------- -------- -------- -------- -------- -------- --------
// sRGB -> linear transfer function for a single channel.
float srgb_to_lrgb_f(float f) {
	return (f <= 0.04045f)
		? f / 12.92f
		: pow((f + 0.055f) / (1.f + 0.055f), 2.4f);
}

// Applies the transfer function to r, g and b; alpha passes through unchanged.
vec4 srgb_to_lrgb(vec4 c) {
	vec3 linear_rgb = vec3(
		srgb_to_lrgb_f(c.r), srgb_to_lrgb_f(c.g), srgb_to_lrgb_f(c.b));
	return vec4(linear_rgb, c.a);
}

// -------- -------- -------- --------
// Reciprocal of a for a > 1e-6, otherwise 0.
float safeRcpP(float a) {
	if (a > 1e-6) { return 1.0 / a; }
	return 0.0;
}

// -------- -------- -------- --------
// Maps a window-space point back to object space with the inv_proj_* uniforms.
vec3 screen_to_object(vec2 p) {
	vec3 rd = inv_proj_rx*p.x + inv_proj_ry*p.y + inv_proj_rw;
	return inv_proj_rp + (inv_proj_a / rd.z)*rd;
}

// Applies the 2x3 gradient transform stored in word0 / word1 to an
// object-space point.
vec2 object_to_gradient(vec3 obj, vec4 word0, vec4 word1) {
	return obj.x*word0.xw +
		obj.y*vec2(word0.y, word1.x) +
		vec2(word0.z, word1.y);
}

// -------- -------- -------- --------
// Writes the colour-related outputs for one vertex.
// colori in 1 .. 0x01000000 selects gradient record (colori - 1) in tex_table;
// any other value is treated as a packed solid colour.
void calc_color(int colori, vec2 vertex) {

	bool is_gradient = uint(colori - 1) < uint(0x01000000);

	if (!is_gradient) {
		// Solid colour; z == 0 is the value written for non-gradient fills.
		vec4 solid = u8rgba2frgba(colori);
		gradient_ramp_coord.z = 0.0;
		fragment_color = enable_srgb_correction ? srgb_to_lrgb(solid) : solid;
		return;
	}

	// 1. fetch gradient transform & focal point (three texels per record).
	int table_base = (colori - 1) * 3;
	vec4 word0 = texelFetch(tex_table, table_base);
	vec4 word1 = texelFetch(tex_table, table_base + 1);
	gradient_focal_point = texelFetch(tex_table, table_base + 2).xyz;

	// 2. transform back to object space; this variant samples one pixel to
	//    the right, at +0.5 and +1.5 in y.
	vec3 obj_space_0 = screen_to_object(vertex + vec2(1.0, 0.5));
	vec3 obj_space_1 = screen_to_object(vertex + vec2(1.0, 1.5));

	// 3. transform to gradient space.
	gradient_coord_0 = vec3(object_to_gradient(obj_space_0, word0, word1), 1.0);
	gradient_coord_1 = vec3(object_to_gradient(obj_space_1, word0, word1), 1.0);

	// 4. ramp: word1.z carries a packed integer, low 10 bits = column,
	//    remaining bits = row.
	int packed_ramp = floatBitsToInt(word1.z);
	float ramp_u = (float(packed_ramp & 1023) + 0.5)*(1.0 / 1024.0);
	float ramp_v = (float(packed_ramp >> 10) + 0.5)*pid2depth_irampheight.z;
	gradient_ramp_coord = vec3(ramp_u, ramp_v, word1.w);
}
129 | ivec4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF) 130 | : texelFetch(tb_stencil_mask, draw.w - 1); 131 | } 132 | -------------------------------------------------------------------------------- /gpu-scanline/src/vg_config.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _MOCHIMAZUI_VG_CONFIG_H_ 3 | #define _MOCHIMAZUI_VG_CONFIG_H_ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #include 14 | 15 | #include 16 | 17 | #include "rasterizer/shared/ras_pipeline_mode.h" 18 | 19 | namespace Mochimazui { 20 | 21 | namespace PRIVATE { 22 | extern boost::program_options::variables_map g_config_variables; 23 | } 24 | 25 | int init_config(int argc, char *argv[]); 26 | 27 | template 28 | T get_config(const std::string &key) { 29 | using PRIVATE::g_config_variables; 30 | if (g_config_variables.count(key)) { 31 | return g_config_variables[key].as(); 32 | } 33 | throw std::runtime_error("Mochimazui::get_config: \"" + key + "\" not set"); 34 | } 35 | 36 | // -------- -------- -------- -------- -------- -------- -------- -------- 37 | 38 | namespace VGConfig { 39 | 40 | // general 41 | 42 | inline bool Help() { return get_config("help"); } 43 | 44 | inline bool Verbose() { return get_config("verbose"); } 45 | inline bool GLDebug() { return get_config("gl-debug"); } 46 | inline bool DrawCurve() { return get_config("draw-curve"); } 47 | inline bool ShowFPS() { return get_config("show-fps"); } 48 | 49 | inline bool Benchmark() { return get_config("benchmark"); } 50 | inline bool StepTiming() { return get_config("step-timing"); } 51 | inline const std::string AttachTimingToFile() { 52 | return get_config("attach-timing-to"); 53 | } 54 | 55 | inline bool MergeAdjacentPath() { return get_config("merge-path"); } 56 | //inline bool MinimalUI() { return get_config("minimal-ui"); } 57 | inline bool MinimalUI() { return true; } 58 | 59 | inline bool OutputVerticalFlip() { return 
get_config ("v-flip"); } 60 | 61 | inline bool Animation() { return get_config("animation"); } 62 | 63 | // input / output 64 | 65 | inline std::string Name() { return get_config("input-name"); } 66 | inline std::string InputName() { return get_config("input-name"); } 67 | inline std::string InputFile() { return get_config("input-file"); } 68 | inline int InputWidth() { return get_config("input-width"); } 69 | inline int InputHeight() { return get_config("input-height"); }; 70 | 71 | inline int WindowWidth() { return get_config("window-width"); } 72 | inline int WindowHeight() { return get_config("window-height"); } 73 | inline glm::ivec2 WindowSize() { 74 | return glm::ivec2(WindowWidth(), WindowHeight()); 75 | } 76 | 77 | inline bool FitVGToWindowSize() { return get_config("fit-to-window"); } 78 | inline bool FitWindowToVGSize() { return get_config("fit-to-vg"); } 79 | 80 | inline bool SaveOutputFile() { return get_config("save-output-file"); } 81 | inline std::string OutputFile() { return get_config("output-file"); } 82 | 83 | inline int OutputWidth() { return get_config("output-width"); } 84 | inline int OutputHeight() { return get_config("output-height"); } 85 | inline glm::ivec2 OutputSize() { 86 | return glm::ivec2(OutputWidth(), OutputHeight()); 87 | } 88 | inline bool FixOutputSize() { return get_config("fix-output-size"); } 89 | 90 | // rasterizer config 91 | 92 | inline RasterizerPipelineMode PipelineMode() { 93 | if (get_config("c-m-cs")) { 94 | return PM_Cut_Mask_Comb_Scanline; 95 | } 96 | else { 97 | return PM_Cut_Mask_Comb_Scanline; 98 | } 99 | } 100 | 101 | inline bool linearRGB() { return get_config("lrgb"); } 102 | inline bool sRGB() { return get_config("srgb"); } 103 | 104 | inline int Samples() { return get_config("samples"); } 105 | inline bool MultisampleOutput() { return get_config("ms-output"); } 106 | 107 | inline bool UseMaskTable() { return true; } 108 | 109 | inline int ReserveInk() { return get_config("reserve-ink"); } 110 | inline bool 
TigerClip() { return get_config("tiger-clip"); } 111 | 112 | inline bool BreakBeforeGL() { return get_config("break-before-gl"); } 113 | 114 | inline bool A128() { return get_config("a128"); } 115 | 116 | inline bool CountPixel() { return get_config("count-pixel"); } 117 | inline std::string AttachPixelCountToFile() { return get_config("attach-pixel-count-to"); } 118 | 119 | } 120 | 121 | } 122 | 123 | #endif 124 | -------------------------------------------------------------------------------- /working_directory/shader/R_cut_A_stencil/ms_output_32.vert.glsl: -------------------------------------------------------------------------------- 1 | 2 | #version 450 3 | 4 | // -------- -------- -------- -------- -------- -------- -------- -------- 5 | layout(binding = 0) uniform isamplerBuffer tb_index; 6 | layout(binding = 1) uniform isamplerBuffer tb_span; 7 | layout(binding = 2) uniform isamplerBuffer tb_path_fragment; 8 | layout(binding = 3) uniform isamplerBuffer tb_stencil_mask; 9 | 10 | layout(binding = 4) uniform samplerBuffer tex_table; 11 | layout(binding = 5) uniform sampler2D tex_ramp; 12 | 13 | // -------- -------- -------- -------- -------- -------- -------- -------- 14 | uniform vec3 pid2depth_irampheight; 15 | uniform vec3 inv_proj_rx; 16 | uniform vec3 inv_proj_ry; 17 | uniform vec3 inv_proj_rw; 18 | uniform vec3 inv_proj_rp; 19 | uniform float inv_proj_a; 20 | 21 | uniform bool enable_srgb_correction; 22 | 23 | uniform ivec2 vp_size; 24 | 25 | // -------- -------- -------- -------- -------- -------- -------- -------- 26 | //flat out int fragment_type; 27 | flat out vec4 fragment_color; 28 | 29 | flat out ivec2 path_frag_pos; 30 | 31 | flat out ivec4 pixel_mask; 32 | 33 | // -------- -------- -------- -------- -------- -------- -------- -------- 34 | out vec3 gradient_coord_0; 35 | out vec3 gradient_coord_1; 36 | flat out vec3 gradient_ramp_coord; 37 | flat out vec3 gradient_focal_point; 38 | 39 | // -------- -------- -------- -------- -------- -------- 
// -------- -------- -------- -------- -------- -------- -------- --------
// Unpacks a 32-bit colour (first channel in the low byte, fourth in the high
// byte) into a vec4 with every channel divided by 255.
vec4 u8rgba2frgba(int c) {
	ivec4 channels = (ivec4(c) >> ivec4(0, 8, 16, 24)) & 0xFF;
	return vec4(channels) / 255.0;
}

// -------- -------- -------- -------- -------- -------- -------- --------
// sRGB -> linear transfer function for a single channel.
float srgb_to_lrgb_f(float f) {
	return (f <= 0.04045f)
		? f / 12.92f
		: pow((f + 0.055f) / (1.f + 0.055f), 2.4f);
}

// Applies the transfer function to r, g and b; alpha passes through unchanged.
vec4 srgb_to_lrgb(vec4 c) {
	vec3 linear_rgb = vec3(
		srgb_to_lrgb_f(c.r), srgb_to_lrgb_f(c.g), srgb_to_lrgb_f(c.b));
	return vec4(linear_rgb, c.a);
}

// -------- -------- -------- --------
// Reciprocal of a for a > 1e-6, otherwise 0.
float safeRcpP(float a) {
	if (a > 1e-6) { return 1.0 / a; }
	return 0.0;
}

// -------- -------- -------- --------
// Maps a window-space point back to object space with the inv_proj_* uniforms.
vec3 screen_to_object(vec2 p) {
	vec3 rd = inv_proj_rx*p.x + inv_proj_ry*p.y + inv_proj_rw;
	return inv_proj_rp + (inv_proj_a / rd.z)*rd;
}

// Applies the 2x3 gradient transform stored in word0 / word1 to an
// object-space point.
vec2 object_to_gradient(vec3 obj, vec4 word0, vec4 word1) {
	return obj.x*word0.xw +
		obj.y*vec2(word0.y, word1.x) +
		vec2(word0.z, word1.y);
}

// -------- -------- -------- --------
// Writes the colour-related outputs for one vertex.
// colori in 1 .. 0x01000000 selects gradient record (colori - 1) in tex_table;
// any other value is treated as a packed solid colour.
void calc_color(int colori, vec2 vertex) {

	bool is_gradient = uint(colori - 1) < uint(0x01000000);

	if (!is_gradient) {
		// Solid colour; z == 0 is the value written for non-gradient fills.
		vec4 solid = u8rgba2frgba(colori);
		gradient_ramp_coord.z = 0.0;
		fragment_color = enable_srgb_correction ? srgb_to_lrgb(solid) : solid;
		return;
	}

	// 1. fetch gradient transform & focal point (three texels per record).
	int table_base = (colori - 1) * 3;
	vec4 word0 = texelFetch(tex_table, table_base);
	vec4 word1 = texelFetch(tex_table, table_base + 1);
	gradient_focal_point = texelFetch(tex_table, table_base + 2).xyz;

	// 2. transform back to object space, at +0.5 and +1.5 in y.
	vec3 obj_space_0 = screen_to_object(vertex + vec2(0.0, 0.5));
	vec3 obj_space_1 = screen_to_object(vertex + vec2(0.0, 1.5));

	// 3. transform to gradient space.
	gradient_coord_0 = vec3(object_to_gradient(obj_space_0, word0, word1), 1.0);
	gradient_coord_1 = vec3(object_to_gradient(obj_space_1, word0, word1), 1.0);

	// 4. ramp: word1.z carries a packed integer, low 10 bits = column,
	//    remaining bits = row.
	int packed_ramp = floatBitsToInt(word1.z);
	float ramp_u = (float(packed_ramp & 1023) + 0.5)*(1.0 / 1024.0);
	float ramp_v = (float(packed_ramp >> 10) + 0.5)*pid2depth_irampheight.z;
	gradient_ramp_coord = vec3(ramp_u, ramp_v, word1.w);
}

// -------- -------- -------- --------
// Every pair of consecutive vertex IDs shares one tb_index record; the odd
// vertex is displaced in x by the record's y component.
void main() {

	int record = gl_VertexID >> 1;
	int end_flag = gl_VertexID & 1;

	ivec4 draw = texelFetch(tb_index, record);

	// draw.x packs the position: x in the low 16 bits, y above them.
	path_frag_pos = ivec2(draw.x & 0xFFFF, draw.x >> 16);

	ivec2 pixel = path_frag_pos + ivec2(end_flag * draw.y, 0);
	vec2 pos = vec2(pixel);

	calc_color(draw.z, pos);

	pos.y += 1.0;

	// Pixel coordinates -> normalized device coordinates.
	pos = pos / vec2(vp_size) * 2.0 - 1.0;

	gl_Position = vec4(pos, 0.0, 1.0);

	// draw.w == 0 means all mask bits set in all four words; otherwise
	// draw.w - 1 indexes tb_stencil_mask.
	if (draw.w == 0) {
		pixel_mask = ivec4(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
	}
	else {
		pixel_mask = texelFetch(tb_stencil_mask, draw.w - 1);
	}
}
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "static.h" 38 | 39 | namespace mgpu { 40 | 41 | extern int Rand(int min, int max); 42 | extern int64 Rand(int64 min, int64 max); 43 | extern uint Rand(uint min, uint max); 44 | extern uint64 Rand(uint64 min, uint64 max); 45 | extern float Rand(float min, float max); 46 | extern double Rand(double min, double max); 47 | 48 | 49 | //////////////////////////////////////////////////////////////////////////////// 50 | // intrusive_ptr 51 | 52 | // boost::noncopyable, moved here so we don't have dependency on boost 53 | class noncopyable { 54 | protected: 55 | noncopyable() {} 56 | ~noncopyable() {} 57 | private: 58 | noncopyable(const noncopyable&) { } 59 | const noncopyable& operator=(const noncopyable&) { return *this; } 60 | }; 61 | 62 | class CudaBase : public noncopyable { 63 | public: 64 | CudaBase() : _ref(0) { } 65 | virtual ~CudaBase() { } 66 | virtual long AddRef() { 67 | // return BOOST_INTERLOCKED_INCREMENT(&_ref); 68 | return ++_ref; 69 | } 70 | virtual void Release() { 71 | // if(!BOOST_INTERLOCKED_DECREMENT(&_ref)) delete this; 72 | if(!--_ref) delete this; 73 | } 74 | private: 75 | long _ref; 76 | }; 77 | 78 | inline long intrusive_ptr_add_ref(CudaBase* base) { 79 | return base->AddRef(); 80 | } 81 | 82 | inline void intrusive_ptr_release(CudaBase* base) { 83 | base->Release(); 84 | } 85 | 86 | template 87 | class intrusive_ptr { 88 | public: 89 | intrusive_ptr() : _p(0) { } 90 | explicit intrusive_ptr(T* p) : _p(p) { 91 | if(p) intrusive_ptr_add_ref(p); 92 | } 93 | intrusive_ptr(const intrusive_ptr& rhs) : _p(rhs._p) { 94 | if(_p) intrusive_ptr_add_ref(_p); 95 | } 96 | ~intrusive_ptr() { 97 | if(_p) intrusive_ptr_release(_p); 98 | } 99 | intrusive_ptr& operator=(const intrusive_ptr& rhs) { 100 | intrusive_ptr(rhs.get()).swap(*this); 101 | return *this; 102 | } 103 | 104 | void reset(T* p = 0) { 105 | 
intrusive_ptr(p).swap(*this); 106 | } 107 | T* release() { 108 | T* p = _p; 109 | _p = 0; 110 | return p; 111 | } 112 | 113 | T* get() const { return _p; } 114 | operator T*() const { return _p; } 115 | T* operator->() const { return _p; } 116 | 117 | void swap(intrusive_ptr& rhs) { 118 | std::swap(_p, rhs._p); 119 | } 120 | private: 121 | T* _p; 122 | }; 123 | 124 | } // namespace mgpu 125 | -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/camera_controller_3d.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "camera_controller_3d.h" 3 | 4 | namespace Mochimazui { 5 | 6 | void CameraController3D::init(int width, int height) { 7 | _sceneWidth = width; 8 | _sceneHeight = height; 9 | _rotateCenter = glm::vec3(width / 2.f, height / 2.f, 0.f); 10 | _rotateNormal = glm::vec3(0.f, 0.f, 1.f); 11 | _walkx = glm::vec3(1.f, 0.f, 0.f); 12 | _walky = glm::vec3(0.f, -1.f, 0.f); 13 | } 14 | 15 | void CameraController3D::fitToView(int vWidth, int vHeight) { 16 | 17 | //vWidth += 1; 18 | //vHeight += 1; 19 | 20 | Camera3D::reset(); 21 | 22 | float sw = (float)vWidth / (float)_sceneWidth; 23 | float sh = (float)vHeight / (float)_sceneHeight; 24 | float s = sw < sh ? sw : sh; 25 | Camera3D::scale(glm::vec3(s, s, s)); 26 | 27 | float nw = _sceneWidth * s; 28 | float nh = _sceneHeight * s; 29 | 30 | float dw = abs(vWidth - nw) *.5f; 31 | float dh = abs(vHeight - nh) * .5f; 32 | 33 | Camera3D::translate(glm::vec3(dw, dh, 0)); 34 | 35 | init(vWidth, vHeight); 36 | } 37 | 38 | // 39 | glm::mat4x4 CameraController3D::projectionMatrix() { 40 | 41 | float W = (float)_sceneWidth; 42 | float H = (float)_sceneHeight; 43 | float W2 = W / 2.f; 44 | float H2 = H / 2.f; 45 | 46 | float S = W < H ? 
W : H; 47 | 48 | auto mat = 49 | glm::mat4x4( 50 | 1, 0, 0, 0, 51 | 0, 1, 0, 0, 52 | 0, 0, 1, 0, 53 | W2, H2, -S, 1 54 | ) 55 | * 56 | glm::mat4x4( 57 | S, 0, 0, 0, 58 | 0, S, 0, 0, 59 | 0, 0, 1, 1, 60 | 0, 0, 0, 0 61 | ) 62 | * 63 | glm::mat4x4( 64 | 1, 0, 0, 0, 65 | 0, 1, 0, 0, 66 | 0, 0, 1, 0, 67 | -W2, -H2, S, 1 68 | ); 69 | 70 | return mat; 71 | } 72 | 73 | // 74 | glm::mat4x4 CameraController3D::modelViewMatrix() { 75 | return Camera3D::matrix(); 76 | } 77 | 78 | void CameraController3D::leftButtonDown(int x, int y) { 79 | _leftButton = true; 80 | _lastPos = _leftButtonClickPos = ivec2(x, y); 81 | } 82 | 83 | void CameraController3D::leftButtonUp(int x, int y) { 84 | _leftButton = false; 85 | } 86 | 87 | void CameraController3D::rightButtonDown(int x, int y) { 88 | _rightButton = true; 89 | _lastPos = _rightButtonClickPos = ivec2(x, y); 90 | } 91 | 92 | void CameraController3D::rightButtonUp(int x, int y) { 93 | _rightButton = false; 94 | } 95 | 96 | void CameraController3D::move(int x, int y) { 97 | switch (_controllerMode) { 98 | case CCM_NULL: 99 | break; 100 | case CCM_MOVE: 101 | handleMove(x, y); 102 | break; 103 | case CCM_TURN: 104 | handleTurn(x, y); 105 | break; 106 | case CCM_ROTATE: 107 | handleRotate(x, y); 108 | break; 109 | default: 110 | throw std::runtime_error("CameraController3D::unsupported controller mode"); 111 | } 112 | } 113 | 114 | // -------- -------- -------- -------- -------- -------- -------- -------- 115 | void CameraController3D::handleMove(int x, int y) { 116 | ivec2 cp(x, y); 117 | if (_leftButton || _rightButton) { 118 | ivec2 delta = cp - _lastPos; 119 | delta.y *= -1; 120 | Camera3D::translate(delta.x * _walkx + delta.y * _walky); 121 | } 122 | _lastPos = cp; 123 | } 124 | 125 | void CameraController3D::handleTurn(int x, int y) { 126 | 127 | if (!_leftButton) { return;} 128 | 129 | static const double RV = 0.01; 130 | ivec2 cp(x, y); 131 | glm::vec2 delta = cp - _lastPos; 132 | //delta.y *= -1; 133 | delta *= RV; 134 | 135 
| auto tv3 = [](const glm::mat4x4& m, glm::vec3 &r3) { 136 | auto r4 = glm::vec4(r3.x, r3.y, r3.z, 0.f); 137 | r4 = m * r4; 138 | r4 = glm::normalize(r4); 139 | r3 = glm::vec3(r4.x, r4.y, r4.z); 140 | }; 141 | 142 | Camera3D::turn_y(delta.y, _rotateCenter.y); 143 | auto m = glm::rotate(delta.y, glm::vec3(1, 0, 0)); 144 | tv3(m, _rotateNormal); 145 | tv3(m, _walkx); 146 | tv3(m, _walky); 147 | 148 | _lastPos = cp; 149 | } 150 | 151 | void CameraController3D::handleRotate(int x, int y) { 152 | 153 | if (!_leftButton) { return; } 154 | 155 | static const double RV = 0.001; 156 | ivec2 cp(x, y); 157 | glm::vec2 delta = cp - _lastPos; 158 | delta.y *= -1; 159 | delta *= RV; 160 | 161 | auto tv3 = [](const glm::mat4x4& m, glm::vec3 &r3) { 162 | auto r4 = glm::vec4(r3.x, r3.y, r3.z, 0.f); 163 | r4 = m * r4; 164 | r4 = glm::normalize(r4); 165 | r3 = glm::vec3(r4.x, r4.y, r4.z); 166 | }; 167 | 168 | Camera3D::rotate_cn(delta.x, _rotateCenter, _rotateNormal); 169 | auto m = glm::rotate(delta.x, _rotateNormal); 170 | 171 | _lastPos = cp; 172 | } 173 | 174 | } -------------------------------------------------------------------------------- /gpu-scanline/src/rasterizer/shared/ras_define.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _MOCHIMAZUI_RASTERIZER_SHARED_DEFINE_H_ 3 | #define _MOCHIMAZUI_RASTERIZER_SHARED_DEFINE_H_ 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | // -------- -------- -------- -------- -------- -------- -------- -------- 11 | // CONST 12 | 13 | //#define VG_RASTERIZER_FRAGMENT_TEXTURE_WIDTH (1024) 14 | //#define VG_RASTERIZER_FRAGMENT_TEXTURE_HEIGHT (1024) 15 | 16 | //#define VG_RASTERIZER_FRAGMENT_TEXTURE_WIDTH (2048) 17 | //#define VG_RASTERIZER_FRAGMENT_TEXTURE_HEIGHT (2048) 18 | 19 | #define VG_RASTERIZER_FRAGMENT_TEXTURE_WIDTH (4096) 20 | #define VG_RASTERIZER_FRAGMENT_TEXTURE_HEIGHT (4096) 21 | 22 | #define VG_RASTERIZER_BIG_FRAGMENT_SIZE (2) 23 | 24 | // -------- -------- -------- 
-------- -------- -------- -------- -------- 25 | // CUDA 26 | 27 | #define CUDA_DEVICE_SYNC_AND_CHECK_ERROR(msg) { \ 28 | cudaDeviceSynchronize(); __getLastCudaError (msg, __FILE__, __LINE__); } 29 | 30 | // 31 | #ifdef _DEBUG 32 | #define DEBUG_CUDA_DEVICE_SYNC() cudaDeviceSynchronize() 33 | #define DEBUG_CUDA_DEVICE_SYNC_AND_CHECK_ERROR(msg) { \ 34 | cudaDeviceSynchronize(); __getLastCudaError (msg, __FILE__, __LINE__); } 35 | #else 36 | #define DEBUG_CUDA_DEVICE_SYNC() 37 | #define DEBUG_CUDA_DEVICE_SYNC_AND_CHECK_ERROR(msg) 38 | //#define DEBUG_CUDA_DEVICE_SYNC_AND_CHECK_ERROR(msg) { \ 39 | // cudaDeviceSynchronize(); __getLastCudaError (msg, __FILE__, __LINE__); } 40 | //#define DEBUG_CUDA_DEVICE_SYNC_AND_CHECK_ERROR(msg) {timer_print(msg,__FILE__, __LINE__);} 41 | #endif 42 | 43 | #define GET_ID() (blockDim.x * blockIdx.x + threadIdx.x) 44 | 45 | inline int divup(int a, int b) { return (a + (b - 1)) / b; } 46 | #define LAUNCH(kernel,N,NT,args) {kernel <<< divup(N,NT),NT >>>args; \ 47 | DEBUG_CUDA_DEVICE_SYNC_AND_CHECK_ERROR(#kernel);} 48 | //#define LAUNCH(kernel,N,NT,args) {kernel <<< (N+(NT-1))/NT,NT >>>args; \ 49 | // DEBUG_CUDA_DEVICE_SYNC_AND_CHECK_ERROR(#kernel);} 50 | 51 | #define ASSERT(a) 52 | 53 | // -------- -------- -------- -------- -------- -------- -------- -------- 54 | // GL 55 | 56 | #ifdef _DEBUG 57 | #define DEBUG_GL_FINISH() glFinish() 58 | #else 59 | #define DEBUG_GL_FINISH() 60 | //#define DEBUG_GL_FINISH() glFinish() 61 | #endif 62 | 63 | #define SHADER_DEFINE_TEXT(text) #text 64 | #define SHADER_DEFINE(name) #name" "SHADER_DEFINE_TEXT(name) 65 | #define SHADER_REDEFINE(new_name, old_name) #new_name" "SHADER_DEFINE_TEXT(old_name) 66 | 67 | #ifdef __CUDACC__ 68 | #ifndef __ldg 69 | #define __ldg(a) (*(a)) 70 | #pragma comment( user, "__ldg not defined" ) 71 | #endif 72 | #endif 73 | 74 | // -------- -------- -------- -------- -------- -------- -------- -------- 75 | //#define QM_MASK_TABLE_RES 256 76 | //#define 
QM_MASK_TABLE_PACKING_SCALE 0.5f 77 | //#define QM_MASK_TABLE_N_SAMPLES 128 78 | //#define QM_MASK_TABLE_FETCH_TEST_RES 33 79 | 80 | // -------- -------- -------- -------- -------- -------- -------- -------- 81 | 82 | //#define ENABLE_MPVG_SHIFT 83 | //#define ENABLE_NVPR_SHIFT 84 | 85 | // -------- -------- -------- -------- -------- -------- -------- -------- 86 | // -------- -------- -------- -------- -------- -------- -------- -------- 87 | // path visible flag: 88 | // 89 | // y > height 90 | // 91 | // 5 | 6 | 7 92 | // x < 0 3 | x | 4 x > width 93 | // 0 | 1 | 2 94 | // 95 | // y < 0 96 | // 97 | 98 | // -------- -------- -------- -------- 99 | // Big endian 100 | // 012: 11100000 0x0101010000000000 : ~ 0x0000000101010101 101 | // 247: 00101001 0x0000010001000001 : ~ 0x0101000100010100 102 | // 567: 00000111 0x0000000000010101 : ~ 0x0101010101000000 103 | // 035: 10010100 0x0100000100010000 : ~ 0x0001010001000101 104 | 105 | //#define PATH_INVISIBLE(mask) ( \ 106 | // (!(mask & 0x0000000101010101)) \ 107 | // || (!(mask & 0x0101000100010100)) \ 108 | // || (!(mask & 0x0101010101000000)) \ 109 | // || (!(mask & 0x0001010001000101)) \ 110 | //) 111 | 112 | // -------- -------- -------- -------- 113 | // little endian 114 | // 012: 11100000 00000111 : ~ 0x0101010101000000 115 | // 247: 00101001 10010100 : ~ 0x0001010001000101 116 | // 567: 00000111 11100000 : ~ 0x0000000101010101 117 | // 035: 10010100 00101001 : ~ 0x0101000100010100 118 | 119 | #define PATH_INVISIBLE(mask) ( \ 120 | (!(mask & 0x0101010101000000)) \ 121 | || (!(mask & 0x0001010001000101)) \ 122 | || (!(mask & 0x0000000101010101)) \ 123 | || (!(mask & 0x0101000100010100)) \ 124 | ) 125 | 126 | // -------- -------- -------- -------- 127 | #define PATH_VISIBLE(mask) (!(PATH_INVISIBLE(mask))) 128 | 129 | #endif 130 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## 
Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | 4 | 3rd/ 5 | 6 | *.swp 7 | *.dll 8 | *.opendb 9 | *.xlsx 10 | 11 | # User-specific files 12 | *.suo 13 | *.user 14 | *.userosscache 15 | *.sln.docstates 16 | 17 | # User-specific files (MonoDevelop/Xamarin Studio) 18 | *.userprefs 19 | 20 | # Build results 21 | [Dd]ebug/ 22 | [Dd]ebugPublic/ 23 | [Rr]elease/ 24 | [Rr]eleases/ 25 | x64/ 26 | x86/ 27 | build/ 28 | bld/ 29 | [Bb]in/ 30 | [Oo]bj/ 31 | 32 | # Visual Studio 2015 cache/options directory 33 | .vs/ 34 | # Uncomment if you have tasks that create the project's static files in wwwroot 35 | #wwwroot/ 36 | 37 | # MSTest test Results 38 | [Tt]est[Rr]esult*/ 39 | [Bb]uild[Ll]og.* 40 | 41 | # NUNIT 42 | *.VisualState.xml 43 | TestResult.xml 44 | 45 | # Build Results of an ATL Project 46 | [Dd]ebugPS/ 47 | [Rr]eleasePS/ 48 | dlldata.c 49 | 50 | # DNX 51 | project.lock.json 52 | artifacts/ 53 | 54 | *_i.c 55 | *_p.c 56 | *_i.h 57 | *.ilk 58 | *.meta 59 | *.obj 60 | *.pch 61 | *.pdb 62 | *.pgc 63 | *.pgd 64 | *.rsp 65 | *.sbr 66 | *.tlb 67 | *.tli 68 | *.tlh 69 | *.tmp 70 | *.tmp_proj 71 | *.log 72 | *.vspscc 73 | *.vssscc 74 | .builds 75 | *.pidb 76 | *.svclog 77 | *.scc 78 | 79 | # Chutzpah Test files 80 | _Chutzpah* 81 | 82 | # Visual C++ cache files 83 | ipch/ 84 | *.aps 85 | *.ncb 86 | *.opensdf 87 | *.sdf 88 | *.cachefile 89 | 90 | # Visual Studio profiler 91 | *.psess 92 | *.vsp 93 | *.vspx 94 | *.sap 95 | 96 | # TFS 2012 Local Workspace 97 | $tf/ 98 | 99 | # Guidance Automation Toolkit 100 | *.gpState 101 | 102 | # ReSharper is a .NET coding add-in 103 | _ReSharper*/ 104 | *.[Rr]e[Ss]harper 105 | *.DotSettings.user 106 | 107 | # JustCode is a .NET coding add-in 108 | .JustCode 109 | 110 | # TeamCity is a build add-in 111 | _TeamCity* 112 | 113 | # DotCover is a Code Coverage Tool 114 | *.dotCover 115 | 116 | # NCrunch 117 | _NCrunch_* 118 | .*crunch*.local.xml 119 | nCrunchTemp_* 120 | 121 | # 
MightyMoose 122 | *.mm.* 123 | AutoTest.Net/ 124 | 125 | # Web workbench (sass) 126 | .sass-cache/ 127 | 128 | # Installshield output folder 129 | [Ee]xpress/ 130 | 131 | # DocProject is a documentation generator add-in 132 | DocProject/buildhelp/ 133 | DocProject/Help/*.HxT 134 | DocProject/Help/*.HxC 135 | DocProject/Help/*.hhc 136 | DocProject/Help/*.hhk 137 | DocProject/Help/*.hhp 138 | DocProject/Help/Html2 139 | DocProject/Help/html 140 | 141 | # Click-Once directory 142 | publish/ 143 | 144 | # Publish Web Output 145 | *.[Pp]ublish.xml 146 | *.azurePubxml 147 | # TODO: Comment the next line if you want to checkin your web deploy settings 148 | # but database connection strings (with potential passwords) will be unencrypted 149 | *.pubxml 150 | *.publishproj 151 | 152 | # NuGet Packages 153 | *.nupkg 154 | # The packages folder can be ignored because of Package Restore 155 | **/packages/* 156 | # except build/, which is used as an MSBuild target. 157 | !**/packages/build/ 158 | # Uncomment if necessary however generally it will be regenerated when needed 159 | #!**/packages/repositories.config 160 | 161 | # Windows Azure Build Output 162 | csx/ 163 | *.build.csdef 164 | 165 | # Windows Store app package directory 166 | AppPackages/ 167 | 168 | # Visual Studio cache files 169 | # files ending in .cache can be ignored 170 | *.[Cc]ache 171 | # but keep track of directories ending in .cache 172 | !*.[Cc]ache/ 173 | 174 | # Others 175 | ClientBin/ 176 | [Ss]tyle[Cc]op.* 177 | ~$* 178 | *~ 179 | *.dbmdl 180 | *.dbproj.schemaview 181 | *.pfx 182 | *.publishsettings 183 | node_modules/ 184 | orleans.codegen.cs 185 | 186 | # RIA/Silverlight projects 187 | Generated_Code/ 188 | 189 | # Backup & report files from converting an old project file 190 | # to a newer Visual Studio version. 
Backup files are not needed, 191 | # because we have git ;-) 192 | _UpgradeReport_Files/ 193 | Backup*/ 194 | UpgradeLog*.XML 195 | UpgradeLog*.htm 196 | 197 | # SQL Server files 198 | *.mdf 199 | *.ldf 200 | 201 | # Business Intelligence projects 202 | *.rdl.data 203 | *.bim.layout 204 | *.bim_*.settings 205 | 206 | # Microsoft Fakes 207 | FakesAssemblies/ 208 | 209 | # Node.js Tools for Visual Studio 210 | .ntvs_analysis.dat 211 | 212 | # Visual Studio 6 build log 213 | *.plg 214 | 215 | # Visual Studio 6 workspace options file 216 | *.opt 217 | 218 | # Visual Studio LightSwitch build output 219 | **/*.HTMLClient/GeneratedArtifacts 220 | **/*.DesktopClient/GeneratedArtifacts 221 | **/*.DesktopClient/ModelManifest.xml 222 | **/*.Server/GeneratedArtifacts 223 | **/*.Server/ModelManifest.xml 224 | _Pvt_Extensions 225 | 226 | -------------------------------------------------------------------------------- /gpu-scanline/src/modern_gpu/include/sparsematrix.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "util/static.h" 38 | 39 | namespace mgpu { 40 | 41 | struct SparseMatrix { 42 | int height, width, nz; 43 | std::vector csr; // height 44 | std::vector cols; // nz 45 | std::vector matrix; // nz 46 | }; 47 | 48 | bool ReadSparseMatrix(FILE* f, std::auto_ptr* ppMatrix, 49 | std::string& err); 50 | 51 | bool ReadSparseMatrix(const char* filename, 52 | std::auto_ptr* ppMatrix, std::string& err); 53 | 54 | bool LoadBinaryMatrix(const char* filename, 55 | std::auto_ptr* ppMatrix); 56 | 57 | bool StoreBinaryMatrix(const char* filename, const SparseMatrix& matrix); 58 | 59 | bool LoadCachedMatrix(const char* filename, 60 | std::auto_ptr* ppMatrix, std::string& err); 61 | 62 | // Multiply the matrix by a vector of 1s. 
63 | template 64 | void SpmvTest(const SparseMatrix& m, T* results) { 65 | memset(results, 0, sizeof(T) * m.height); 66 | for(int row = 0; row < m.height; ++row) { 67 | T product = 0; 68 | int begin = m.csr[row]; 69 | int end = (row + 1 < m.height) ? m.csr[row + 1] : m.nz; 70 | for(int i = begin; i < end; ++i) 71 | product += (T)m.matrix[i]; 72 | 73 | results[row] = product; 74 | } 75 | } 76 | 77 | template 78 | void CompareVecs(const T* test, const T* ref, int count) { 79 | for(int i = 0; i < count; ++i) { 80 | double x = ref[i]; 81 | double y = test[i]; 82 | double diff = fabs(x - y); 83 | 84 | if(diff > 1.0e-5) { 85 | if(y > 0) { 86 | if(1.01 * x < y || 0.99 * x > y) { 87 | printf("BAD OUTPUT AT COMPONENT %d: %8.5e vs %8.5e\n", i, 88 | x, y); 89 | // exit(0); 90 | return; 91 | } 92 | } else { 93 | if(1.01 * x > y || 0.99 * x < y) { 94 | printf("BAD OUTPUT AT COMPONENT %d: %8.5e vs %8.5e\n", i, 95 | x, y); 96 | // exit(0); 97 | return; 98 | } 99 | } 100 | } 101 | } 102 | } 103 | 104 | struct MatrixStats { 105 | int height, width, nz; 106 | 107 | // Row density moments: 108 | double mean; 109 | double stddev; 110 | double skewness; 111 | }; 112 | 113 | MatrixStats ComputeMatrixStats(const SparseMatrix& m); 114 | 115 | int64 MulSparseMatrices(const SparseMatrix& A, const SparseMatrix& B, 116 | std::auto_ptr* ppC); 117 | 118 | 119 | int64 ComputeProductCount(const SparseMatrix& A, const SparseMatrix& B); 120 | 121 | void ComputeColRanges(const SparseMatrix& A, const SparseMatrix& B, 122 | int* colMin, int* colMax); 123 | 124 | } // namespace mgpu 125 | -------------------------------------------------------------------------------- /gpu-scanline/src/rapidxml_iterators.hpp: -------------------------------------------------------------------------------- 1 | #ifndef RAPIDXML_ITERATORS_HPP_INCLUDED 2 | #define RAPIDXML_ITERATORS_HPP_INCLUDED 3 | 4 | // Copyright (C) 2006, 2009 Marcin Kalicinski 5 | // Version 1.13 6 | // Revision $DateTime: 2009/05/13 01:46:17 $ 7 | 
//! \file rapidxml_iterators.hpp This file contains rapidxml iterators 8 | 9 | #include "rapidxml.hpp" 10 | 11 | namespace rapidxml 12 | { 13 | 14 | //! Iterator of child nodes of xml_node 15 | template 16 | class node_iterator 17 | { 18 | 19 | public: 20 | 21 | typedef typename xml_node value_type; 22 | typedef typename xml_node &reference; 23 | typedef typename xml_node *pointer; 24 | typedef std::ptrdiff_t difference_type; 25 | typedef std::bidirectional_iterator_tag iterator_category; 26 | 27 | node_iterator() 28 | : m_node(0) 29 | { 30 | } 31 | 32 | node_iterator(xml_node *node) 33 | : m_node(node->first_node()) 34 | { 35 | } 36 | 37 | reference operator *() const 38 | { 39 | assert(m_node); 40 | return *m_node; 41 | } 42 | 43 | pointer operator->() const 44 | { 45 | assert(m_node); 46 | return m_node; 47 | } 48 | 49 | node_iterator& operator++() 50 | { 51 | assert(m_node); 52 | m_node = m_node->next_sibling(); 53 | return *this; 54 | } 55 | 56 | node_iterator operator++(int) 57 | { 58 | node_iterator tmp = *this; 59 | ++this; 60 | return tmp; 61 | } 62 | 63 | node_iterator& operator--() 64 | { 65 | assert(m_node && m_node->previous_sibling()); 66 | m_node = m_node->previous_sibling(); 67 | return *this; 68 | } 69 | 70 | node_iterator operator--(int) 71 | { 72 | node_iterator tmp = *this; 73 | ++this; 74 | return tmp; 75 | } 76 | 77 | bool operator ==(const node_iterator &rhs) 78 | { 79 | return m_node == rhs.m_node; 80 | } 81 | 82 | bool operator !=(const node_iterator &rhs) 83 | { 84 | return m_node != rhs.m_node; 85 | } 86 | 87 | private: 88 | 89 | xml_node *m_node; 90 | 91 | }; 92 | 93 | //! 
Iterator of child attributes of xml_node 94 | template 95 | class attribute_iterator 96 | { 97 | 98 | public: 99 | 100 | typedef typename xml_attribute value_type; 101 | typedef typename xml_attribute &reference; 102 | typedef typename xml_attribute *pointer; 103 | typedef std::ptrdiff_t difference_type; 104 | typedef std::bidirectional_iterator_tag iterator_category; 105 | 106 | attribute_iterator() 107 | : m_attribute(0) 108 | { 109 | } 110 | 111 | attribute_iterator(xml_node *node) 112 | : m_attribute(node->first_attribute()) 113 | { 114 | } 115 | 116 | reference operator *() const 117 | { 118 | assert(m_attribute); 119 | return *m_attribute; 120 | } 121 | 122 | pointer operator->() const 123 | { 124 | assert(m_attribute); 125 | return m_attribute; 126 | } 127 | 128 | attribute_iterator& operator++() 129 | { 130 | assert(m_attribute); 131 | m_attribute = m_attribute->next_attribute(); 132 | return *this; 133 | } 134 | 135 | attribute_iterator operator++(int) 136 | { 137 | attribute_iterator tmp = *this; 138 | ++this; 139 | return tmp; 140 | } 141 | 142 | attribute_iterator& operator--() 143 | { 144 | assert(m_attribute && m_attribute->previous_attribute()); 145 | m_attribute = m_attribute->previous_attribute(); 146 | return *this; 147 | } 148 | 149 | attribute_iterator operator--(int) 150 | { 151 | attribute_iterator tmp = *this; 152 | ++this; 153 | return tmp; 154 | } 155 | 156 | bool operator ==(const attribute_iterator &rhs) 157 | { 158 | return m_attribute == rhs.m_attribute; 159 | } 160 | 161 | bool operator !=(const attribute_iterator &rhs) 162 | { 163 | return m_attribute != rhs.m_attribute; 164 | } 165 | 166 | private: 167 | 168 | xml_attribute *m_attribute; 169 | 170 | }; 171 | 172 | } 173 | 174 | #endif 175 | -------------------------------------------------------------------------------- /gpu-scanline/src/modern_gpu/include/mmio.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Matrix Market I/O library 
for ANSI C 3 | * 4 | * See http://math.nist.gov/MatrixMarket for details. 5 | * 6 | * 7 | */ 8 | 9 | #ifndef MM_IO_H 10 | #define MM_IO_H 11 | 12 | #define MM_MAX_LINE_LENGTH 1025 13 | #define MatrixMarketBanner "%%MatrixMarket" 14 | #define MM_MAX_TOKEN_LENGTH 64 15 | 16 | typedef char MM_typecode[4]; 17 | 18 | char *mm_typecode_to_str(MM_typecode matcode); 19 | 20 | int mm_read_banner(FILE *f, MM_typecode *matcode); 21 | int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz); 22 | int mm_read_mtx_array_size(FILE *f, int *M, int *N); 23 | 24 | int mm_write_banner(FILE *f, MM_typecode matcode); 25 | int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz); 26 | int mm_write_mtx_array_size(FILE *f, int M, int N); 27 | 28 | 29 | /********************* MM_typecode query fucntions ***************************/ 30 | 31 | #define mm_is_matrix(typecode) ((typecode)[0]=='M') 32 | 33 | #define mm_is_sparse(typecode) ((typecode)[1]=='C') 34 | #define mm_is_coordinate(typecode)((typecode)[1]=='C') 35 | #define mm_is_dense(typecode) ((typecode)[1]=='A') 36 | #define mm_is_array(typecode) ((typecode)[1]=='A') 37 | 38 | #define mm_is_complex(typecode) ((typecode)[2]=='C') 39 | #define mm_is_real(typecode) ((typecode)[2]=='R') 40 | #define mm_is_pattern(typecode) ((typecode)[2]=='P') 41 | #define mm_is_integer(typecode) ((typecode)[2]=='I') 42 | 43 | #define mm_is_symmetric(typecode)((typecode)[3]=='S') 44 | #define mm_is_general(typecode) ((typecode)[3]=='G') 45 | #define mm_is_skew(typecode) ((typecode)[3]=='K') 46 | #define mm_is_hermitian(typecode)((typecode)[3]=='H') 47 | 48 | int mm_is_valid(MM_typecode matcode); /* too complex for a macro */ 49 | 50 | 51 | /********************* MM_typecode modify fucntions ***************************/ 52 | 53 | #define mm_set_matrix(typecode) ((*typecode)[0]='M') 54 | #define mm_set_coordinate(typecode) ((*typecode)[1]='C') 55 | #define mm_set_array(typecode) ((*typecode)[1]='A') 56 | #define mm_set_dense(typecode) 
mm_set_array(typecode) 57 | #define mm_set_sparse(typecode) mm_set_coordinate(typecode) 58 | 59 | #define mm_set_complex(typecode)((*typecode)[2]='C') 60 | #define mm_set_real(typecode) ((*typecode)[2]='R') 61 | #define mm_set_pattern(typecode)((*typecode)[2]='P') 62 | #define mm_set_integer(typecode)((*typecode)[2]='I') 63 | 64 | 65 | #define mm_set_symmetric(typecode)((*typecode)[3]='S') 66 | #define mm_set_general(typecode)((*typecode)[3]='G') 67 | #define mm_set_skew(typecode) ((*typecode)[3]='K') 68 | #define mm_set_hermitian(typecode)((*typecode)[3]='H') 69 | 70 | #define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \ 71 | (*typecode)[2]=' ',(*typecode)[3]='G') 72 | 73 | #define mm_initialize_typecode(typecode) mm_clear_typecode(typecode) 74 | 75 | 76 | /********************* Matrix Market error codes ***************************/ 77 | 78 | 79 | #define MM_COULD_NOT_READ_FILE 11 80 | #define MM_PREMATURE_EOF 12 81 | #define MM_NOT_MTX 13 82 | #define MM_NO_HEADER 14 83 | #define MM_UNSUPPORTED_TYPE 15 84 | #define MM_LINE_TOO_LONG 16 85 | #define MM_COULD_NOT_WRITE_FILE 17 86 | 87 | 88 | /******************** Matrix Market internal definitions ******************** 89 | 90 | MM_matrix_typecode: 4-character sequence 91 | 92 | ojbect sparse/ data storage 93 | dense type scheme 94 | 95 | string position: [0] [1] [2] [3] 96 | 97 | Matrix typecode: M(atrix) C(oord) R(eal) G(eneral) 98 | A(array) C(omplex) H(ermitian) 99 | P(attern) S(ymmetric) 100 | I(nteger) K(kew) 101 | 102 | ***********************************************************************/ 103 | 104 | #define MM_MTX_STR "matrix" 105 | #define MM_ARRAY_STR "array" 106 | #define MM_DENSE_STR "array" 107 | #define MM_COORDINATE_STR "coordinate" 108 | #define MM_SPARSE_STR "coordinate" 109 | #define MM_COMPLEX_STR "complex" 110 | #define MM_REAL_STR "real" 111 | #define MM_INT_STR "integer" 112 | #define MM_GENERAL_STR "general" 113 | #define MM_SYMM_STR "symmetric" 114 | #define MM_HERM_STR 
"hermitian" 115 | #define MM_SKEW_STR "skew-symmetric" 116 | #define MM_PATTERN_STR "pattern" 117 | 118 | 119 | /* high level routines */ 120 | 121 | int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[], 122 | double val[], MM_typecode matcode); 123 | int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[], 124 | double val[], MM_typecode matcode); 125 | int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img, 126 | MM_typecode matcode); 127 | 128 | int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_, 129 | double **val_, int **I_, int **J_); 130 | 131 | 132 | 133 | #endif 134 | -------------------------------------------------------------------------------- /gpu-scanline/src/rasterizer/R_cut_A_mask_comb_scanline/ras_cut_mask_comb_scanline.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "../shared/ras_base.h" 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #include "mochimazui/3rd/gl_4_5_compatibility.h" 12 | #include 13 | #include "mochimazui/glpp.h" 14 | #include "mochimazui/cuda_array.h" 15 | 16 | namespace Mochimazui { 17 | 18 | class VGContainer; 19 | 20 | namespace Rasterizer_R_Cut_A_Mask_Comb_Scanline { 21 | 22 | using GLPP::NamedBuffer; 23 | using GLPP::NamedFramebuffer; 24 | using GLPP::NamedTexture; 25 | using GLPP::ShaderProgram; 26 | 27 | using CUDATL::CUDAArray; 28 | 29 | class VGRasterizer : public RasterizerBase::VGRasterizer { 30 | 31 | typedef RasterizerBase::VGRasterizer _Base; 32 | 33 | public: 34 | VGRasterizer(); 35 | ~VGRasterizer(); 36 | 37 | void init(); 38 | void uninit(); 39 | 40 | void addVg(const VGContainer &vgc); 41 | void clear() {} 42 | 43 | void setFragmentSize(int s) { _fragSize = s; } 44 | 45 | void rasterizeImpl(); 46 | 47 | private: 48 | void initProgram(); 49 | void initBuffer(); 50 | void initFramebuffer(); 51 | 52 | void initCommandList(); 53 | void 
uninitCommandList(); 54 | 55 | void onResize(int _width, int _height); 56 | 57 | private: 58 | void initQMMaskTable(); 59 | 60 | template 61 | void rasterizeImpl(); 62 | 63 | protected: 64 | 65 | uint32_t _fragSize = 2; 66 | 67 | // for debug. 68 | bool _dbgDumpWindingNumber = false; 69 | bool _dbgDumpFragmentData = false; 70 | 71 | struct _GL{ 72 | _GL() {} 73 | struct _GL_Buffer{ 74 | _GL_Buffer() {} 75 | 76 | NamedBuffer stencilDrawData; 77 | NamedBuffer stencilDrawMask; 78 | 79 | NamedBuffer outputIndex; 80 | NamedBuffer outputFragmentData; 81 | NamedBuffer outputSpanData; 82 | NamedBuffer outputFillInfo; 83 | 84 | NamedBuffer qm_output_stencil_mask; 85 | 86 | // -- debug -- 87 | NamedBuffer dbgCurveVertex; 88 | NamedBuffer dbgCurveColor; 89 | 90 | NamedBuffer dbgDrawStencilDump_0; 91 | NamedBuffer dbgDrawStencilDump_1; 92 | NamedBuffer dbgDrawStencilDump_2; 93 | } buffer; 94 | 95 | struct _GL_Texture{ 96 | _GL_Texture() {} 97 | 98 | // texbuffer 99 | NamedTexture stencilDrawData; 100 | NamedTexture stencilDrawMask; 101 | 102 | NamedTexture outputIndex; 103 | NamedTexture outputFragmentData; 104 | NamedTexture outputSpanData; 105 | NamedTexture outputFillInfo; 106 | 107 | // tex2D 108 | NamedTexture stencilDraw; 109 | 110 | // -- debug -- 111 | NamedTexture dbgCurveVertex; 112 | NamedTexture dbgCurveColor; 113 | 114 | NamedTexture dbgDrawCount; 115 | 116 | NamedTexture dbgDrawStencilDump_0; 117 | NamedTexture dbgDrawStencilDump_1; 118 | NamedTexture dbgDrawStencilDump_2; 119 | } texture; 120 | 121 | struct _GL_Framebuffer{ 122 | _GL_Framebuffer() {} 123 | NamedFramebuffer stencilDrawMS; 124 | } framebuffer; 125 | 126 | struct _GL_Program{ 127 | _GL_Program() {} 128 | 129 | ShaderProgram output; 130 | 131 | // -- debug -- 132 | ShaderProgram dbgCurve; 133 | ShaderProgram dbgCurveFragment; 134 | ShaderProgram dbgOutputScale; 135 | 136 | } program; 137 | 138 | } _gl; 139 | 140 | struct _GPU_Array{ 141 | _GPU_Array() {} 142 | 143 | // transform && stroke to fill 
144 | CUDAArray strokeTransformedVertex; 145 | CUDAArray strokeToFillNewCurveTemp; 146 | 147 | CUDAArray transformedVertex; 148 | 149 | // monotonize 150 | CUDAArray curve_pixel_count; 151 | CUDAArray monotonic_cutpoint_cache; 152 | CUDAArray intersection; 153 | 154 | CUDAArray monoCurveT; 155 | CUDAArray monoCurveNumber; 156 | CUDAArray monoCurveSize; 157 | CUDAArray curveFragmentNumber; 158 | 159 | CUDAArray ic4Context; 160 | 161 | CUDAArray fragmentData; 162 | 163 | // mask 164 | CUDAArray amaskTable; 165 | CUDAArray pmaskTable; 166 | 167 | // temp for CUDA SM gen stencil 168 | CUDAArray blockBoundaryBins; 169 | 170 | // for CUDA cell list output 171 | CUDAArray cellListPos; 172 | CUDAArray cellListFillInfo; 173 | CUDAArray cellListMaskIndex; 174 | 175 | } _gpu; 176 | 177 | struct __CUDA { 178 | __CUDA() {} 179 | struct __CUDAResrouce { 180 | __CUDAResrouce() : 181 | stencilDrawData(nullptr), stencilDrawMask(nullptr), 182 | outputIndex(nullptr), outputFragment(nullptr), 183 | outputSpan(nullptr), outputFillInfo(nullptr) 184 | {} 185 | 186 | cudaGraphicsResource *stencilDrawData = nullptr; 187 | cudaGraphicsResource *stencilDrawMask = nullptr; 188 | 189 | cudaGraphicsResource *outputIndex = nullptr; 190 | cudaGraphicsResource *outputFragment = nullptr; 191 | cudaGraphicsResource *outputSpan = nullptr; 192 | cudaGraphicsResource *outputFillInfo = nullptr; 193 | 194 | cudaGraphicsResource *qm_output_stencil_mask = nullptr; 195 | } resource; 196 | } _cuda; 197 | 198 | CUDAArray _qm_mask_table_pixel8; 199 | CUDAArray _qm_mask_table_pixel32; 200 | 201 | CUDAArray _sample_position; 202 | }; 203 | 204 | } // end of namespace BigFragAM 205 | 206 | } // end of namespace Mochimazui 207 | -------------------------------------------------------------------------------- /gpu-scanline/src/modern_gpu/include/kernels/localitysort.cuh: -------------------------------------------------------------------------------- 1 | 
/****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "../mgpuhost.cuh" 38 | #include "../kernels/mergesort.cuh" 39 | #include "../kernels/segmentedsort.cuh" 40 | 41 | namespace mgpu { 42 | // Sorts count keys starting at data_global with comparator comp: one KernelBlocksort launch followed by SegSortPasses, using a temporary buffer of count elements obtained from context.Malloc. 43 | template 44 | MGPU_HOST void LocalitySortKeys(T* data_global, int count, CudaContext& context, 45 | Comp comp, bool verbose) { 46 | 47 | const int NT = 128; 48 | const int VT = 11; 49 | typedef LaunchBoxVT Tuning; 50 | int2 launch = Tuning::GetLaunchParams(context); 51 | const int NV = launch.x * launch.y; 52 | // NOTE(review): count == 0 makes numBlocks 0 - confirm that the kernel launch below tolerates an empty grid. 53 | int numBlocks = MGPU_DIV_UP(count, NV); 54 | int numPasses = FindLog2(numBlocks, true); 55 | 56 | SegSortSupport support; 57 | MGPU_MEM(byte) mem = AllocSegSortBuffers(count, NV, support, false, 58 | context); 59 | 60 | MGPU_MEM(T) destDevice = context.Malloc(count); 61 | T* source = data_global; 62 | T* dest = destDevice->get(); 63 | // Block sort: when numPasses is odd the output goes to the temporary buffer, otherwise back over the input. 64 | KernelBlocksort 65 | <<>>(source, (const int*)0, 66 | count, (1 & numPasses) ? 
dest : source, (int*)0, comp); 67 | MGPU_SYNC_CHECK("KernelBlocksort"); 68 | // After an odd-pass block sort the data lives in the temporary buffer, so that buffer becomes the source of the merge passes. 69 | if(1 & numPasses) std::swap(source, dest); 70 | 71 | SegSortPasses(support, source, (int*)0, count, 72 | numBlocks, numPasses, dest, (int*)0, comp, context, verbose); 73 | } 74 | template 75 | MGPU_HOST void LocalitySortKeys(T* data_global, int count, CudaContext& context, 76 | bool verbose) { 77 | LocalitySortKeys(data_global, count, context, mgpu::less(), verbose); 78 | } 79 | // Key/value variant: the same two-stage sequence, with a values buffer passed alongside the keys to both stages. 80 | template 81 | MGPU_HOST void LocalitySortPairs(KeyType* keys_global, ValType* values_global, 82 | int count, CudaContext& context, Comp comp, bool verbose) { 83 | 84 | const int NT = 128; 85 | const int VT = 7; 86 | typedef LaunchBoxVT Tuning; 87 | int2 launch = Tuning::GetLaunchParams(context); 88 | const int NV = launch.x * launch.y; 89 | 90 | int numBlocks = MGPU_DIV_UP(count, NV); 91 | int numPasses = FindLog2(numBlocks, true); 92 | 93 | SegSortSupport support; 94 | MGPU_MEM(byte) mem = AllocSegSortBuffers(count, NV, support, false, 95 | context); 96 | 97 | MGPU_MEM(KeyType) keysDestDevice = context.Malloc(count); 98 | MGPU_MEM(ValType) valsDestDevice = context.Malloc(count); 99 | 100 | KeyType* keysSource = keys_global; 101 | KeyType* keysDest = keysDestDevice->get(); 102 | ValType* valsSource = values_global; 103 | ValType* valsDest = valsDestDevice->get(); 104 | 105 | KernelBlocksort<<>>( 106 | keysSource, valsSource, count, (1 & numPasses) ? keysDest : keysSource, 107 | (1 & numPasses) ? 
valsDest : valsSource, comp); 108 | MGPU_SYNC_CHECK("KernelBlocksort"); 109 | // Same parity-driven swap as in LocalitySortKeys, applied to both the key and the value buffers. 110 | if(1 & numPasses) { 111 | std::swap(keysSource, keysDest); 112 | std::swap(valsSource, valsDest); 113 | } 114 | 115 | SegSortPasses(support, keysSource, valsSource, count, 116 | numBlocks, numPasses, keysDest, valsDest, comp, context, verbose); 117 | } 118 | template 119 | MGPU_HOST void LocalitySortPairs(KeyType* keys_global, ValType* values_global, 120 | int count, CudaContext& context, bool verbose) { 121 | LocalitySortPairs(keys_global, values_global, count, context, 122 | mgpu::less(), verbose); 123 | } 124 | 125 | } // namespace mgpu 126 | -------------------------------------------------------------------------------- /gpu-scanline/src/modern_gpu/include/util/format.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "static.h" 38 | #include 39 | #include 40 | #include 41 | #include 42 | 43 | namespace mgpu { 44 | 45 | // Like sprintf but dynamically allocates sufficient output to hold the entire 46 | // text. 
47 | std::string stringprintf(const char* format, ...); 48 | 49 | // Returns xxx.xx(K|M|B) 50 | std::string FormatInteger(int64 x); 51 | 52 | const char* TypeIdString(const std::type_info& ti); 53 | 54 | template 55 | const char* TypeIdName() { 56 | return TypeIdString(typeid(T)); 57 | } 58 | 59 | struct FormatOpPrintf { 60 | const char* format; 61 | FormatOpPrintf(const char* f) : format(f) { } 62 | 63 | template 64 | std::string operator()(int index, T x) const { 65 | return stringprintf(format, x); 66 | } 67 | }; 68 | 69 | struct FormatOpMaskBit { 70 | const char* format; 71 | FormatOpMaskBit(const char* f) : format(f) { } 72 | 73 | std::string operator()(int index, int x) const { 74 | return stringprintf(format, (0x80000000 & x) ? '*' : ' ', 75 | 0x7fffffff & x); 76 | } 77 | }; 78 | 79 | struct FormatOpMarkArray { 80 | const char* format; 81 | const int* marks; 82 | int numMarks; 83 | 84 | FormatOpMarkArray(const char* f, const int* m, int n) : 85 | format(f), marks(m), numMarks(n) { } 86 | 87 | std::string operator()(int index, int x) const { 88 | // Search for index in the array of marks. 89 | bool mark = std::binary_search(marks, marks + numMarks, index); 90 | return stringprintf(format, mark ? 
'*' : ' ', x); 91 | } 92 | }; 93 | 94 | template 95 | std::string FormatArrayOp(const T* data, size_t count, Op op, int numCols) { 96 | std::string s; 97 | size_t numRows = MGPU_DIV_UP(count, numCols); 98 | for(size_t row(0); row < numRows; ++row) { 99 | size_t left = row * numCols; 100 | s.append(stringprintf("%5d: ", left)); 101 | 102 | for(size_t col(left); col < std::min(left + numCols, count); ++col) { 103 | s.append(op(col, data[col])); 104 | s.push_back(' '); 105 | } 106 | s.push_back('\n'); 107 | } 108 | return s; 109 | } 110 | 111 | template 112 | std::string FormatArray(const T* data, size_t count, const char* format, 113 | int numCols) { 114 | return FormatArrayOp(data, count, FormatOpPrintf(format), numCols); 115 | } 116 | 117 | template 118 | std::string FormatArray(const std::vector& data, const char* format, 119 | int numCols) { 120 | return FormatArray(&data[0], (int)data.size(), format, numCols); 121 | } 122 | template 123 | std::string FormatArrayOp(const std::vector& data, Op op, int numCols) { 124 | return FormatArrayOp(&data[0], (int)data.size(), op, numCols); 125 | } 126 | 127 | template 128 | void PrintArray(const T* data, size_t count, const char* format, int numCols) { 129 | std::string s = FormatArray(data, count, format, numCols); 130 | printf("%s", s.c_str()); 131 | } 132 | 133 | template 134 | void PrintArray(const std::vector& data, const char* format, int numCols) { 135 | std::string s = FormatArray(data, format, numCols); 136 | printf("%s", s.c_str()); 137 | } 138 | 139 | template 140 | void PrintArrayOp(const std::vector& data, Op op, int numCols) { 141 | std::string s = FormatArrayOp(data, op, numCols); 142 | printf("%s", s.c_str()); 143 | } 144 | 145 | 146 | 147 | 148 | } // namespace mgpu 149 | -------------------------------------------------------------------------------- /gpu-scanline/src/modern_gpu/include/device/deviceutil.cuh: -------------------------------------------------------------------------------- 1 | 
/****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "../device/intrinsics.cuh" 38 | 39 | namespace mgpu { 40 | 41 | // Get the difference between two pointers in bytes, computed as b - a (positive when b follows a). 42 | MGPU_HOST_DEVICE ptrdiff_t PtrDiff(const void* a, const void* b) { 43 | return (const byte*)b - (const byte*)a; 44 | } 45 | 46 | // Offset a pointer by i bytes (not by i elements of T); the result keeps the pointer's type and constness. 47 | template 48 | MGPU_HOST_DEVICE const T* PtrOffset(const T* p, ptrdiff_t i) { 49 | return (const T*)((const byte*)p + i); 50 | } 51 | template 52 | MGPU_HOST_DEVICE T* PtrOffset(T* p, ptrdiff_t i) { 53 | return (T*)((byte*)p + i); 54 | } 55 | 56 | //////////////////////////////////////////////////////////////////////////////// 57 | // Task range support 58 | // Evenly distributes variable-length arrays over a fixed number of CTAs. 
59 | // Returns (quotient, remainder) of numItems / numWorkers; numWorkers must be non-zero. 60 | MGPU_HOST int2 DivideTaskRange(int numItems, int numWorkers) { 61 | div_t d = div(numItems, numWorkers); 62 | return make_int2(d.quot, d.rem); 63 | } 64 | // Item range [x, y) owned by `block`: the first task.y blocks receive task.x + 1 items each, the remaining blocks task.x. 65 | MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task) { 66 | int2 range; 67 | range.x = task.x * block; 68 | range.x += min(block, task.y); 69 | range.y = range.x + task.x + (block < task.y); 70 | return range; 71 | } 72 | // Same range scaled by blockSize, with the end clamped to count. 73 | MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task, int blockSize, 74 | int count) { 75 | int2 range = ComputeTaskRange(block, task); 76 | range.x *= blockSize; 77 | range.y = min(count, range.y * blockSize); 78 | return range; 79 | } 80 | 81 | //////////////////////////////////////////////////////////////////////////////// 82 | // DeviceExtractHeadFlags 83 | // Input array flags is a bit array with 32 head flags per word. 84 | // DeviceExtractHeadFlags returns numBits flags starting at bit index. 85 | // numBits may be 0 through 32; the next word is read only when the requested bits straddle a word boundary. 86 | MGPU_HOST_DEVICE uint DeviceExtractHeadFlags(const uint* flags, int index, 87 | int numBits) { 88 | 89 | int index2 = index>> 5; 90 | int shift = 31 & index; 91 | uint headFlags = flags[index2]>> shift; 92 | int shifted = 32 - shift; 93 | 94 | if(shifted < numBits) 95 | // We also need to shift in the next set of bits. 96 | headFlags = bfi(flags[index2 + 1], headFlags, shifted, shift); 97 | if(numBits < 32) headFlags &= (1u<< numBits) - 1; // unsigned mask; a 32-bit request keeps every bit (shifting by 32 is undefined) 98 | return headFlags; 99 | } 100 | 101 | //////////////////////////////////////////////////////////////////////////////// 102 | // DevicePackHeadFlags 103 | // Pack VT bits per thread at 32 bits/thread. Will consume an integer number of 104 | // words, because CTA size is a multiple of 32. The first NT * VT / 32 threads 105 | // return packed words. 106 | 107 | template 108 | MGPU_DEVICE uint DevicePackHeadFlags(uint threadBits, int tid, 109 | uint* flags_shared) { 110 | 111 | const int WordCount = NT * VT / 32; 112 | 113 | // Each thread stores its thread bits to flags_shared[tid]. 
114 | flags_shared[tid] = threadBits; 115 | __syncthreads(); 116 | // Thread tid assembles output word tid, i.e. bits [32 * tid, 32 * tid + 32) of the concatenated VT-bit fields; all other threads return 0. 117 | uint packed = 0; 118 | if(tid < WordCount) { 119 | const int Items = MGPU_DIV_UP(32, VT); 120 | int index = 32 * tid; 121 | int first = index / VT; 122 | int bit = 0; 123 | 124 | int rem = index - VT * first; 125 | packed = flags_shared[first]>> rem; 126 | bit = VT - rem; 127 | ++first; 128 | // Append the following threads' fields until the 32-bit word is full. 129 | #pragma unroll 130 | for(int i = 0; i < Items; ++i) { 131 | if(i < Items - 1 || bit < 32) { 132 | uint x = flags_shared[first + i]; 133 | if(bit < 32) packed |= x<< bit; 134 | bit += VT; 135 | } 136 | } 137 | } 138 | __syncthreads(); 139 | 140 | return packed; 141 | } 142 | 143 | } // namespace mgpu 144 | -------------------------------------------------------------------------------- /gpu-scanline/src/modern_gpu/include/device/ctasegscan.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "../device/ctascan.cuh" 38 | 39 | namespace mgpu { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // DeviceFindSegScanDelta 43 | // Runs an inclusive max-index scan over binary inputs. 
44 | // Returns tid minus the index of the nearest thread at or to the left of tid whose flag is set, or tid itself when there is none. Writes delta_shared[0, 2 * NumWarps). Relies on clz(0) yielding 32 so that "no flag" maps to -1 - presumably true of mgpu's clz; confirm. 45 | template 46 | MGPU_DEVICE int DeviceFindSegScanDelta(int tid, bool flag, int* delta_shared) { 47 | const int NumWarps = NT / 32; 48 | 49 | int warp = tid / 32; 50 | int lane = 31 & tid; 51 | uint warpMask = 0xffffffff>> (31 - lane); // inclusive search 52 | uint ctaMask = 0x7fffffff>> (31 - lane); // exclusive search 53 | // NOTE(review): __ballot is the legacy mask-less warp vote; CUDA's guidance for Volta and newer is __ballot_sync with an explicit mask. The second call below runs with only the first NumWarps lanes active, so its mask would have to cover just those lanes. Confirm which architectures this must build for. 54 | uint warpBits = __ballot(flag); 55 | delta_shared[warp] = warpBits; 56 | __syncthreads(); 57 | // The first NumWarps threads each resolve, for the warp matching their tid, where the nearest flag in any EARLIER warp sits (0 if there is none). 58 | if(tid < NumWarps) { 59 | uint ctaBits = __ballot(0 != delta_shared[tid]); 60 | int warpSegment = 31 - clz(ctaMask & ctaBits); 61 | int start = (-1 != warpSegment) ? 62 | (31 - clz(delta_shared[warpSegment]) + 32 * warpSegment) : 0; 63 | delta_shared[NumWarps + tid] = start; 64 | } 65 | __syncthreads(); 66 | 67 | // Find the closest flag to the left of this thread within the warp. 68 | // Include the flag for this thread. 69 | int start = 31 - clz(warpMask & warpBits); 70 | if(-1 != start) start += ~31 & tid; 71 | else start = delta_shared[NumWarps + warp]; 72 | __syncthreads(); 73 | 74 | return tid - start; 75 | } 76 | 77 | //////////////////////////////////////////////////////////////////////////////// 78 | // CTASegScan 79 | // Storage::delta is declared with NumWarps entries, yet DeviceFindSegScanDelta writes 2 * NumWarps ints through it; the room comes from the union with values[] (2 * NT elements). 80 | template > 81 | struct CTASegScan { 82 | typedef _Op Op; 83 | typedef typename Op::result_type T; 84 | enum { NumWarps = NT / 32, Size = NT, Capacity = 2 * NT }; 85 | union Storage { 86 | int delta[NumWarps]; 87 | T values[Capacity]; 88 | }; 89 | 90 | // Each thread passes the reduction of the LAST SEGMENT that it covers. 91 | // flag is set to true if there's at least one segment flag in the thread. 92 | // SegScan returns the reduction of values for the first segment in this 93 | // thread over the preceding threads. 94 | // Return the value init for the first thread. 95 | 96 | // When scanning single elements per thread, interpret the flag as a BEGIN 97 | // FLAG. If tid's flag is set, its value belongs to thread tid + 1, not 98 | // thread tid. 
99 | 100 | // The function returns the reduction of the last segment in the CTA. 101 | // tidDelta bounds how far back partial results are pulled: a step of size offset is applied only while tidDelta >= offset. 102 | MGPU_DEVICE static T SegScanDelta(int tid, int tidDelta, T x, 103 | Storage& storage, T* carryOut, T identity = (T)0, Op op = Op()) { 104 | // The loop ping-pongs between the two halves of storage.values (offsets 0 and NT), with one __syncthreads() per step. 105 | // Run an inclusive scan 106 | int first = 0; 107 | storage.values[first + tid] = x; 108 | __syncthreads(); 109 | 110 | #pragma unroll 111 | for(int offset = 1; offset < NT; offset += offset) { 112 | if(tidDelta >= offset) 113 | x = op(storage.values[first + tid - offset], x); 114 | first = NT - first; 115 | storage.values[first + tid] = x; 116 | __syncthreads(); 117 | } 118 | 119 | // Get the exclusive scan. 120 | x = tid ? storage.values[first + tid - 1] : identity; 121 | *carryOut = storage.values[first + NT - 1]; 122 | __syncthreads(); 123 | return x; 124 | } 125 | // Convenience wrapper: derives tidDelta from the per-thread flag, then runs SegScanDelta. 126 | MGPU_DEVICE static T SegScan(int tid, T x, bool flag, Storage& storage, 127 | T* carryOut, T identity = (T)0, Op op = Op()) { 128 | 129 | // Find the left-most thread that covers the first segment of this 130 | // thread. 131 | int tidDelta = DeviceFindSegScanDelta(tid, flag, storage.delta); 132 | // storage.delta and storage.values share memory; the trailing __syncthreads() in DeviceFindSegScanDelta separates the two uses. 133 | return SegScanDelta(tid, tidDelta, x, storage, carryOut, identity, op); 134 | } 135 | }; 136 | 137 | } // namespace mgpu 138 | -------------------------------------------------------------------------------- /gpu-scanline/src/modern_gpu/include/device/ctaloadbalance.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "../device/ctasearch.cuh" 38 | #include "../device/loadstore.cuh" 39 | 40 | namespace mgpu { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // DeviceSerialLoadBalanceSearch 44 | // Upper Bound search from A (needles) into B (haystack). 
The A values are 45 | // natural numbers from aBegin to aEnd. bFirst is the index of the B value at 46 | // bBegin in shared memory. 47 | // With RangeCheck the search stops consuming A at aEnd and treats an exhausted B (bBegin >= bEnd) as larger than every A value; without it neither bound is tested. 48 | template 49 | MGPU_DEVICE void DeviceSerialLoadBalanceSearch(const int* b_shared, int aBegin, 50 | int aEnd, int bFirst, int bBegin, int bEnd, int* a_shared) { 51 | // Runs exactly VT steps; each step either records bFirst + bBegin for the current A index or moves on to the next B key. 52 | int bKey = b_shared[bBegin]; 53 | 54 | #pragma unroll 55 | for(int i = 0; i < VT; ++i) { 56 | bool p; 57 | if(RangeCheck) 58 | p = (aBegin < aEnd) && ((bBegin >= bEnd) || (aBegin < bKey)); 59 | else 60 | p = aBegin < bKey; 61 | 62 | if(p) 63 | // Advance A (the needle). 64 | a_shared[aBegin++] = bFirst + bBegin; 65 | else 66 | // Advance B (the haystack). 67 | bKey = b_shared[++bBegin]; 68 | } 69 | } 70 | 71 | //////////////////////////////////////////////////////////////////////////////// 72 | // CTALoadBalance 73 | // Computes upper_bound(counting_iterator(first), b_global) - 1. 74 | 75 | // Unlike most other CTA* functions, CTALoadBalance loads from global memory. 76 | // This returns the loaded B elements at the beginning or end of shared memory 77 | // depending on the aFirst argument. 78 | // NOTE(review): this signature has no aFirst parameter, so the sentence above looks stale - confirm. 79 | // CTALoadBalance requires NT * VT + 2 slots of shared memory. 80 | template 81 | MGPU_DEVICE int4 CTALoadBalance(int destCount, InputIt b_global, 82 | int sourceCount, int block, int tid, const int* mp_global, 83 | int* indices_shared, bool loadPrecedingB) { 84 | // range holds (a0, a1, b0, b1); presumably this block's share of destination indices [a0, a1) and source entries [b0, b1) - confirm against ComputeMergeRange. 85 | int4 range = ComputeMergeRange(destCount, sourceCount, block, 0, NT * VT, 86 | mp_global); 87 | 88 | int a0 = range.x; 89 | int a1 = range.y; 90 | int b0 = range.z; 91 | int b1 = range.w; 92 | if(!b0) loadPrecedingB = false; 93 | // There is no element before b_global[0], so the request is dropped when b0 is 0. 94 | // Load one trailing term from B. If we're already at the end, fill the 95 | // end of the buffer with destCount. 
96 | int aCount = a1 - a0; 97 | int bCount = b1 - b0; 98 | int extended = b1 < sourceCount; 99 | int loadCount = bCount + extended; 100 | int fillCount = NT * VT + 1 - loadCount - aCount; 101 | // Layout of indices_shared: [A results (aCount)] [optional preceding B element] [B values] [fill]. 102 | int* a_shared = indices_shared; 103 | int* b_shared = indices_shared + aCount + (int)loadPrecedingB; 104 | 105 | // Load the B values. 106 | // DeviceMemToMemLoop(bCount + extended + (int)loadPrecedingB, 107 | // b_global + b0 - (int)loadPrecedingB, tid, 108 | // b_shared - (int)loadPrecedingB); 109 | // Starting at tid - loadPrecedingB shifts every thread's first index down by one, so b_global[b0 - 1] lands in b_shared[-1] when requested. 110 | for(int i = tid - (int)loadPrecedingB; i < bCount + extended; i += NT) 111 | b_shared[i] = b_global[b0 + i]; 112 | 113 | // Fill the end of the array with destCount. 114 | for(int i = tid + extended; i < fillCount; i += NT) 115 | b_shared[bCount + i] = destCount; 116 | __syncthreads(); 117 | 118 | // Run a merge path to find the start of the serial merge for each thread. 119 | int diag = VT * tid; 120 | int mp = MergePath(mgpu::counting_iterator(a0), 121 | aCount, b_shared, bCount, diag, mgpu::less()); 122 | 123 | int a0tid = a0 + mp; 124 | int b0tid = diag - mp; 125 | // a_shared is passed biased by -a0 because the search indexes it with absolute A indices. 126 | // Subtract 1 from b0 because we want to return upper_bound - 1. 127 | DeviceSerialLoadBalanceSearch(b_shared, a0tid, a1, b0 - 1, 128 | b0tid, bCount, a_shared - a0); 129 | __syncthreads(); 130 | // Report the B range including the extra preceding element, if one was loaded. 131 | b0 -= (int)loadPrecedingB; 132 | return make_int4(a0, a1, b0, b1); 133 | } 134 | 135 | 136 | } // namespace mgpu 137 | -------------------------------------------------------------------------------- /gpu-scanline/src/modern_gpu/include/kernels/reduce.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
32 | * 33 | ******************************************************************************/

#pragma once

#include "../mgpuhost.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// KernelReduce
// Reduces one tile of NV = NT * VT elements per CTA with the binary operator
// op and stores the tile's result in reduction_global[blockIdx.x].
//
//   Tuning            Launch box; NT and VT are read from it through
//                     MGPU_LAUNCH_PARAMS.
//   data_global       Input sequence, count elements.
//   identity          Value substituted for slots past the end of the input.
//   reduction_global  Output, one element per CTA.
//
// Launch with NT threads per CTA and one CTA per tile of NV inputs.

template<typename Tuning, typename InputIt, typename T, typename Op>
MGPU_LAUNCH_BOUNDS void KernelReduce(InputIt data_global, int count,
	T identity, Op op, T* reduction_global) {

	typedef MGPU_LAUNCH_PARAMS Params;
	const int NT = Params::NT;
	const int VT = Params::VT;
	const int NV = NT * VT;
	typedef CTAReduce<NT, Op> R;

	union Shared {
		typename R::Storage reduceStorage;
	};
	__shared__ Shared shared;

	int tid = threadIdx.x;
	int block = blockIdx.x;
	int gid = NV * block;
	int count2 = min(NV, count - gid);

	// Load a full tile into register in strided order. Set out-of-range values
	// with identity.
	T data[VT];
	DeviceGlobalToRegDefault<NT, VT>(count2, data_global + gid, tid, data,
		identity);

	// Sum elements within each thread. The first iteration only copies
	// data[0], so x is never read before it has been written.
	T x;
	#pragma unroll
	for(int i = 0; i < VT; ++i)
		x = i ? op(x, data[i]) : data[i];

	// Sum thread-totals over the CTA.
	x = R::Reduce(tid, x, shared.reduceStorage, op);

	// Store the tile's reduction to global memory.
	if(!tid)
		reduction_global[block] = x;
}

////////////////////////////////////////////////////////////////////////////////
// Reduce
// Host entry point. Reduces count elements of data_global with op.
//
//   identity       Passed to the kernel as the value for out-of-range slots.
//   reduce_global  Optional device destination for the single result. When
//                  null, a one-element device buffer is allocated from
//                  context for the duration of the call.
//   reduce_host    Optional host destination. When non-null the result is
//                  copied back with copyDtoH.
//
// Inputs of up to 256, 768, or 512 * (sizeof(T) > 4 ? 4 : 8) elements are
// handled by a single CTA. Larger inputs are reduced tile by tile into a
// temporary buffer, which is then reduced by a recursive call.

template<typename InputIt, typename T, typename Op>
MGPU_HOST void Reduce(InputIt data_global, int count, T identity, Op op,
	T* reduce_global, T* reduce_host, CudaContext& context) {

	MGPU_MEM(T) totalDevice;
	if(!reduce_global) {
		totalDevice = context.Malloc<T>(1);
		reduce_global = totalDevice->get();
	}

	if(count <= 256) {
		typedef LaunchBoxVT<256, 1> Tuning;
		KernelReduce<Tuning><<<1, 256, 0, context.Stream()>>>(
			data_global, count, identity, op, reduce_global);
		MGPU_SYNC_CHECK("KernelReduce");

	} else if(count <= 768) {
		typedef LaunchBoxVT<256, 3> Tuning;
		KernelReduce<Tuning><<<1, 256, 0, context.Stream()>>>(
			data_global, count, identity, op, reduce_global);
		MGPU_SYNC_CHECK("KernelReduce");

	} else if(count <= 512 * ((sizeof(T) > 4) ? 4 : 8)) {
		typedef LaunchBoxVT<512, (sizeof(T) > 4) ? 4 : 8> Tuning;
		KernelReduce<Tuning><<<1, 512, 0, context.Stream()>>>(
			data_global, count, identity, op, reduce_global);
		MGPU_SYNC_CHECK("KernelReduce");

	} else {
		// Launch a grid and reduce tiles to temporary storage.
		typedef LaunchBoxVT<256, (sizeof(T) > 4) ? 8 : 16> Tuning;
		int2 launch = Tuning::GetLaunchParams(context);
		int NV = launch.x * launch.y;
		int numBlocks = MGPU_DIV_UP(count, NV);

		MGPU_MEM(T) reduceDevice = context.Malloc<T>(numBlocks);
		KernelReduce<Tuning><<<numBlocks, launch.x, 0, context.Stream()>>>(
			data_global, count, identity, op, reduceDevice->get());
		MGPU_SYNC_CHECK("KernelReduce");

		// Reduce the per-tile results. The host copy is skipped here and
		// done once at the end of the outermost call.
		Reduce(reduceDevice->get(), numBlocks, identity, op, reduce_global,
			(T*)0, context);
	}

	if(reduce_host)
		copyDtoH(reduce_host, reduce_global, 1);
}

// Convenience overload: sums count elements with mgpu::plus, using (T)0 as
// the identity, and returns the total by value on the host.
template<typename InputIt>
MGPU_HOST typename std::iterator_traits<InputIt>::value_type
Reduce(InputIt data_global, int count, CudaContext& context) {
	typedef typename std::iterator_traits<InputIt>::value_type T;
	T result;
	Reduce(data_global, count, (T)0, mgpu::plus<T>(), (T*)0, &result, context);
	return result;
}

} // namespace mgpu

-------------------------------------------------------------------------------- /gpu-scanline/src/modern_gpu/include/kernels/bulkremove.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "../mgpuhost.cuh" 38 | #include "../kernels/search.cuh" 39 | 40 | namespace mgpu { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // KernelBulkRemove 44 | // Copy the values that are not matched by an index. This is like the 45 | // anti-gather. 
//
//   Tuning          Launch box; NT and VT are read from it through
//                   MGPU_LAUNCH_PARAMS. A tile covers NV = NT * VT elements.
//   source_global   Input sequence, sourceCount elements.
//   indices_global  Indices of the elements to drop. The kernel subtracts the
//                   tile offset from each index it reads in
//                   [p_global[block], p_global[block + 1]) and uses the result
//                   as a position inside the tile.
//   indicesCount    Not read by the kernel.
//   p_global        Per-tile partition of indices_global, gridDim.x + 1
//                   entries are read in total.
//   dest_global     Output; the kept elements of this tile are written
//                   starting at dest_global[gid - p_global[block]].
//
// Launch with NT threads per CTA and one CTA per tile. The kernel contains
// __syncthreads() barriers and no early exit.

template<typename Tuning, typename InputIt, typename IndicesIt,
	typename OutputIt>
MGPU_LAUNCH_BOUNDS void KernelBulkRemove(InputIt source_global, int sourceCount,
	IndicesIt indices_global, int indicesCount, const int* p_global,
	OutputIt dest_global) {

	typedef MGPU_LAUNCH_PARAMS Params;
	typedef typename std::iterator_traits<InputIt>::value_type T;
	const int NT = Params::NT;
	const int VT = Params::VT;
	const int NV = NT * VT;

	typedef CTAScan<NT, mgpu::plus<int> > S;
	union Shared {
		int indices[NV];
		typename S::Storage scan;
	};
	__shared__ Shared shared;

	int tid = threadIdx.x;
	int block = blockIdx.x;
	int gid = block * NV;
	// From here on sourceCount is the number of valid elements in this tile.
	sourceCount = min(NV, sourceCount - gid);

	// Search for begin and end iterators of interval to load.
	int p0 = p_global[block];
	int p1 = p_global[block + 1];

	// Set the flags to 1. The default is to copy a value.
	#pragma unroll
	for(int i = 0; i < VT; ++i) {
		int index = NT * i + tid;
		shared.indices[index] = index < sourceCount;
	}
	__syncthreads();

	// Load the indices into register.
	int begin = p0;
	int indexCount = p1 - begin;
	int indices[VT];
	DeviceGlobalToReg<NT, VT>(indexCount, indices_global + begin, tid, indices);

	// Set the counter to 0 for each index we've loaded.
	#pragma unroll
	for(int i = 0; i < VT; ++i)
		if(NT * i + tid < indexCount)
			shared.indices[indices[i] - gid] = 0;
	__syncthreads();

	// Run a raking scan over the flags. We count the set flags - this is the
	// number of elements to load in per thread.
	int x = 0;
	#pragma unroll
	for(int i = 0; i < VT; ++i)
		x += indices[i] = shared.indices[VT * tid + i];
	__syncthreads();

	// Run a CTA scan and scatter the gather indices to shared memory.
	int scan = S::Scan(tid, x, shared.scan);
	#pragma unroll
	for(int i = 0; i < VT; ++i)
		if(indices[i]) shared.indices[scan++] = VT * tid + i;
	__syncthreads();

	// Load the gather indices into register.
	DeviceSharedToReg<NT, VT>(shared.indices, tid, indices);

	// Gather the data into register. The number of values to copy is
	// sourceCount - indexCount.
	source_global += gid;
	int count = sourceCount - indexCount;
	T values[VT];
	DeviceGather<NT, VT>(count, source_global, indices, tid, values, false);

	// Store all the valid registers to dest_global.
	DeviceRegToGlobal<NT, VT>(count, values, tid, dest_global + gid - begin);
}

////////////////////////////////////////////////////////////////////////////////
// BulkRemove
// Host entry point. Copies source_global to dest_global, leaving out the
// elements whose positions are listed in indices_global.
//
// indices_global is partitioned per tile with a lower-bound
// BinarySearchPartitions using mgpu::less<int>, then one CTA is launched per
// tile of NV source elements on context.Stream().

template<typename InputIt, typename IndicesIt, typename OutputIt>
MGPU_HOST void BulkRemove(InputIt source_global, int sourceCount,
	IndicesIt indices_global, int indicesCount, OutputIt dest_global,
	CudaContext& context) {

	const int NT = 128;
	const int VT = 11;
	typedef LaunchBoxVT<NT, VT> Tuning;
	int2 launch = Tuning::GetLaunchParams(context);
	const int NV = launch.x * launch.y;

	MGPU_MEM(int) partitionsDevice = BinarySearchPartitions<MgpuBoundsLower>(
		sourceCount, indices_global, indicesCount, NV, mgpu::less<int>(),
		context);

	int numBlocks = MGPU_DIV_UP(sourceCount, NV);
	KernelBulkRemove<Tuning><<<numBlocks, launch.x, 0, context.Stream()>>>(
		source_global, sourceCount, indices_global, indicesCount,
		partitionsDevice->get(), dest_global);
	MGPU_SYNC_CHECK("KernelBulkRemove");
}

} // namespace mgpu

-------------------------------------------------------------------------------- /gpu-scanline/src/rasterizer/shared/ras_qm_mask.cu: --------------------------------------------------------------------------------

#include "ras_qm_mask.h"

#include <algorithm>

namespace Mochimazui {

namespace QM_Mask_Sample_Position {

// Sample offsets used by mpvg_sample_position(). Every value lies inside
// (-0.5, 0.5); the function shifts them by +0.5 and flips y.
float mpvg_8_x[8] =
{
	-0.266471f, 0.353688f, -0.401679f, 0.488846f,
	0.122459f, -0.0344567f, -0.139007f, 0.207413f,
};

float mpvg_8_y[8] =
{
	0.164718f, 0.0396624f, -0.215021f, 0.429684f,
	0.282964f, -0.0841444f, -0.475235f, -0.328058f,
};

float mpvg_32_x[32] = {
	0.18936400f, 0.31758200f, 0.00903428f, -0.21124700f,
	-0.36328000f, 0.33291100f, -0.49970400f, -0.43663000f,
	-0.26837500f, 0.37728700f, -0.18975300f, -0.48250600f,
	-0.13179200f, 0.49235500f, 0.42711400f, 0.37090100f,
	-0.31862800f, 0.02879450f, 0.04699840f, -0.16154700f,
	0.18745700f, -0.35758100f, 0.19744000f, 0.21454900f,
	-0.06650600f, 0.12811500f, 0.33646100f, 0.09921190f,
	-0.05305180f, -0.39892000f, -0.06095580f, -0.25435800f,
};

float mpvg_32_y[32] = {
	-0.34008000f, 0.40063000f, -0.37434000f, -0.08741820f,
	-0.43687600f, -0.04052820f, -0.09869600f, 0.12511500f,
	0.40422500f, 0.11086400f, -0.43411500f, -0.30842600f,
	-0.26574100f, 0.47500100f, 0.26635400f, -0.24554700f,
	-0.24432000f, 0.27314400f, -0.19358200f, 0.27910400f,
	0.13190500f, -0.03301080f, 0.29668700f, -0.15357100f,
	-0.05443540f, 0.47731600f, -0.43140000f, 0.00607759f,
	0.44544500f, 0.30706200f, 0.12207700f, 0.13223200f,
};

// Sorts the samples in place by ascending y, breaking ties by ascending x.
void sort_samples(std::vector<float2> &samples) {
	std::sort(samples.begin(), samples.end(), [](const float2 &a, const float2 &b) {
		return (a.y < b.y) || (a.y == b.y && a.x < b.x);
	});
}

// Returns the 8- or 32-sample pattern built from the mpvg_* tables, sorted
// with sort_samples(). Each sample is (table_x + 0.5, 1 - (table_y + 0.5)).
// Throws std::runtime_error for any other sample count.
std::vector<float2> mpvg_sample_position(int n_samples) {
	const float *table_x = nullptr;
	const float *table_y = nullptr;
	if (n_samples == 8) {
		table_x = mpvg_8_x;
		table_y = mpvg_8_y;
	}
	else if (n_samples == 32) {
		table_x = mpvg_32_x;
		table_y = mpvg_32_y;
	}
	else {
		// Reject the count before sizing the vector, so that a negative value
		// reports this error instead of failing inside resize().
		throw std::runtime_error("mpvg_sample_position: unsupported sample number.");
	}

	std::vector<float2> samples;
	samples.resize(n_samples);
	for (int i = 0; i < n_samples; ++i) {
		samples[i].x = table_x[i] + 0.5f;
		samples[i].y = 1.0f - (table_y[i] + 0.5f);
	}
	sort_samples(samples);
	return samples;
}

// Queries the sample positions OpenGL reports for a 1x1 multisample texture
// with n_samples samples and returns them sorted with sort_samples().
// Throws std::runtime_error when GL_SAMPLES of the temporary framebuffer
// differs from n_samples.
// Makes GL calls, so a current OpenGL context is presumably required - the
// GLPP wrappers are not visible here, TODO confirm.
std::vector<float2> gl_sample_position(int n_samples) {
	std::vector<float2> samples;

	GLPP::NamedTexture ttex;
	ttex.target(GLPP::Texture2DMultisample).create()
		.storage2DMultisample(n_samples, GL_RGBA8, 1, 1, GL_TRUE);

	GLPP::NamedFramebuffer tfbo;
	tfbo.create().texture2D(GL_COLOR_ATTACHMENT0, ttex, 0);

	tfbo.bind(GL_FRAMEBUFFER);

	int gl_n_samples;
	glGetIntegerv(GL_SAMPLES, &gl_n_samples);

	if (n_samples != gl_n_samples) {
		// Release the temporaries exactly as the success path does before
		// reporting the mismatch.
		tfbo.destroy();
		ttex.destroy();
		throw std::runtime_error("initQMMaskTable: incorrect sample number");
	}

	samples.resize(n_samples);
	for (int i = 0; i < n_samples; ++i) {
		// float2 is written through a float pointer: x and y receive the two
		// components of the sample position.
		glGetMultisamplefv(GL_SAMPLE_POSITION, i, (float*)(samples.data() + i));
	}

	// NOTE(review): this binds the temporary framebuffer a second time just
	// before destroying it; restoring the previous binding may have been
	// intended - confirm against the GLPP API.
	tfbo.bind(GL_FRAMEBUFFER);

	tfbo.destroy();
	ttex.destroy();

	sort_samples(samples);
	return samples;
}

// -------- -------- -------- -------- -------- -------- -------- --------
// -------- -------- -------- -------- -------- -------- -------- --------

// Zero-initialised and not referenced in this file.
float vg_8_x[8] = {};
float vg_8_y[8] = {};

// 32-sample offsets used by vg_sample_position(); vg_32_y is ascending.
double vg_32_x[32] = {
	-0.49479166651144624000, 0.02604168653488159200, -0.24479165673255920000, 0.27604168653488159000,
	-0.36979167163372040000, 0.15104168653488159000, -0.09895834326744079600, 0.38020831346511841000,
	-0.43229166418313980000, 0.06770831346511840800, -0.18229165673255920000, 0.33854168653488159000,
	-0.30729167163372040000, 0.21354168653488159000, -0.05729165673255920400, 0.46354168653488159000,
	-0.46354166790843010000, 0.05729168653488159200, -0.19270834326744080000, 0.30729168653488159000,
	-0.31770832836627960000, 0.16145831346511841000, -0.06770834326744079600, 0.43229168653488159000,
	-0.40104166418313980000, 0.11979168653488159000, -0.13020834326744080000, 0.36979168653488159000,
	-0.27604167163372040000, 0.24479168653488159000, -0.02604165673255920400, 0.47395831346511841000,
};
double vg_32_y[32] = {
	-0.48437500000000000000, -0.45312500000000000000, -0.42187500000000000000, -0.39062500000000000000,
	-0.35937500000000000000, -0.32812500000000000000, -0.29687500000000000000, -0.26562500000000000000,
	-0.23437500000000000000, -0.20312500000000000000, -0.17187500000000000000, -0.14062500000000000000,
	-0.10937500000000000000, -0.07812500000000000000, -0.04687500000000000000, -0.01562500000000000000,
	0.01562500000000000000, 0.04687500000000000000, 0.07812500000000000000, 0.10937500000000000000,
	0.14062500000000000000, 0.17187500000000000000, 0.20312500000000000000, 0.23437500000000000000,
	0.26562500000000000000, 0.29687500000000000000, 0.32812500000000000000, 0.35937500000000000000,
	0.39062500000000000000, 0.42187500000000000000, 0.45312500000000000000, 0.48437500000000000000,
};

// Returns the 8- or 32-sample pattern.
//   32 samples: (vg_32_x[i] + 0.5, vg_32_y[i] + 0.5) in table order.
//   8 samples:  sample i sits in row i of an 8x8 grid; its column is the
//               bit-reversal of i (over the bit length of i) rescaled to
//               0..7. Cell centres are used and the result is passed through
//               sort_samples().
// Throws std::runtime_error for any other sample count. Only the requested
// table is built.
std::vector<float2> vg_sample_position(int i_n_samples) {

	if (i_n_samples == 32) {
		std::vector<float2> fsample_32;
		fsample_32.resize(32);
		for (int i = 0; i < 32; ++i) {
			fsample_32[i] = make_float2((float)(vg_32_x[i] + 0.5), (float)(vg_32_y[i] + 0.5));
		}
		return fsample_32;
	}

	if (i_n_samples != 8) {
		throw std::runtime_error("vg_sample_position: only support 8x & 32x samples.");
	}

	std::vector<float2> fsample_8;
	fsample_8.resize(8);

	const double gap = 1.0 / 8;
	const double hgap = gap / 2;

	for (int i = 0; i < 8; ++i) {

		// Reverse the bits of i; bc counts how many bits were consumed.
		int bc = 0;
		int x = i;
		int y = 0;

		while (x) {
			y <<= 1;
			y |= x & 1;
			x >>= 1;
			++bc;
		}

		// y / 2^bc is the radical inverse of i; scale it to a column 0..7.
		int scale = 1 << bc;
		double f = y / (float)scale;
		int column = (int)(f * 8);

		auto &sp = fsample_8[i];
		sp.x = (float)(column * gap + hgap);
		sp.y = (float)(i * gap + hgap);
	}

	sort_samples(fsample_8);
	return fsample_8;
}

} // end of namespace QM_Mask_Sample_Position

} // end of namespace Mochimazui

-------------------------------------------------------------------------------- /gpu-scanline/src/modern_gpu/include/util/static.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | 51 | #ifndef MGPU_MIN 52 | #define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y)) 53 | #define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y)) 54 | #define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0) 55 | #define MGPU_ABS(x) (((x) >= 0) ? 
(x) : (-x)) 56 | 57 | #define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y)) 58 | #define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y)) 59 | #define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y)) 60 | #define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> y) 61 | #define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1)) 62 | #define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1)) 63 | #define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1))) 64 | 65 | #endif // MGPU_MIN 66 | 67 | namespace mgpu { 68 | 69 | 70 | typedef unsigned char byte; 71 | 72 | typedef unsigned int uint; 73 | typedef signed short int16; 74 | 75 | typedef unsigned short ushort; 76 | typedef unsigned short uint16; 77 | 78 | typedef long long int64; 79 | typedef unsigned long long uint64; 80 | 81 | // IsPow2::value is true if X is a power of 2. 82 | template struct sIsPow2 { 83 | enum { value = 0 == (X & (X - 1)) }; 84 | }; 85 | 86 | // Finds the base-2 logarithm of X. value is -1 if X is not a power of 2. 87 | template struct sLogPow2 { 88 | enum { extra = sIsPow2::value ? 0 : (roundUp ? 1 : 0) }; 89 | enum { inner = sLogPow2::inner + 1 }; 90 | enum { value = inner + extra }; 91 | }; 92 | template struct sLogPow2<0, roundUp> { 93 | enum { inner = 0 }; 94 | enum { value = 0 }; 95 | }; 96 | template struct sLogPow2<1, roundUp> { 97 | enum { inner = 0 }; 98 | enum { value = 0 }; 99 | }; 100 | 101 | template 102 | struct sDivUp { 103 | enum { value = (X + Y - 1) / Y }; 104 | }; 105 | 106 | template struct sDiv2RoundUp { 107 | enum { value = sDiv2RoundUp::value, levels - 1>::value }; 108 | }; 109 | template struct sDiv2RoundUp { 110 | enum { value = count }; 111 | }; 112 | 113 | template 114 | struct sDivSafe { 115 | enum { value = X / Y }; 116 | }; 117 | template 118 | struct sDivSafe { 119 | enum { value = 0 }; 120 | }; 121 | 122 | template 123 | struct sRoundUp { 124 | enum { rem = X % Y }; 125 | enum { value = X + (rem ? 
(Y - rem) : 0) }; 126 | }; 127 | 128 | template 129 | struct sRoundDown { 130 | enum { rem = X % Y }; 131 | enum { value = X - rem }; 132 | }; 133 | 134 | // IntegerDiv is a template for avoiding divisions by zero in template 135 | // evaluation. Templates always evaluate both b and c in an expression like 136 | // a ? b : c, and will error if either rhs contains an illegal expression, 137 | // even if the ternary is explictly designed to guard against that. 138 | template 139 | struct sIntegerDiv { 140 | enum { value = X / (Y ? Y : (X + 1)) }; 141 | }; 142 | 143 | template 144 | struct sMax { 145 | enum { value = (X >= Y) ? X : Y }; 146 | }; 147 | template 148 | struct sMin { 149 | enum { value = (X <= Y) ? X : Y }; 150 | }; 151 | 152 | template 153 | struct sAbs { 154 | enum { value = (X >= 0) ? X : -X }; 155 | }; 156 | 157 | 158 | // Finds the number of powers of 2 in the prime factorization of X. 159 | template struct sNumFactorsOf2 { 160 | enum { shifted = X >> 1 }; 161 | enum { value = 1 + sNumFactorsOf2::value }; 162 | }; 163 | template struct sNumFactorsOf2 { 164 | enum { value = 0 }; 165 | }; 166 | 167 | // Returns the divisor for a conflict-free transpose. 168 | template struct sBankConflictDivisor { 169 | enum { value = 170 | (1 & X) ? 0 : 171 | (sIsPow2::value ? 
NumBanks : 172 | (1<< sNumFactorsOf2::value)) }; 173 | enum { log_value = sLogPow2::value }; 174 | }; 175 | 176 | template struct sConflictFreeStorage { 177 | enum { count = NT * X }; 178 | enum { divisor = sBankConflictDivisor::value }; 179 | enum { padding = sDivSafe::value }; 180 | enum { value = count + padding }; 181 | }; 182 | 183 | } // namespace mgpu 184 | -------------------------------------------------------------------------------- /gpu-scanline/src/mochimazui/cuda_array.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #ifndef _MOCHIMAZUI_CURA_ARRAY_H_ 5 | #define _MOCHIMAZUI_CUDA_ARRAY_H_ 6 | 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | #ifdef _DEBUG 18 | #define CTL_ASSERT(x) assert((x)==CUDA_SUCCESS) 19 | #else 20 | #define CTL_ASSERT(x) x 21 | #endif 22 | 23 | namespace CUDATL { 24 | 25 | /* 26 | enum ManagedCUDAArraySyncDirection { 27 | HostToDevice, 28 | DeviceToHost 29 | }; 30 | */ 31 | 32 | template 33 | struct cuda_array_allocator { 34 | static cudaError_t malloc(T**p, size_t s) { 35 | return cudaMalloc(p, s); 36 | } 37 | static cudaError_t free(T*p) { 38 | return cudaFree(p); 39 | } 40 | }; 41 | 42 | template 43 | struct cuda_array_managed_allocator { 44 | static cudaError malloc(T**p, size_t s) { 45 | return cudaMallocManaged(p, s); 46 | } 47 | static cudaError_t free(T*p) { 48 | return cudaFree(p); 49 | } 50 | }; 51 | 52 | template 53 | struct cuda_array_host_allocator { 54 | static cudaError malloc(T**p, size_t s) { 55 | return cudaMallocHost(p, s); 56 | } 57 | static cudaError_t free(T*p) { 58 | return cudaFreeHost(p); 59 | } 60 | }; 61 | 62 | // -------- -------- -------- -------- -------- -------- -------- -------- 63 | // @class cuda_array 64 | template < class T, class allocator = cuda_array_allocator > 65 | class cuda_array { 66 | 67 | public: 68 | cuda_array() 69 | :_size(0), _reservedSize(0), 
_gpuPointer(nullptr) { 70 | } 71 | 72 | ~cuda_array() { 73 | clear(); 74 | } 75 | 76 | public: 77 | size_t size() { return _size; } 78 | size_t reserved() { return _reservedSize; } 79 | 80 | void malloc(const size_t size) { 81 | if (_gpuPointer) { 82 | cudaFree(_gpuPointer); 83 | } 84 | CTL_ASSERT(allocator::malloc(&_gpuPointer, size * sizeof(T))); 85 | _size = size; 86 | _reservedSize = size; 87 | } 88 | 89 | // 90 | //void resize(const size_t newSize) { 91 | // if (newSize <= _reservedSize) { _size = newSize; return; } 92 | // auto oldGPUPointer = _gpuPointer; 93 | // auto newReserved = std::max(newSize, _reservedSize); 94 | // allocator::malloc(&_gpuPointer, newReserved * sizeof(T)); 95 | // if (!_gpuPointer) { throw std::runtime_error("cuda_array::resize: out of memory"); } 96 | // if (oldGPUPointer) { 97 | // auto oldByteSize = _size * sizeof(T); 98 | // cudaMemcpy(_gpuPointer, oldGPUPointer, oldByteSize, cudaMemcpyDeviceToDevice); 99 | // cudaFree(oldGPUPointer); 100 | // } 101 | // _size = newSize; 102 | // _reservedSize = newReserved; 103 | //} 104 | 105 | void resizeWithoutCopy(size_t newSize) { 106 | if (newSize <= _reservedSize) { _size = newSize; return; } 107 | if (_gpuPointer) { 108 | cudaFree(_gpuPointer); 109 | _gpuPointer = nullptr; 110 | } 111 | _size = newSize; 112 | newSize = (size_t)(newSize*1.5); 113 | allocator::malloc(&_gpuPointer, newSize * sizeof(T)); 114 | if (!_gpuPointer) { throw std::runtime_error("cuda_array::resizWithoutCopy: out of memory"); } 115 | _reservedSize = newSize; 116 | } 117 | 118 | // 119 | void clear() { 120 | if (_gpuPointer) { 121 | allocator::free(_gpuPointer); 122 | } 123 | _gpuPointer = nullptr; 124 | _size = 0; 125 | } 126 | 127 | // cpu -> gpu 128 | void set(const std::vector &v) { 129 | set(v.data(), v.size()); 130 | } 131 | 132 | void set(const T* data, size_t size) { 133 | if (size > _size) { resizeWithoutCopy(size); } 134 | CTL_ASSERT(cudaMemcpy(_gpuPointer, data, size*sizeof(T), cudaMemcpyHostToDevice)); 135 
| } 136 | 137 | void setAsync(const std::vector &v) { 138 | setAsync(v.data(), v.size()); 139 | } 140 | 141 | void setAsync(const T* data, size_t size) { 142 | if (size > _size) { resizeWithoutCopy(size); } 143 | CTL_ASSERT(cudaMemcpyAsync(_gpuPointer, data, size*sizeof(T), cudaMemcpyHostToDevice)); 144 | } 145 | 146 | // gpu -> cpu 147 | void get(std::vector &v) { 148 | v.resize(_size); 149 | get(v.data(), v.size()); 150 | } 151 | 152 | void get(T* data, size_t size) { 153 | if (size > _size) { size = _size; } 154 | CTL_ASSERT(cudaMemcpy(data, _gpuPointer, size*sizeof(T), cudaMemcpyDeviceToHost)); 155 | } 156 | 157 | void getAsync(std::vector &v) { 158 | v.resize(_size); 159 | getAsync(v.data(), v.size()); 160 | } 161 | 162 | void getAsync(T* data, size_t size) { 163 | if (size > _size) { size = _size; } 164 | CTL_ASSERT(cudaMemcpyAsync(data, _gpuPointer, size*sizeof(T), cudaMemcpyDeviceToHost)); 165 | } 166 | 167 | // cpu -> gpu 168 | void setValue(int pos, const T &value) { 169 | if (pos >= _size) { resize(pos + 1); } 170 | CTL_ASSERT(cudaMemcpy(_gpuPointer + pos, &value, sizeof(T), cudaMemcpyHostToDevice)); 171 | } 172 | 173 | void setValueAsync(int pos, const T &value) { 174 | if (pos >= _size) { resize(pos + 1); } 175 | CTL_ASSERT(cudaMemcpyAsync(_gpuPointer + pos, &value, sizeof(T), cudaMemcpyHostToDevice)); 176 | } 177 | 178 | // gpu -> cpu 179 | T getValue(size_t pos) { 180 | T value; 181 | if (pos >= _size) { throw std::runtime_error("index out of range"); } 182 | CTL_ASSERT(cudaMemcpy(&value, _gpuPointer + pos, sizeof(T), cudaMemcpyDeviceToHost)); 183 | return value; 184 | } 185 | 186 | T* gpointer() const { return _gpuPointer; } 187 | T* cpointer() const { return nullptr; } 188 | 189 | T* gptr() const { return _gpuPointer; } 190 | T* cptr() const { return nullptr; } 191 | 192 | public: 193 | operator T* () { return _gpuPointer; } 194 | 195 | private: 196 | size_t _size; 197 | size_t _reservedSize; 198 | T *_gpuPointer; 199 | }; 200 | 201 | // !!! 
If device support cudaMallocManaged, just use cuda_array with cuda_array_managed_allocator 202 | //template < class T, class allocator = cuda_array_allocator> 203 | //class cuda_array_managed : public cuda_array { 204 | //}; 205 | 206 | template 207 | using cuda_device_array = cuda_array < T, cuda_array_allocator >; 208 | 209 | template 210 | using cuda_managed_array = cuda_array < T, cuda_array_managed_allocator >; 211 | 212 | template 213 | using cuda_host_array = cuda_array < T, cuda_array_host_allocator >; 214 | 215 | #ifdef _DEBUG 216 | template 217 | using CUDAArray = cuda_managed_array < T >; 218 | 219 | template 220 | using CUDAHostArray = cuda_host_array < T >; 221 | 222 | template 223 | using CUDAManagedArray = cuda_managed_array < T >; 224 | #else 225 | template 226 | using CUDAArray = cuda_array < T >; 227 | 228 | template 229 | using CUDAHostArray = cuda_host_array < T >; 230 | 231 | template 232 | using CUDAManagedArray = cuda_managed_array < T >; 233 | #endif 234 | 235 | } 236 | 237 | #endif 238 | --------------------------------------------------------------------------------