├── LICENSE.txt ├── README.md ├── doc └── overview-small.svg ├── src ├── backend_glsl.h ├── backend_ptx.h ├── backend_sass.h ├── frep.h ├── frep_builder.h ├── frep_eval.h ├── frep_serialize.h └── sass_6_x │ ├── backend.h │ ├── blocks.h │ ├── bytecode.h │ ├── cubin.h │ ├── instruction.h │ ├── registers.h │ ├── scheduler.h │ └── simulator.h └── test ├── backend_glsl.cpp ├── backend_ptx.cpp ├── backend_sass_6_x.cpp ├── backend_sass_6_x_mock.cpp ├── linker.cpp ├── test1.cubin ├── test1.ptx ├── test2.cubin ├── test2.ptx └── util ├── cuda_error.h ├── init_cuda.h ├── profiler.h └── test_models.h /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2019 Simen Haugo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fast-csg 2 | 3 | ![](doc/overview-small.svg) 4 | 5 | A compiler for functional representations (see e.g. OpenSCAD, libfive, Hyperfun) that directly generates executable bytecode instructions for GPU architectures. 6 | 7 | It gives you the benefit of fast tree evaluation as the tree structure is compiled into optimized machine code instructions (which makes the program compute-limited, not memory bandwidth-limited), while avoiding the long compile times that you would get by compiling to an intermediate target, such as GLSL, PTX, CUDA or NVVM IR. 8 | 9 | Unlike NVIDIA's closed-source compiler chain, we focus on compilation speed, aiming for sub-millisecond time from compilation start to having the kernel uploaded to the GPU and ready to run. 10 | 11 | Possible applications: 12 | 13 | * Fast and parallelized hypothesis generation and testing, for e.g. program synthesis or 3D reconstruction. 14 | * GPU-accelerated visualization where you can live edit the CSG tree structure with instant feedback 15 | 16 | ## Project status 17 | 18 | This project is currently in limbo. I'm open-sourcing it in the event that anyone finds some parts of it useful. 
To that end, here's a list of stuff that's in here: 19 | 20 | * CSG tree grammar and interpreter (see [src/frep.h](src/frep.h)) 21 | * Complete CSG->GLSL compiler (see [src/backend_glsl.h](src/backend_glsl.h)) 22 | * Complete CSG->PTX compiler (see [src/backend_ptx.h](src/backend_ptx.h)) 23 | * Partial CSG->SASS 6.x compiler (missing the Cubin linking stage) 24 | 25 | Naturally, implementing a custom SASS compiler is difficult, as NVIDIA does not publicly document the ISA and their PTX compiler is closed-source. With the help of Scott Gray's MaxAs (a reverse-engineering of the Maxwell SASS ISA), I was able to implement a rudimentary compiler for compute capability 6.x devices (the Maxwell and Pascal families). Although the succeeding Volta and Turing families have not made huge changes to the ISA, it is a tedious task to implement backends for all of them. 26 | 27 | Nonetheless, you can find: 28 | * Scheduler and register allocation ([src/sass_6_x/backend.h](src/sass_6_x/backend.h)) 29 | * Bytecode generation ([src/sass_6_x/bytecode.h](src/sass_6_x/bytecode.h)) 30 |
-------------------------------------------------------------------------------- /doc/overview-small.svg: --------------------------------------------------------------------------------
[Overview figure: a Function Representation (a CSG tree whose subtrees f1, f2 and f3 combine into a single function f) is compiled directly to NVIDIA SASS. The embedded listing shows the result as a cubin word array (FADD/FMNMX instructions with immediate constants) loaded at runtime with cuModuleLoadData(&module, cubin) and cuModuleGetFunction(&fun, module, "f").]
-------------------------------------------------------------------------------- /src/backend_glsl.h: --------------------------------------------------------------------------------
1 | // Developed by Simen Haugo. 2 | // See LICENSE.txt for copyright and licensing details (standard MIT License). 3 | 4 | // This is the code generation backend for GLSL (GL Shading Language). 5 | // The output is stripped GLSL source code, meaning you must insert 6 | // it into a GLSL shader as appropriate for your application.
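//
// Example usage (a rough sketch; it assumes the builder helpers from
// frep_builder.h, and the host-side shader assembly shown here is
// illustrative only, not part of this repository):
//
//     frep_t *tree = fOpUnion(fSphere(1.0f),
//                             pOpTranslate(fBoxCheap(0.5f, 0.5f, 0.5f), 1.0f, 0.0f, 0.0f));
//     char *body = frep_compile_to_glsl(tree);
//
//     // The generated code reads the evaluation point from 'vec3 p0' and
//     // leaves the root node's distance in 'float d1', so it can be wrapped
//     // into a scene function for a raymarching shader. The fSphere/fBoxCheap
//     // GLSL helpers listed below must also be compiled into that shader.
//     static char shader[64*1024];
//     snprintf(shader, sizeof(shader),
//              "float scene(vec3 p0) {\n%s return d1;\n}\n", body);
//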
7 | 8 | #pragma once 9 | #include "frep.h" 10 | #include 11 | 12 | // Generates a null-terminated string of GLSL code that computes 13 | // 14 | // Variables are expected to be defined: 15 | // vec3 p0; 16 | // 17 | // Output is stored in: 18 | // float d1 = f(p0.x, p0.y, p0.z); 19 | // 20 | // The following functions must be declared and linked into the GLSL: 21 | // float fBox(vec3 p, vec3 dim); 22 | // float fBoxCheap(vec3 p, vec3 dim); 23 | // float fCylinder(vec3 p, float r, float h); 24 | // float fSphere(vec3 p, float r, float h); 25 | // 26 | char *frep_compile_to_glsl(frep_t *f); 27 | 28 | ////////////////////////////////////////////////////////////////// 29 | // Implementation 30 | ////////////////////////////////////////////////////////////////// 31 | 32 | namespace backend_glsl { 33 | 34 | struct glsl_t 35 | { 36 | int destination; 37 | char *stream; 38 | }; 39 | 40 | int _frep_compile_to_glsl(frep_t *node, 41 | glsl_t &s, 42 | frep_mat3_t R_root_to_parent=frep_identity_3x3, 43 | frep_vec3_t T_parent_rel_root=frep_null_3x1) 44 | { 45 | assert(node); 46 | 47 | frep_mat3_t R_root_to_this; 48 | frep_vec3_t T_this_rel_root; 49 | frep_get_global_transform(node, &R_root_to_this, &T_this_rel_root, R_root_to_parent, T_parent_rel_root); 50 | 51 | int my_index = s.destination++; 52 | 53 | // p^this = R_root_to_this*(p^0 - T_this_rel_root) 54 | // = R_root_to_this*p^0 + (-R_root_to_this*T_this_rel_root) 55 | { 56 | #define R(row,col) R_root_to_this.at(row,col) 57 | #define T(i) T_this_rel_root[i] 58 | float dtx = -(R(0,0)*T(0) + R(0,1)*T(1) + R(0,2)*T(2)); 59 | float dty = -(R(1,0)*T(0) + R(1,1)*T(1) + R(1,2)*T(2)); 60 | float dtz = -(R(2,0)*T(0) + R(2,1)*T(1) + R(2,2)*T(2)); 61 | s.stream += sprintf(s.stream, 62 | "vec3 p%d = " 63 | "vec3(%f,%f,%f)*p0.x + " 64 | "vec3(%f,%f,%f)*p0.y + " 65 | "vec3(%f,%f,%f)*p0.z + " 66 | "vec3(%f,%f,%f);\n", 67 | my_index, 68 | R(0,0), R(1,0), R(2,0), 69 | R(0,1), R(1,1), R(2,1), 70 | R(0,2), R(1,2), R(2,2), 71 | dtx, dty, dtz 72 | ); 73 | #undef R 74 | #undef T 75 | } 76 | 77 | if (frep_is_boolean(node)) 78 | { 79 | assert(node->left); 80 | assert(node->right); 81 | 82 | int i_left = _frep_compile_to_glsl(node->left, s, R_root_to_this, T_this_rel_root); 83 | int i_right = _frep_compile_to_glsl(node->right, s, R_root_to_this, T_this_rel_root); 84 | 85 | s.stream += sprintf(s.stream, "float d%d = ", my_index); 86 | 87 | switch (node->opcode) 88 | { 89 | case FREP_UNION: s.stream += sprintf(s.stream, "min(d%d,d%d);\n", i_left, i_right); break; 90 | case FREP_INTERSECT: s.stream += sprintf(s.stream, "max(d%d,d%d);\n", i_left, i_right); break; 91 | case FREP_SUBTRACT: s.stream += sprintf(s.stream, "max(d%d,-d%d);\n", i_left, i_right); break; 92 | case FREP_BLEND: s.stream += sprintf(s.stream, "%f*d%d + %f*d%d;\n", node->blend.alpha, i_left, 1.0f-node->blend.alpha, i_right); break; 93 | default: assert(false && "Unexpected opcode"); 94 | } 95 | } 96 | else if (frep_is_primitive(node)) 97 | { 98 | s.stream += sprintf(s.stream, "float d%d = ", my_index); 99 | 100 | switch (node->opcode) 101 | { 102 | case FREP_BOX: s.stream += sprintf(s.stream, "fBox(p%d, vec3(%f, %f, %f));\n", my_index, node->box.width, node->box.height, node->box.depth); break; 103 | case FREP_BOX_CHEAP: s.stream += sprintf(s.stream, "fBoxCheap(p%d, vec3(%f, %f, %f));\n", my_index, node->box.width, node->box.height, node->box.depth); break; 104 | case FREP_SPHERE: s.stream += sprintf(s.stream, "fSphere(p%d, %f);\n", my_index, node->sphere.radius); break; 105 | case FREP_CYLINDER: s.stream += 
sprintf(s.stream, "fCylinder(p%d, %f, %f);\n", my_index, node->cylinder.radius, node->cylinder.height); break; 106 | case FREP_PLANE: s.stream += sprintf(s.stream, "p%d.x - %f;\n", my_index, node->plane.offset); break; 107 | default: assert(false && "Unexpected opcode"); 108 | } 109 | } 110 | else 111 | { 112 | assert(false && "Unexpected node type"); 113 | } 114 | return my_index; 115 | } 116 | 117 | } 118 | 119 | char *frep_compile_to_glsl(frep_t *node) 120 | { 121 | using namespace backend_glsl; 122 | static char *buffer = (char*)malloc(10*1024*1024); 123 | assert(buffer && "Failed to allocate buffer to contain GLSL output"); 124 | glsl_t s; 125 | s.stream = buffer; 126 | s.destination = 1; 127 | _frep_compile_to_glsl(node, s); 128 | return buffer; 129 | } 130 | -------------------------------------------------------------------------------- /src/backend_ptx.h: -------------------------------------------------------------------------------- 1 | // Developed by Simen Haugo. 2 | // See LICENSE.txt for copyright and licensing details (standard MIT License). 3 | 4 | /* 5 | This is the code generation backend for NVIDIA PTX, which is not 6 | a machine code target, but a fake assembly language (stored as text) 7 | which gets compiled into native target-architecture instructions by 8 | the CUDA driver. Note that this compilation can take a long time. 9 | If you need to be able to rapidly compile and upload trees to the 10 | GPU, look at the SASS backend, where we implement our own native 11 | machine code generation. 12 | */ 13 | 14 | #pragma once 15 | 16 | #include "frep.h" 17 | #include 18 | #include 19 | 20 | /* 21 | Generates a string containing newline-seperated PTX instructions 22 | which evaluate f(x0, y0, z0) and stores the result in a register 23 | named "f%d" % result_register (e.g. "f3"). The input coordinates 24 | are assumed to be in registers named "x0", "y0", and "z0". 25 | 26 | See test/backend_ptx.cpp for an example of a complete PTX program 27 | that uses the generated output. 28 | */ 29 | char *frep_compile_to_ptx(frep_t *f, int *result_register); 30 | 31 | ////////////////////////////////////////////////////////////////// 32 | // Implementation 33 | ////////////////////////////////////////////////////////////////// 34 | 35 | namespace backend_ptx { 36 | 37 | /* 38 | Nodes in the FRep AST have constants (such as sphere radius) that 39 | are involved in the expression for that node's membership function. 40 | When generating code to execute the membership function, constants 41 | can either be placed in Constants Memory (and must be fetched with 42 | an additional load), or be baked directly into the instructions. 43 | 44 | For example, the PTX instruction 45 | add.ftz.f32 x, x, 0f3F000000; // x <- x + 0.5 46 | uses +0.5 as an immediate value. In the generated machine code for 47 | e.g. Maxwell architectures, this instruction may look like this: 48 | 0x3858503f00070409 49 | ^^^^^ 50 | immediate value (note that last 12 bits are truncated). 51 | 52 | However, not all instructions can use full 32-bit floating point 53 | immediate values. Notably, min, max and fused-multiply-add (FFMA) 54 | on Maxwell/Pascal target architectures. But all do support 20-bit 55 | floating point immediates, where the last 12 bits of the mantissa 56 | are truncated (assumed to be zero). 57 | 58 | You can choose whether you want to preserve 32-bit floating point 59 | constants at the expense of speed, or if you want to truncate the 60 | last 12 bits and use 20-bit floating point constants. 
61 | */ 62 | uint32_t encode_f32(float x) 63 | { 64 | #if defined(PTX_FP32_IMMEDIATE) 65 | return (*(uint32_t*)&x); 66 | #elif defined(PTX_FP20_IMMEDIATE) 67 | // Note: PTX immediate values preserve their sign bit, unlike 68 | // SASS immediate values, which encode the sign bit elsewhere 69 | // in the instruction. 70 | return (*(uint32_t*)&x) & 0xFFFFF000; 71 | #else 72 | #error "You must #define either PTX_FP32_IMMEDIATE or PTX_FP20_IMMEDIATE before including this file." 73 | #endif 74 | } 75 | 76 | struct ptx_t 77 | { 78 | int next_register; 79 | char *stream; 80 | }; 81 | 82 | void emit_transform(ptx_t &s, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/) 83 | { 84 | // emit transform code: p_this = R_root_to_this*(p_root - T_this_rel_root) 85 | int x = s.next_register++; 86 | int y = s.next_register++; 87 | int z = s.next_register++; 88 | 89 | // compute R_root_to_this*(-T_this_rel_root) 90 | float tx = -(R.at(0,0)*T[0] + R.at(0,1)*T[1] + R.at(0,2)*T[2]); 91 | float ty = -(R.at(1,0)*T[0] + R.at(1,1)*T[1] + R.at(1,2)*T[2]); 92 | float tz = -(R.at(2,0)*T[0] + R.at(2,1)*T[1] + R.at(2,2)*T[2]); 93 | 94 | // emit instructions for R_root_to_this*p_root + R_root_to_this*(-T_this_rel_root) 95 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, x0, 0f%08x, 0f%08x;\n", x, encode_f32(R.at(0,0)), encode_f32(tx)); 96 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, x0, 0f%08x, 0f%08x;\n", y, encode_f32(R.at(1,0)), encode_f32(ty)); 97 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, x0, 0f%08x, 0f%08x;\n", z, encode_f32(R.at(2,0)), encode_f32(tz)); 98 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, y0, 0f%08x, f%d;\n", x, encode_f32(R.at(0,1)), x); 99 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, y0, 0f%08x, f%d;\n", y, encode_f32(R.at(1,1)), y); 100 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, y0, 0f%08x, f%d;\n", z, encode_f32(R.at(2,1)), z); 101 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, z0, 0f%08x, f%d;\n", x, encode_f32(R.at(0,2)), x); 102 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, z0, 0f%08x, f%d;\n", y, encode_f32(R.at(1,2)), y); 103 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, z0, 0f%08x, f%d;\n", z, encode_f32(R.at(2,2)), z); 104 | } 105 | 106 | int emit_box(ptx_t &s, frep_t *node, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/) 107 | { 108 | assert(false && "Box is not implemented in PTX backend yet"); 109 | return 0; 110 | } 111 | 112 | int emit_box_cheap(ptx_t &s, frep_t *node, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/) 113 | { 114 | // mathematical expression: Box(p, width,height,depth) 115 | // (x,y,z) = R*(p - T) 116 | // d = max( |x|-width, |y|-height, |z|-depth ) 117 | 118 | // ptx template: 119 | // 120 | // abs.ftz.f32 x, x; 121 | // abs.ftz.f32 y, y; 122 | // abs.ftz.f32 z, z; 123 | // sub.ftz.f32 x, x, (width); 124 | // sub.ftz.f32 y, y, (height); 125 | // sub.ftz.f32 z, z, (depth); 126 | // max.ftz.f32 d, x, y; 127 | // max.ftz.f32 d, d, z; 128 | 129 | // emitted instructions: 130 | emit_transform(s, R, T); // todo: inline here and optimize for each primitive 131 | int x = s.next_register - 3; 132 | int y = s.next_register - 2; 133 | int z = s.next_register - 1; 134 | int d = s.next_register++; 135 | s.stream += sprintf(s.stream, "abs.ftz.f32 f%d,f%d;\n", x, x); 136 | s.stream += sprintf(s.stream, "abs.ftz.f32 f%d,f%d;\n", y, y); 137 | s.stream += sprintf(s.stream, "abs.ftz.f32 f%d,f%d;\n", z, z); 138 | s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", x, x, 
encode_f32(-node->box.width)); 139 | s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", y, y, encode_f32(-node->box.height)); 140 | s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", z, z, encode_f32(-node->box.depth)); 141 | s.stream += sprintf(s.stream, "max.ftz.f32 f%d,f%d,f%d;\n", d, x, y); 142 | s.stream += sprintf(s.stream, "max.ftz.f32 f%d,f%d,f%d;\n", d, d, z); 143 | return d; 144 | } 145 | 146 | int emit_sphere(ptx_t &s, frep_t *node, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/) 147 | { 148 | // mathematical expression: 149 | // d = length(p_this) - r 150 | // = length(R*(p_root - T)) - r 151 | // = length(p_root - T) - r 152 | 153 | // ptx template: 154 | // add.ftz.f32 x, x0, (-tx); 155 | // add.ftz.f32 y, y0, (-ty); 156 | // add.ftz.f32 z, z0, (-tz); 157 | // mul.ftz.f32 d, x, x; 158 | // fma.rn.ftz.f32 d, y, y, d; 159 | // fma.rn.ftz.f32 d, z, z, d; 160 | // sqrt.approx.ftz.f32 d, d; 161 | // sub.f32 d, d, (r); 162 | 163 | // emitted instructions: 164 | int x = s.next_register++; 165 | int y = s.next_register++; 166 | int z = s.next_register++; 167 | int d = s.next_register++; 168 | s.stream += sprintf(s.stream, "add.ftz.f32 f%d,x0,0f%08x;\n", x, encode_f32(-T[0])); // x <- x0 - (Tx) 169 | s.stream += sprintf(s.stream, "add.ftz.f32 f%d,y0,0f%08x;\n", y, encode_f32(-T[1])); // y <- y0 - (Ty) 170 | s.stream += sprintf(s.stream, "add.ftz.f32 f%d,z0,0f%08x;\n", z, encode_f32(-T[2])); // z <- z0 - (Tz) 171 | s.stream += sprintf(s.stream, "mul.ftz.f32 f%d,f%d,f%d;\n", d, x, x); // d <- x*x 172 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,f%d,f%d,f%d;\n", d, y, y, d); // d <- y*y + d 173 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,f%d,f%d,f%d;\n", d, z, z, d); // d <- z*z + d 174 | s.stream += sprintf(s.stream, "sqrt.approx.ftz.f32 f%d,f%d;\n", d, d); // d <- sqrt(d) 175 | s.stream += sprintf(s.stream, "add.f32 f%d,f%d,0f%08x;\n", d, d, encode_f32(-node->sphere.radius)); // d <- d - (r) 176 | return d; 177 | } 178 | 179 | int emit_cylinder(ptx_t &s, frep_t *node, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/) 180 | { 181 | // mathematical expression: cylinder(p, 2*height, radius) 182 | // (x,y,z) = R*(p - T) 183 | // d = max( sqrt(x*x + z*z) - radius, abs(y) - height ) 184 | 185 | // ptx template 186 | // 187 | // mul.ftz.f32 d, x, x; 188 | // fma.rn.ftz.f32 d, z, z, d; 189 | // sqrt.approx.ftz.f32 d, d; 190 | // abs.ftz.f32 y, y; 191 | // add.ftz.f32 y, y, (-height); 192 | // add.ftz.f32 d, d, (-radius); 193 | // max.ftz.f32 d, d, y; 194 | 195 | // emitted instructions: 196 | emit_transform(s, R, T); // todo: inline here and optimize for each primitive 197 | int x = s.next_register - 3; 198 | int y = s.next_register - 2; 199 | int z = s.next_register - 1; 200 | int d = s.next_register++; 201 | s.stream += sprintf(s.stream, "mul.ftz.f32 f%d,f%d,f%d;\n", d, x, x); 202 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,f%d,f%d,f%d;\n", d, z, z, d); 203 | s.stream += sprintf(s.stream, "sqrt.approx.ftz.f32 f%d,f%d;\n", d, d); 204 | s.stream += sprintf(s.stream, "abs.ftz.f32 f%d,f%d;\n", y, y); 205 | s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", y, y, encode_f32(-node->cylinder.height)); 206 | s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", d, d, encode_f32(-node->cylinder.radius)); 207 | s.stream += sprintf(s.stream, "max.ftz.f32 f%d,f%d,f%d;\n", d, d, y); 208 | return d; 209 | } 210 | 211 | int emit_plane(ptx_t &s, frep_t *node, frep_mat3_t R/*root_to_this*/, frep_vec3_t 
T/*this_rel_root*/) 212 | { 213 | // mathematical expression: 214 | // (x,y,z) = R*(p - T) 215 | // d = x - plane.x 216 | // = R00*(x0 - Tx) + R01*(y0 - Ty) + R02*(z0 - Tz) - plane.x 217 | // = R00*x0 + R01*y0 + R02*z0 + (-plane.x - R00*Tx - R01*Ty - R02*Tz) 218 | // = R00*x0 + R01*y0 + R02*z0 + k 219 | 220 | // ptx template: 221 | // mul.ftz.f32 d, x0, (R00); 222 | // fma.rn.ftz.f32 d, y0, (R01), d; 223 | // fma.rn.ftz.f32 d, z0, (R02), d; 224 | // add.ftz.f32 d, d, (k) 225 | 226 | // emitted instructions: 227 | float k = -(R.at(0,0)*T[0] + R.at(0,1)*T[1] + R.at(0,2)*T[2] + node->plane.offset); 228 | int d = s.next_register++; 229 | s.stream += sprintf(s.stream, "mul.ftz.f32 f%d,x0,0f%08x;\n", d, encode_f32(R.at(0,0))); 230 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,y0,0f%08x,f%d;\n", d, encode_f32(R.at(0,1)), d); 231 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,z0,0f%08x,f%d;\n", d, encode_f32(R.at(0,2)), d); 232 | s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", d, d, encode_f32(k)); 233 | return d; 234 | } 235 | 236 | int emit_union(ptx_t &s, int left, int right) 237 | { 238 | int d = s.next_register++; 239 | s.stream += sprintf(s.stream, "min.ftz.f32 f%d,f%d,f%d;\n", d, left, right); 240 | return d; 241 | } 242 | 243 | int emit_intersect(ptx_t &s, int left, int right) 244 | { 245 | int d = s.next_register++; 246 | s.stream += sprintf(s.stream, "max.ftz.f32 f%d,f%d,f%d;\n", d, left, right); 247 | return d; 248 | } 249 | 250 | int emit_subtract(ptx_t &s, int left, int right) 251 | { 252 | int d = s.next_register++; 253 | s.stream += sprintf(s.stream, "neg.ftz.f32 f%d,f%d;\n", right, right); 254 | s.stream += sprintf(s.stream, "max.ftz.f32 f%d,f%d,f%d;\n", d, left, right); 255 | return d; 256 | } 257 | 258 | int emit_blend(ptx_t &s, int left, int right, float blend_alpha) 259 | { 260 | int d = s.next_register++; 261 | s.stream += sprintf(s.stream, "mul.ftz.f32 f%d,f%d,0f%08x;\n", d, left, encode_f32(blend_alpha)); 262 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,f%d,0f%08x,f%d;\n", d, right, encode_f32(1.0f-blend_alpha), d); 263 | return d; 264 | } 265 | 266 | int _frep_compile_to_ptx( 267 | frep_t *node, 268 | ptx_t &state, 269 | frep_mat3_t R_root_to_parent=frep_identity_3x3, 270 | frep_vec3_t T_parent_rel_root=frep_null_3x1) 271 | { 272 | assert(node); 273 | 274 | frep_mat3_t R_root_to_this; 275 | frep_vec3_t T_this_rel_root; 276 | frep_get_global_transform(node, &R_root_to_this, &T_this_rel_root, R_root_to_parent, T_parent_rel_root); 277 | 278 | int result = -1; 279 | if (frep_is_boolean(node)) 280 | { 281 | assert(node->left); 282 | assert(node->right); 283 | int left = _frep_compile_to_ptx(node->left, state, R_root_to_this, T_this_rel_root); 284 | int right = _frep_compile_to_ptx(node->right, state, R_root_to_this, T_this_rel_root); 285 | switch (node->opcode) 286 | { 287 | case FREP_UNION: return emit_union(state, left, right); 288 | case FREP_INTERSECT: return emit_intersect(state, left, right); 289 | case FREP_SUBTRACT: return emit_subtract(state, left, right); 290 | case FREP_BLEND: return emit_blend(state, left, right, node->blend.alpha); 291 | } 292 | } 293 | else if (frep_is_primitive(node)) 294 | { 295 | switch (node->opcode) 296 | { 297 | case FREP_BOX: return emit_box(state, node, R_root_to_this, T_this_rel_root); 298 | case FREP_BOX_CHEAP: return emit_box_cheap(state, node, R_root_to_this, T_this_rel_root); 299 | case FREP_SPHERE: return emit_sphere(state, node, R_root_to_this, T_this_rel_root); 300 | case FREP_CYLINDER: return 
emit_cylinder(state, node, R_root_to_this, T_this_rel_root); 301 | case FREP_PLANE: return emit_plane(state, node, R_root_to_this, T_this_rel_root); 302 | } 303 | } 304 | 305 | assert(false && "Unexpected node opcode"); 306 | return -1; 307 | } 308 | 309 | } 310 | 311 | char *frep_compile_to_ptx(frep_t *node, int *result_register) 312 | { 313 | using namespace backend_ptx; 314 | static char *buffer = (char*)malloc(10*1024*1024); 315 | assert(buffer && "Failed to allocate buffer to contain PTX output"); 316 | ptx_t s; 317 | s.stream = buffer; 318 | s.next_register = 0; 319 | *result_register = _frep_compile_to_ptx(node, s); 320 | return buffer; 321 | } 322 | -------------------------------------------------------------------------------- /src/backend_sass.h: -------------------------------------------------------------------------------- 1 | #if defined(COMPUTE_CAPABILITY_3_X) 2 | // Kepler 3 | #error "Target devices of compute capability 3.x are not supported by the SASS backend." 4 | 5 | #elif defined(COMPUTE_CAPABILITY_5_X) || defined(COMPUTE_CAPABILITY_6_X) 6 | // Maxwell, Pascal (e.g. GTX 1080, Titan X) 7 | #include "sass_6_x/backend.h" 8 | 9 | #elif defined(COMPUTE_CAPABILITY_7_X) 10 | // Volta, Turing (e.g. RTX Titan, 2080) 11 | #error "Target devices of compute capability 7.x are not supported by the SASS backend." 12 | 13 | #else 14 | #error "Missing #define. Specify the compute capability target for the SASS backend." 15 | #endif 16 | -------------------------------------------------------------------------------- /src/frep.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | typedef int frep_opcode_t; 9 | enum frep_opcode_ { 10 | FREP_INVALID = 0, 11 | 12 | FREP_BOX, 13 | FREP_BOX_CHEAP, 14 | FREP_SPHERE, 15 | FREP_CYLINDER, 16 | FREP_PLANE, 17 | FREP_UNION, 18 | FREP_INTERSECT, 19 | FREP_SUBTRACT, 20 | FREP_BLEND, 21 | }; 22 | 23 | struct frep_box_t { float width, height, depth; }; 24 | struct frep_sphere_t { float radius; }; 25 | struct frep_cylinder_t { float radius, height; }; 26 | struct frep_plane_t { float sign, offset; }; 27 | struct frep_blend_t { float alpha; }; 28 | 29 | /* 30 | Each frep node has a rigid-body transform associated with it. 31 | It can be the identity. If so, it gets optimized out in the 32 | backend. 
The transformation parameters relate the point argument 33 | of the child node to its parent node by: 34 | 35 | p^parent = Rx(rx)*Ry(ry)*Rz(rz)*p^child + (tx,ty,tz) 36 | 37 | */ 38 | struct frep_t { 39 | frep_opcode_t opcode; 40 | frep_t *left; 41 | frep_t *right; 42 | float rx,ry,rz,tx,ty,tz; 43 | union { 44 | frep_box_t box; 45 | frep_sphere_t sphere; 46 | frep_cylinder_t cylinder; 47 | frep_plane_t plane; 48 | frep_blend_t blend; 49 | }; 50 | }; 51 | 52 | /* 53 | Node creation and deletion utilities 54 | */ 55 | frep_t *frep_malloc() { 56 | frep_t *f = (frep_t*)malloc(sizeof(frep_t)); 57 | return f; 58 | } 59 | frep_t *frep_calloc() { 60 | frep_t *f = (frep_t*)calloc(1, sizeof(frep_t)); 61 | return f; 62 | } 63 | void frep_free(frep_t *f) { 64 | if (!f) return; 65 | frep_free(f->left); 66 | frep_free(f->right); 67 | free(f); 68 | } 69 | frep_t *frep_copy(frep_t *f) { 70 | if (!f) return NULL; 71 | frep_t *f1 = frep_malloc(); 72 | *f1 = *f; 73 | f1->left = frep_copy(f->left); 74 | f1->right = frep_copy(f->right); 75 | return f1; 76 | } 77 | 78 | /* 79 | Other utilities 80 | */ 81 | bool frep_is_primitive(frep_t *f) { 82 | return f->opcode == FREP_BOX || 83 | f->opcode == FREP_BOX_CHEAP || 84 | f->opcode == FREP_SPHERE || 85 | f->opcode == FREP_CYLINDER || 86 | f->opcode == FREP_PLANE; 87 | } 88 | bool frep_is_boolean(frep_t *f) { 89 | return f->opcode == FREP_UNION || 90 | f->opcode == FREP_INTERSECT || 91 | f->opcode == FREP_SUBTRACT; 92 | } 93 | int frep_get_num_nodes(frep_t *f) { 94 | if (!f) return 0; 95 | return 1 + frep_get_num_nodes(f->left) + frep_get_num_nodes(f->right); 96 | } 97 | 98 | int frep_get_depth(frep_t *f) { 99 | if (!f) return 0; 100 | int l = frep_get_depth(f->left); 101 | int r = frep_get_depth(f->right); 102 | int max_lr = (l > r ? l : r); 103 | return 1 + max_lr; 104 | } 105 | frep_t *frep_find_node(frep_t *a, int find_i, frep_t **out_parent, int *out_depth, frep_t *parent=NULL, int depth=0) 106 | { 107 | assert(a); 108 | assert(find_i >= 0); 109 | 110 | static int i = 0; 111 | if (!parent) i = 0; 112 | else i++; 113 | 114 | if (i == find_i) 115 | { 116 | *out_depth = depth; 117 | *out_parent = parent; 118 | return a; 119 | } 120 | else if (frep_is_boolean(a)) 121 | { 122 | frep_t *left = frep_find_node(a->left, find_i, out_parent, out_depth, a, depth+1); 123 | if (left) return left; 124 | frep_t *right = frep_find_node(a->right, find_i, out_parent, out_depth, a, depth+1); 125 | if (right) return right; 126 | } 127 | return NULL; 128 | } 129 | 130 | /* 131 | Utility routines for computing rigid-body transform from root node to a specific child. 
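    Given each node's local transform (p^parent = Rx(rx)*Ry(ry)*Rz(rz)*p^child + (tx,ty,tz)),
    frep_get_global_transform below accumulates, while walking down the tree,

        R_root_to_this  = R_parent_to_this * R_root_to_parent
        T_this_rel_root = T_parent_rel_root + transpose(R_root_to_parent) * T_this_rel_parent

    so that each backend can map the evaluation point straight from root
    coordinates to node coordinates with p^this = R_root_to_this*(p^root - T_this_rel_root).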
132 | */ 133 | struct frep_mat3_t { float d[3*3]; float &at(int row, int col) { return d[col + row*3]; } }; 134 | struct frep_vec3_t { float d[3]; float &operator[](int i) { return d[i]; } }; 135 | static frep_mat3_t frep_identity_3x3 = { 1,0,0, 0,1,0, 0,0,1 }; 136 | static frep_vec3_t frep_null_3x1 = { 0,0,0 }; 137 | 138 | // d = a*b 139 | frep_mat3_t frep_mat_mul(frep_mat3_t a, frep_mat3_t b) { 140 | frep_mat3_t d = {0}; 141 | for (int row = 0; row < 3; row++) 142 | for (int col = 0; col < 3; col++) 143 | { 144 | d.at(row,col) = 0.0f; 145 | for (int i = 0; i < 3; i++) 146 | d.at(row,col) += a.at(row,i)*b.at(i,col); 147 | } 148 | return d; 149 | } 150 | 151 | // d = transpose(a) * b 152 | frep_vec3_t frep_mat_mul_transpose(frep_mat3_t a, frep_vec3_t b) { 153 | frep_vec3_t d = {0}; 154 | for (int row = 0; row < 3; row++) 155 | { 156 | d[row] = 0.0f; 157 | for (int i = 0; i < 3; i++) 158 | d[row] += a.at(i,row)*b[i]; 159 | } 160 | return d; 161 | } 162 | 163 | frep_vec3_t frep_mat_add(frep_vec3_t a, frep_vec3_t b) { 164 | frep_vec3_t d = { a[0]+b[0], a[1]+b[1], a[2]+b[2] }; 165 | return d; 166 | } 167 | 168 | void frep_get_global_transform(frep_t *node, 169 | frep_mat3_t *R_root_to_this, 170 | frep_vec3_t *T_this_rel_root, 171 | frep_mat3_t R_root_to_parent, 172 | frep_vec3_t T_parent_rel_root) { 173 | float cx = cosf(-node->rx); float sx = sinf(-node->rx); 174 | float cy = cosf(-node->ry); float sy = sinf(-node->ry); 175 | float cz = cosf(-node->rz); float sz = sinf(-node->rz); 176 | 177 | // R_this_to_parent = Rx(rx)*Ry(ry)*Rz(rz) 178 | // -> R_parent_to_this = Rz(-rz)*Ry(-ry)*Rx(-rx) 179 | frep_mat3_t R_parent_to_this = 180 | { 181 | cy*cz, cz*sx*sy - cx*sz, sx*sz + cx*cz*sy, 182 | cy*sz, cx*cz + sx*sy*sz, cx*sy*sz - cz*sx, 183 | -sy, cy*sx, cx*cy 184 | }; 185 | frep_vec3_t T_this_rel_parent = { node->tx, node->ty, node->tz }; 186 | 187 | *R_root_to_this = frep_mat_mul(R_parent_to_this,R_root_to_parent); 188 | *T_this_rel_root = frep_mat_add(T_parent_rel_root, frep_mat_mul_transpose(R_root_to_parent, T_this_rel_parent)); 189 | } 190 | 191 | -------------------------------------------------------------------------------- /src/frep_builder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "frep.h" 3 | 4 | /* 5 | FRep primitives 6 | */ 7 | frep_t *fBox(float width, float height, float depth) { 8 | frep_t *f = frep_calloc(); 9 | f->opcode = FREP_BOX; 10 | f->box.width = width; 11 | f->box.height = height; 12 | f->box.depth = depth; 13 | return f; 14 | } 15 | frep_t *fBoxCheap(float width, float height, float depth) { 16 | frep_t *f = frep_calloc(); 17 | f->opcode = FREP_BOX_CHEAP; 18 | f->box.width = width; 19 | f->box.height = height; 20 | f->box.depth = depth; 21 | return f; 22 | } 23 | frep_t *fSphere(float radius) { 24 | frep_t *f = frep_calloc(); 25 | f->opcode = FREP_SPHERE; 26 | f->sphere.radius = radius; 27 | return f; 28 | } 29 | frep_t *fCylinder(float radius, float height) { 30 | frep_t *f = frep_calloc(); 31 | f->opcode = FREP_CYLINDER; 32 | f->cylinder.radius = radius; 33 | f->cylinder.height = height; 34 | return f; 35 | } 36 | frep_t *fPlane(float sign, float offset) { 37 | frep_t *f = frep_calloc(); 38 | f->opcode = FREP_PLANE; 39 | f->plane.sign = sign; 40 | f->plane.offset = offset; 41 | return f; 42 | } 43 | 44 | /* 45 | Function operators 46 | */ 47 | frep_t *fOpUnion(frep_t *left, frep_t *right) { 48 | frep_t *f = frep_calloc(); 49 | f->opcode = FREP_UNION; 50 | f->left = left; 51 | f->right = right; 52 | 
return f; 53 | } 54 | frep_t *fOpSubtract(frep_t *left, frep_t *right) { 55 | frep_t *f = frep_calloc(); 56 | f->opcode = FREP_SUBTRACT; 57 | f->left = left; 58 | f->right = right; 59 | return f; 60 | } 61 | frep_t *fOpIntersect(frep_t *left, frep_t *right) { 62 | frep_t *f = frep_calloc(); 63 | f->opcode = FREP_INTERSECT; 64 | f->left = left; 65 | f->right = right; 66 | return f; 67 | } 68 | 69 | /* 70 | Spatial operators 71 | */ 72 | frep_t *pOpRotate(frep_t *f, float rx, float ry, float rz) { 73 | f->rx = rx; 74 | f->ry = ry; 75 | f->rz = rz; 76 | return f; 77 | } 78 | frep_t *pOpTranslate(frep_t *f, float tx, float ty, float tz) { 79 | f->tx = tx; 80 | f->ty = ty; 81 | f->tz = tz; 82 | return f; 83 | } 84 | -------------------------------------------------------------------------------- /src/frep_eval.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "frep.h" 3 | #include 4 | #include 5 | 6 | float frep_eval(frep_t *f, float x, float y, float z) 7 | { 8 | assert(f); 9 | 10 | x -= f->tx; 11 | y -= f->ty; 12 | z -= f->tz; 13 | 14 | if (f->rx != 0.0f) 15 | { 16 | float cx = cosf(-f->rx); 17 | float sx = sinf(-f->rx); 18 | float zz = cx*z + sx*y; 19 | y = cx*y - sx*z; 20 | z = zz; 21 | } 22 | if (f->ry != 0.0f) 23 | { 24 | float cy = cosf(-f->ry); 25 | float sy = sinf(-f->ry); 26 | float xx = cy*x + sy*z; 27 | z = cy*z - sy*x; 28 | x = xx; 29 | } 30 | if (f->rz != 0.0f) 31 | { 32 | float cz = cosf(-f->rz); 33 | float sz = sinf(-f->rz); 34 | float xx = cz*x - sz*y; 35 | y = cz*y + sz*x; 36 | x = xx; 37 | } 38 | 39 | switch (f->opcode) 40 | { 41 | case FREP_BOX: 42 | { 43 | float dx = fabsf(x) - f->box.width; 44 | float dy = fabsf(y) - f->box.height; 45 | float dz = fabsf(z) - f->box.depth; 46 | float dbx = (dx < 0.0f) ? dx : 0.0f; float b = dbx; 47 | float dby = (dy < 0.0f) ? dy : 0.0f; if (dby > b) b = dby; 48 | float dbz = (dz < 0.0f) ? dz : 0.0f; if (dbz > b) b = dbz; 49 | if (dx < 0.0f) dx = 0.0f; 50 | if (dy < 0.0f) dy = 0.0f; 51 | if (dz < 0.0f) dz = 0.0f; 52 | return sqrtf(dx*dx + dy*dy + dz*dz) + b; 53 | } 54 | case FREP_BOX_CHEAP: 55 | { 56 | float dx = fabsf(x) - f->box.width; 57 | float dy = fabsf(y) - f->box.height; 58 | float dz = fabsf(z) - f->box.depth; 59 | float d = dx; 60 | if (dy > d) d = dy; 61 | if (dz > d) d = dz; 62 | return d; 63 | } 64 | case FREP_SPHERE: 65 | { 66 | return sqrtf(x*x + y*y + z*z) - f->sphere.radius; 67 | } 68 | case FREP_CYLINDER: 69 | { 70 | float a = sqrtf(x*x + z*z) - f->cylinder.radius; 71 | float b = fabsf(y) - f->cylinder.height; 72 | return a > b ? a : b; 73 | } 74 | case FREP_PLANE: 75 | { 76 | return f->plane.sign*x - f->plane.offset; 77 | } 78 | case FREP_UNION: 79 | { 80 | float f1 = frep_eval(f->left, x, y, z); 81 | float f2 = frep_eval(f->right, x, y, z); 82 | return f1 < f2 ? f1 : f2; 83 | } 84 | case FREP_INTERSECT: 85 | { 86 | float f1 = frep_eval(f->left, x, y, z); 87 | float f2 = frep_eval(f->right, x, y, z); 88 | return f1 > f2 ? f1 : f2; 89 | } 90 | case FREP_SUBTRACT: 91 | { 92 | float f1 = frep_eval(f->left, x, y, z); 93 | float f2 = -frep_eval(f->right, x, y, z); 94 | return f1 > f2 ? 
f1 : f2; 95 | } 96 | #if 0 97 | case FREP_BLEND: 98 | { 99 | float f1 = frep_eval(f->left, x, y, z); 100 | float f2 = frep_eval(f->right, x, y, z); 101 | return f->blend.alpha*f1 + (1.0f - f->blend.alpha)*f2; 102 | } 103 | #endif 104 | default: 105 | { 106 | assert(false && "invalid node type"); 107 | } 108 | } 109 | return 0.0f; 110 | } 111 | -------------------------------------------------------------------------------- /src/frep_serialize.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "ast.h" 3 | #include 4 | 5 | #ifdef _MSC_VER 6 | // Note: MSVC version returns -1 on overflow, but glibc returns total count (which may be >= buf_size) 7 | #define snprintf _snprintf 8 | #endif 9 | 10 | static char *ast__to_string(ast_t *a, char *stream, size_t sizeof_buffer) 11 | { 12 | if (!a) return stream; 13 | if (a->type == AST_BOX) stream += snprintf(stream, sizeof_buffer, "b[%g,%g,%g]", a->box.w, a->box.h, a->box.d); 14 | else if (a->type == AST_SPHERE) stream += snprintf(stream, sizeof_buffer, "s[%g]", a->sphere.r); 15 | else if (a->type == AST_CYLINDER) stream += snprintf(stream, sizeof_buffer, "c[%g,%g]", a->cylinder.r, a->cylinder.h); 16 | else if (a->type == AST_PLANE) stream += snprintf(stream, sizeof_buffer, "p[%g]", a->plane.x); 17 | else if (a->type == AST_UNION) stream += snprintf(stream, sizeof_buffer, "U"); 18 | else if (a->type == AST_INTERSECT) stream += snprintf(stream, sizeof_buffer, "I"); 19 | else if (a->type == AST_SUBTRACT) stream += snprintf(stream, sizeof_buffer, "S"); 20 | else if (a->type == AST_BLEND) stream += snprintf(stream, sizeof_buffer, "B[%g]", a->blend.alpha); 21 | stream += snprintf(stream, sizeof_buffer, "[%g,%g,%g]", a->rx, a->ry, a->rz); 22 | stream += snprintf(stream, sizeof_buffer, "[%g,%g,%g]", a->tx, a->ty, a->tz); 23 | stream = ast__to_string(a->left, stream, sizeof_buffer); 24 | stream = ast__to_string(a->right, stream, sizeof_buffer); 25 | return stream; 26 | } 27 | 28 | static ast_t *ast__from_string(char **inout_stream) 29 | { 30 | char *stream = *inout_stream; 31 | if (!stream) return NULL; 32 | if (*stream == '\0') return NULL; 33 | 34 | ast_t *a = ast_new(); 35 | 36 | #define next_bracket() { while (*stream && *stream != '[') stream++; assert(*stream); stream++; assert(*stream); } 37 | if (*stream == 'b') { a->type = AST_BOX; next_bracket(); assert(3 == sscanf(stream, "%f,%f,%f", &a->box.w, &a->box.h, &a->box.d)); next_bracket(); } 38 | else if (*stream == 's') { a->type = AST_SPHERE; next_bracket(); assert(1 == sscanf(stream, "%f", &a->sphere.r )); next_bracket(); } 39 | else if (*stream == 'c') { a->type = AST_CYLINDER; next_bracket(); assert(2 == sscanf(stream, "%f,%f", &a->cylinder.r, &a->cylinder.h )); next_bracket(); } 40 | else if (*stream == 'p') { a->type = AST_PLANE; next_bracket(); assert(1 == sscanf(stream, "%f", &a->plane.x )); next_bracket(); } 41 | else if (*stream == 'U') { a->type = AST_UNION; next_bracket(); } 42 | else if (*stream == 'I') { a->type = AST_INTERSECT; next_bracket(); } 43 | else if (*stream == 'S') { a->type = AST_SUBTRACT; next_bracket(); } 44 | else if (*stream == 'B') { a->type = AST_BLEND; next_bracket(); assert(1 == sscanf(stream, "%f", &a->blend.alpha )); next_bracket(); } 45 | else assert(false && "invalid node type"); 46 | assert(3 == sscanf(stream, "%f,%f,%f", &a->rx, &a->ry, &a->rz)); 47 | next_bracket(); 48 | assert(3 == sscanf(stream, "%f,%f,%f", &a->tx, &a->ty, &a->tz)); 49 | while (*stream && *stream != ']') stream++; 50 | 
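// For reference: at this point the stream sits on the closing ']' of the
// translation bracket. ast__to_string writes, e.g., a unit sphere with an
// identity transform as "s[1][0,0,0][0,0,0]" (tag and parameters, then
// rotation, then translation), and any children follow immediately after.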
assert(*stream); 51 | stream++; 52 | #undef next_bracket 53 | 54 | a->left = ast__from_string(&stream); 55 | a->right = ast__from_string(&stream); 56 | *inout_stream = stream; 57 | return a; 58 | } 59 | 60 | char *ast_to_string(ast_t *a) 61 | { 62 | static char buffer[1024*1024]; 63 | ast__to_string(a, buffer, sizeof(buffer)); 64 | return buffer; 65 | } 66 | 67 | ast_t *ast_from_string(char *stream) 68 | { 69 | return ast__from_string(&stream); 70 | } 71 | 72 | #ifdef _MSC_VER 73 | #undef snprintf 74 | #endif 75 | -------------------------------------------------------------------------------- /src/sass_6_x/backend.h: -------------------------------------------------------------------------------- 1 | // Developed by Simen Haugo. 2 | // See LICENSE.txt for copyright and licensing details (standard MIT License). 3 | // 4 | // This file contains the machine code generation backend for NVIDIA SASS (Shader 5 | // Assembly) ISA. Unlike the PTX backend, this directly outputs to binary code that 6 | // can be patched into a Cubin binary module and loaded immediately with the Cuda 7 | // Driver API (see NVRTC example in SDK). This avoids the slow PTX compiler provided 8 | // in CUDA. 9 | // 10 | // This backend is for devices of compute capability 6.x, such as the Maxwell and 11 | // Pascal GPU families. It does not support Volta or Turing families (which have 12 | // compute capability 7.x). 13 | // 14 | // SASS code generation consists of the following major steps 15 | // 16 | // 1. Generate instruction blocks 17 | // the input frep tree is parsed to produce independent sequences of temporary 18 | // SASS instructions (not binary). These are assigned virtual register names, 19 | // which must be assigned to physical registers in the next step. 20 | // 21 | // 2. Schedule instructions and assign physical registers 22 | // 23 | // 3. Generate SASS binary 24 | // With the physical registers assigned, we can now generate the actual binary 25 | // instructions that go into the final ELF executable. 26 | // 27 | // 4. 
Link SASS ELF executable (a "Cubin" module) 28 | // 29 | 30 | #pragma once 31 | #include "../frep.h" 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include "registers.h" 37 | #include "instruction.h" 38 | #include "scheduler.h" 39 | #include "blocks.h" 40 | #include "bytecode.h" 41 | 42 | #if 0 43 | uint64_t get_ctrl_segment(instruction_t i) 44 | { 45 | uint8_t ra,rb,rc,rd; 46 | uint8_t reuse; // register reuse flags 47 | uint8_t yield; // can relinquish control to other warp or not 48 | uint8_t stall; // number of cycles to wait before continuing 49 | uint8_t wrtdb; // write dependencies 50 | uint8_t readb; // read dependencies 51 | uint8_t watdb; // wait dependencies 52 | } 53 | 54 | void *frep_compile_to_sass(frep_t *tree, size_t *length) 55 | { 56 | static const uint8_t header[] = { 57 | 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x33, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 58 | 0x01, 0x00, 0xbe, 0x00, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 59 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 60 | 0x3c, 0x05, 0x3c, 0x00, 0x40, 0x00, 0x38, 0x00, 0x00, 0x00, 0x40, 0x00, 0x09, 0x00, 0x01, 0x00, 61 | 0x00, 0x2e, 0x73, 0x68, 0x73, 0x74, 0x72, 0x74, 0x61, 0x62, 0x00, 0x2e, 0x73, 0x74, 0x72, 0x74, 62 | 0x61, 0x62, 0x00, 0x2e, 0x73, 0x79, 0x6d, 0x74, 0x61, 0x62, 0x00, 0x2e, 0x73, 0x79, 0x6d, 0x74, 63 | 0x61, 0x62, 0x5f, 0x73, 0x68, 0x6e, 0x64, 0x78, 0x00, 0x2e, 0x6e, 0x76, 0x2e, 0x69, 0x6e, 0x66, 64 | 0x6f, 0x00, 0x2e, 0x74, 0x65, 0x78, 0x74, 0x2e, 0x74, 0x72, 0x65, 0x65, 0x00, 0x2e, 0x6e, 0x76, 65 | 0x2e, 0x69, 0x6e, 0x66, 0x6f, 0x2e, 0x74, 0x72, 0x65, 0x65, 0x00, 0x2e, 0x6e, 0x76, 0x2e, 0x63, 66 | 0x61, 0x6c, 0x6c, 0x67, 0x72, 0x61, 0x70, 0x68, 0x00, 0x2e, 0x6e, 0x76, 0x2e, 0x70, 0x72, 0x6f, 67 | 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x00, 0x00, 0x2e, 0x73, 0x68, 0x73, 0x74, 0x72, 0x74, 0x61, 68 | 0x62, 0x00, 0x2e, 0x73, 0x74, 0x72, 0x74, 0x61, 0x62, 0x00, 0x2e, 0x73, 0x79, 0x6d, 0x74, 0x61, 69 | 0x62, 0x00, 0x2e, 0x73, 0x79, 0x6d, 0x74, 0x61, 0x62, 0x5f, 0x73, 0x68, 0x6e, 0x64, 0x78, 0x00, 70 | 0x2e, 0x6e, 0x76, 0x2e, 0x69, 0x6e, 0x66, 0x6f, 0x00, 0x74, 0x72, 0x65, 0x65, 0x00, 0x2e, 0x74, 71 | 0x65, 0x78, 0x74, 0x2e, 0x74, 0x72, 0x65, 0x65, 0x00, 0x2e, 0x6e, 0x76, 0x2e, 0x69, 0x6e, 0x66, 72 | 0x6f, 0x2e, 0x74, 0x72, 0x65, 0x65, 0x00, 0x23, 0x66, 0x66, 0x66, 0x66, 0x00, 0x2e, 0x6e, 0x76, 73 | 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x67, 0x72, 0x61, 0x70, 0x68, 0x00, 0x2e, 0x6e, 0x76, 0x2e, 0x70, 74 | 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 75 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 76 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x03, 0x00, 0x08, 0x00, 77 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 78 | 0x56, 0x00, 0x00, 0x00, 0x03, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 79 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, 0x03, 0x00, 0x07, 0x00, 80 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 81 | 0x32, 0x00, 0x00, 0x00, 0x12, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 82 | 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x2f, 0x08, 0x00, 0x04, 0x00, 0x00, 0x00, 83 | 0x07, 0x00, 0x00, 0x00, 0x04, 0x23, 0x08, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 84 | 0x04, 
0x11, 0x08, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x30, 0x00, 0x00, 85 | 0x01, 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 86 | 0xfe, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0xfd, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 87 | 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 88 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 89 | }; 90 | 91 | static const uint8_t footer[] = { 92 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 93 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 94 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 95 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 96 | 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 97 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 98 | 0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 99 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 100 | 0x0b, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 101 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 102 | 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 103 | 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 104 | 0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 105 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 106 | 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 107 | 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 108 | 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 109 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 110 | 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 111 | 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 112 | 0x3d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 113 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xbc, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 114 | 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 115 | 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 116 | 0x4b, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 117 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc4, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 118 | 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 119 | 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 120 | 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 121 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0xdc, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 122 | 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 123 | 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 124 | 0x32, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 125 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 126 | 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x07, 127 | 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 128 | }; 129 | 130 | using namespace backend_sass; 131 | instruction_blocks_t blocks = generate_sass_blocks(tree); 132 | 133 | int num_instructions; 134 | instruction_t *instructions = schedule_blocks(blocks, &num_instructions); 135 | 136 | size_t sizeof_cubin = sizeof(header) + sizeof(footer) + sizeof_sass; 137 | uint8_t *cubin = (uint8_t*)malloc(sizeof_cubin); 138 | memcpy(cubin, header, sizeof(header)); 139 | uint64_t *instruction_bin = cubin + sizeof(header); 140 | for (size_t i = 0; i < num_instructions; i++) 141 | { 142 | instruction_t i1 = instructions[i]; 143 | instruction_t i2 = instructions[i]; 144 | instruction_t i3 = instructions[i]; 145 | instruction_t instruction = instructions[i]; 146 | switch (instruction.type) 147 | { 148 | case INSTRUCTION_FFMA: FFMA(d, a, b, c, FFMA_FTZ); break; 149 | case INSTRUCTION_FMUL: FMUL(d, a, b, FMUL_FTZ); break; 150 | case INSTRUCTION_FADD: FADD(d, a, b, FADD_FTZ); break; 151 | case INSTRUCTION_FFMA20I: FFMA20I(d, a, imm_b, c, FFMA_FTZ); break; 152 | case INSTRUCTION_FMUL20I: FMUL20I(d, a, imm_b, FMUL_FTZ); break; 153 | case INSTRUCTION_FADD20I: FADD20I(d, a, imm_b, FADD_FTZ); break; 154 | case INSTRUCTION_FADD20I_ABS_A: FADD20I(d, a, imm_b, FADD_FTZ|FADD_ABS_A); break; 155 | case INSTRUCTION_FMIN: FMIN(d, a, b, FMNMX_FTZ); break; 156 | case INSTRUCTION_FMAX: FMAX(d, a, b, FMNMX_FTZ); break; 157 | case INSTRUCTION_FMAX_NEG_B: FMIN(d, a, b, FMNMX_FTZ|FMNMX_NEG_B); break; 158 | case INSTRUCTION_SQRT: MUFU_SQRT(d, a); break; 159 | default: assert(false && "Unknown instruction type"); 160 | } 161 | } 162 | memcpy(cubin, footer, sizeof(footer)); 163 | 164 | assert(cubin); 165 | } 166 | #endif 167 | -------------------------------------------------------------------------------- /src/sass_6_x/blocks.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace backend_sass { 4 | 5 | #define CLEAR() memset(&block->instructions[block->num_instructions], 0, sizeof(instruction_t)) 6 | #define TYPE(Expression) block->instructions[block->num_instructions].type = INSTRUCTION_##Expression 7 | #define RA(Expression) block->instructions[block->num_instructions].a = REGISTER_##Expression 8 | #define RB(Expression) block->instructions[block->num_instructions].b = REGISTER_##Expression 9 | #define RC(Expression) block->instructions[block->num_instructions].c = REGISTER_##Expression 10 | #define RD(Expression) block->instructions[block->num_instructions].d = REGISTER_##Expression 11 | #define STALL(Expression) block->instructions[block->num_instructions].stall = Expression; 12 | #define IMMB(Expression) block->instructions[block->num_instructions].imm_b = Expression; 13 | #define NEXT() block->num_instructions++; assert(block->num_instructions <= MAX_INSTRUCTIONS_PER_BLOCK); 14 | 15 | #if 0 // sequential transform code 16 | // (x,y,z) = 
R_root_to_this*((x0,y0,z0) - T_this_rel_root) 17 | // = Rz(rz)*Ry(ry)*Rx(rx)*((x0-tx, y0-ty, z0-tz)) 18 | void emit_transform(instruction_block_t *block, frep_mat3_t R_root_to_this, frep_vec3_t T_this_rel_root) 19 | { 20 | // Convert to final rotation into euler angles 21 | // (need less registers to do three sequential 22 | // euler rotations, than a full 3x3 matrix multiply, I think...?) 23 | float rx,ry,rz; 24 | frep_so3_to_ypr(R_root_to_this, &rz, &ry, &rx); 25 | float tx = T_this_rel_root[0]; 26 | float ty = T_this_rel_root[1]; 27 | float tz = T_this_rel_root[2]; 28 | float cx = cosf(rx); float sx = sinf(rx); 29 | float cy = cosf(ry); float sy = sinf(ry); 30 | float cz = cosf(rz); float sz = sinf(rz); 31 | // translate: 32 | CLEAR(); TYPE(FADD20I); RD(X); RA(X0); IMMB(-tx); NEXT(); // FADD x, x0, (-tx) 33 | CLEAR(); TYPE(FADD20I); RD(Y); RA(Y0); IMMB(-ty); NEXT(); // FADD y, y0, (-ty) 34 | CLEAR(); TYPE(FADD20I); RD(Z); RA(Z0); IMMB(-tz); NEXT(); // FADD z, z0, (-tz) 35 | // rotate_x: x=x, y=c*y - s*z, z=s*y + c*z 36 | CLEAR(); TYPE(FMUL20I); RD(W); RA(Y); IMMB(+sx); NEXT(); // FMUL w, y, (s) 37 | CLEAR(); TYPE(FMUL20I); RD(Y); RA(Y); IMMB(+cx); NEXT(); // FMUL y, y.reuse, (c) 38 | CLEAR(); TYPE(FFMA20I); RD(Y); RA(Z); IMMB(-sx); RC(Y); NEXT(); // FFMA y, z, (-s), y 39 | CLEAR(); TYPE(FFMA20I); RD(Z); RA(Z); IMMB(+cx); RC(W); NEXT(); // FFMA z, z.reuse, (c), w 40 | // rotate_y: x=c*x + s*z, y=y, z=-s*x + c*z 41 | CLEAR(); TYPE(FMUL20I); RD(W); RA(X); IMMB(-sy); NEXT(); // FMUL w, x, (-s) 42 | CLEAR(); TYPE(FMUL20I); RD(X); RA(X); IMMB(+cy); NEXT(); // FMUL x, x.reuse, (c) 43 | CLEAR(); TYPE(FFMA20I); RD(X); RA(Z); IMMB(+sy); RC(X); NEXT(); // FFMA x, z, (s), x 44 | CLEAR(); TYPE(FFMA20I); RD(Z); RA(Z); IMMB(+cy); RC(W); NEXT(); // FFMA z, z.reuse, (c), w 45 | // rotate_z: x=c*x - s*y, y=s*x + c*y, z=z 46 | CLEAR(); TYPE(FMUL20I); RD(W); RA(X); IMMB(+sz); NEXT(); // FMUL w, x, (s) 47 | CLEAR(); TYPE(FMUL20I); RD(X); RA(X); IMMB(+cz); NEXT(); // FMUL x, x.reuse, (c) 48 | CLEAR(); TYPE(FFMA20I); RD(X); RA(Y); IMMB(-sz); RC(X); NEXT(); // FFMA x, y, (-s), x 49 | CLEAR(); TYPE(FFMA20I); RD(Y); RA(Y); IMMB(+cz); RC(W); NEXT(); // FFMA y, y.reuse, (c), w 50 | } 51 | #else 52 | void emit_transform(instruction_block_t *block, frep_mat3_t R/*_root_to_this*/, frep_vec3_t T/*_this_rel_root*/) 53 | { 54 | // This path is a stall-count optimized version of the above. 55 | // The generated code computes the following: 56 | // (x,y,z) = R_root_to_this*((x0,y0,z0) - T_this_rel_root) 57 | // x = R00*(x0-Tx) + R01*(y0-Ty) + R02*(z0-Tz) 58 | // = R00*x0 + R01*y0 + R02*z0 + (-R00*Tx - R01*Ty - R02*Tz) 59 | // = R00*x0 + R01*y0 + R02*z0 + dx 60 | // etc... 61 | 62 | float dx = -(R.at(0,0)*T[0] + R.at(0,1)*T[1] + R.at(0,2)*T[2]); 63 | float dy = -(R.at(1,0)*T[0] + R.at(1,1)*T[1] + R.at(1,2)*T[2]); 64 | float dz = -(R.at(2,0)*T[0] + R.at(2,1)*T[1] + R.at(2,2)*T[2]); 65 | 66 | CLEAR(); TYPE(FADD20I); RD(X); RA(RZ); IMMB(dx); STALL(1); NEXT(); // 1 FADD x, RZ, dx 67 | CLEAR(); TYPE(FADD20I); RD(Y); RA(RZ); IMMB(dy); STALL(1); NEXT(); // 1 FADD y, RZ, dy 68 | CLEAR(); TYPE(FADD20I); RD(Z); RA(RZ); IMMB(dz); STALL(4); NEXT(); // 4 FADD z, RZ, dz 69 | CLEAR(); TYPE(FFMA20I); RD(X); RA(X0); IMMB(R.at(0,0)); RC(X); STALL(1); NEXT(); // 1 FFMA x, x0, (R00), x // Q) Why not have dx here? 
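// Note: the 1/1/4 stall pattern in this sequence interleaves the three
// dependency chains (x, y, z), so consecutive instructions within one chain
// issue 1+1+4 = 6 cycles apart; this is assumed to match the dependent-issue
// latency of FADD/FFMA on these targets, so each chain's previous result is
// ready just in time without longer stalls.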
70 | CLEAR(); TYPE(FFMA20I); RD(Y); RA(X0); IMMB(R.at(1,0)); RC(Y); STALL(1); NEXT(); // 1 FFMA y, x0, (R10), y 71 | CLEAR(); TYPE(FFMA20I); RD(Z); RA(X0); IMMB(R.at(2,0)); RC(Z); STALL(4); NEXT(); // 4 FFMA z, x0, (R20), z 72 | CLEAR(); TYPE(FFMA20I); RD(X); RA(Y0); IMMB(R.at(0,1)); RC(X); STALL(1); NEXT(); // 1 FFMA x, y0, (R01), x 73 | CLEAR(); TYPE(FFMA20I); RD(Y); RA(Y0); IMMB(R.at(1,1)); RC(Y); STALL(1); NEXT(); // 1 FFMA y, y0, (R11), y 74 | CLEAR(); TYPE(FFMA20I); RD(Z); RA(Y0); IMMB(R.at(2,1)); RC(Z); STALL(4); NEXT(); // 4 FFMA z, y0, (R21), z 75 | CLEAR(); TYPE(FFMA20I); RD(X); RA(Z0); IMMB(R.at(0,2)); RC(X); STALL(1); NEXT(); // 1 FFMA x, z0, (R02), x 76 | CLEAR(); TYPE(FFMA20I); RD(Y); RA(Z0); IMMB(R.at(1,2)); RC(Y); STALL(1); NEXT(); // 1 FFMA y, z0, (R12), y 77 | CLEAR(); TYPE(FFMA20I); RD(Z); RA(Z0); IMMB(R.at(2,2)); RC(Z); STALL(4); NEXT(); // 4 FFMA z, z0, (R22), z 78 | } 79 | #endif 80 | 81 | // cylinder: max(sqrt(x*x + z*z) - R, abs(y)-H) 82 | void emit_cylinder(instruction_block_t *block, frep_mat3_t R, frep_vec3_t T, float r, float h) 83 | { 84 | emit_transform(block, R, T); 85 | CLEAR(); TYPE(FMUL); RD(W); RA(X); RB(X); NEXT(); // FMUL w, x, x 86 | CLEAR(); TYPE(FFMA); RD(W); RA(Z); RB(Z); RC(W); NEXT(); // FFMA w, z, z, w 87 | CLEAR(); TYPE(SQRT); RD(W); RA(W); RB(W); NEXT(); // SQRT w, w 88 | CLEAR(); TYPE(FADD20I_ABS_A); RD(Y); RA(Y); IMMB(-h); NEXT(); // FADD y, |y|, -H 89 | CLEAR(); TYPE(FADD20I); RD(W); RA(W); IMMB(-r); NEXT(); // FADD w, w, -R 90 | CLEAR(); TYPE(FMAX); RD(D); RA(W); RB(Y); NEXT(); // FMAX d, w, y 91 | } 92 | 93 | // sphere: sqrt(x*x + y*y + z*z) - R 94 | void emit_sphere(instruction_block_t *block, frep_mat3_t R, frep_vec3_t T, float r) 95 | { 96 | #if 1 97 | CLEAR(); TYPE(FADD20I); RD(X); RA(X0); IMMB(-T[0]); STALL(1); NEXT(); // 1 FADD x, x0, (-tx) 98 | CLEAR(); TYPE(FADD20I); RD(Y); RA(Y0); IMMB(-T[1]); STALL(1); NEXT(); // 1 FADD y, y0, (-ty) 99 | CLEAR(); TYPE(FADD20I); RD(Z); RA(Z0); IMMB(-T[2]); STALL(4); NEXT(); // 4 FADD z, z0, (-tz) 100 | CLEAR(); TYPE(FMUL); RD(W); RA(X); RB(X); NEXT(); // 6 FMUL w, x, x 101 | CLEAR(); TYPE(FFMA); RD(W); RA(Y); RB(Y); RC(W); NEXT(); // 6 FFMA w, y, y, w 102 | CLEAR(); TYPE(FFMA); RD(W); RA(Z); RB(Z); RC(W); NEXT(); // 6 FFMA w, z, z, w 103 | CLEAR(); TYPE(SQRT); RD(W); RA(W); RB(W); NEXT(); // 8 SQRT w, w 104 | CLEAR(); TYPE(FADD20I); RD(D); RA(W); IMMB(-r); NEXT(); // 6 FADD d, w, -R 105 | #else 106 | emit_transform(block, R, T); 107 | CLEAR(); TYPE(FMUL); RD(W); RA(X); RB(X); NEXT(); // FMUL w, x, x 108 | CLEAR(); TYPE(FFMA); RD(W); RA(Y); RB(Y); RC(W); NEXT(); // FFMA w, y, y, w 109 | CLEAR(); TYPE(FFMA); RD(W); RA(Z); RB(Z); RC(W); NEXT(); // FFMA w, z, z, w 110 | CLEAR(); TYPE(SQRT); RD(W); RA(W); RB(W); NEXT(); // SQRT w, w 111 | CLEAR(); TYPE(FADD20I); RD(D); RA(W); IMMB(-r); NEXT(); // FADD d, w, -R 112 | #endif 113 | } 114 | 115 | void emit_box(instruction_block_t *block, frep_mat3_t R, frep_vec3_t T, float bx, float by, float bz) 116 | { 117 | assert(false && "fBox is not implemented yet"); 118 | } 119 | 120 | // box: max(max(|x|-wx, |y|-wy), |z|-wz) 121 | void emit_box_cheap(instruction_block_t *block, frep_mat3_t R, frep_vec3_t T, float bx, float by, float bz) 122 | { 123 | emit_transform(block, R, T); 124 | CLEAR(); TYPE(FADD20I_ABS_A); RD(X); RA(X); IMMB(-bx); STALL(1); NEXT(); // 1 FADD x, |x|, -wx 125 | CLEAR(); TYPE(FADD20I_ABS_A); RD(Y); RA(Y); IMMB(-by); STALL(1); NEXT(); // 1 FADD y, |y|, -wy 126 | CLEAR(); TYPE(FADD20I_ABS_A); RD(Z); RA(Z); IMMB(-bz); STALL(5); NEXT(); // 5 FADD 
z, |z|, -wz 127 | CLEAR(); TYPE(FMAX); RD(W); RA(X); RB(Y); NEXT(); // 6 FMAX w, x, y 128 | CLEAR(); TYPE(FMAX); RD(D); RA(W); RB(Z); NEXT(); // 6 FMAX d, w, z 129 | } 130 | 131 | void emit_plane(instruction_block_t *block, frep_mat3_t R, frep_vec3_t T, float px) 132 | { 133 | #if 0 134 | // optimized version 135 | float rx,ry,rz; 136 | frep_so3_to_ypr(R, &rz, &ry, &rx); 137 | float cx = cosf(rx); float sx = sinf(rx); 138 | float cy = cosf(ry); float sy = sinf(ry); 139 | float cz = cosf(rz); float sz = sinf(rz); 140 | float rtx = -((cy*cz)*T[0] + (cz*sx*sy - cx*sz)*T[1] + (sx*sz + cx*cz*sy)*T[2]); 141 | 142 | CLEAR(); TYPE(FMUL20I); RD(X); RA(X0); IMMB((cy*cz)); NEXT(); // 6 FMUL x, x0, (cy*cz) 143 | CLEAR(); TYPE(FFMA20I); RD(X); RA(Y0); IMMB((cz*sx*sy-cx*sz)); RC(X); NEXT(); // 6 FFMA x, y0, (cz*sx*sy-cx*sz), x 144 | CLEAR(); TYPE(FFMA20I); RD(X); RA(Z0); IMMB((sx*sz + cx*cz*sy)); RC(X); NEXT(); // 6 FFMA x, z0, (sx*sz + cx*cz*sy), x 145 | CLEAR(); TYPE(FADD20I); RD(D); RA(X); IMMB(rtx-px); NEXT(); // 6 FADD d, x, rtx-px 146 | #else 147 | emit_transform(block, R, T); 148 | // plane: x - px 149 | CLEAR(); TYPE(FADD20I); RD(D); RA(X); IMMB(-px); NEXT(); // FADD d, x, -px 150 | #endif 151 | } 152 | 153 | void emit_union(instruction_block_t *block) { CLEAR(); TYPE(FMIN); RD(D); RA(D_LEFT); RB(D_RIGHT); NEXT(); } 154 | void emit_intersect(instruction_block_t *block) { CLEAR(); TYPE(FMAX); RD(D); RA(D_LEFT); RB(D_RIGHT); NEXT(); } 155 | void emit_subtract(instruction_block_t *block) { CLEAR(); TYPE(FMAX_NEG_B); RD(D); RA(D_LEFT); RB(D_RIGHT); NEXT(); } 156 | void emit_blend(instruction_block_t *block, float alpha) 157 | { 158 | // blend: alpha*d_left + (1-alpha)*d_right 159 | CLEAR(); TYPE(FMUL20I); RD(D); RA(D_LEFT); IMMB(alpha); NEXT(); // FMUL d, d_left, (alpha) 160 | CLEAR(); TYPE(FFMA20I); RD(D); RA(D_RIGHT); IMMB(1.0f-alpha); RC(D); NEXT(); // FFMA d, d_right, (1-alpha), d 161 | } 162 | 163 | #undef TYPE 164 | #undef RA 165 | #undef RB 166 | #undef RC 167 | #undef RD 168 | #undef IMMB 169 | #undef NEXT 170 | #undef STALL 171 | #undef CLEAR 172 | 173 | void _generate_blocks( 174 | instruction_blocks_t *s, 175 | frep_t *node, 176 | int destination=0, 177 | frep_mat3_t R_root_to_parent=frep_identity_3x3, 178 | frep_vec3_t T_parent_rel_root=frep_null_3x1) 179 | // You can do much smarter register allocation here. The register allocation 180 | // may also need to change if we do smarter scheduling. E.g. block reordering. 
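// As a concrete illustration of the current scheme (a sketch of what the recursion
// below produces, assuming the fOpUnion/fSphere/fBoxCheap builders from frep_builder.h):
// for fOpUnion(fSphere(r), fBoxCheap(wx,wy,wz)) the left subtree is emitted first with
// destination 0, the right subtree with destination 1, and the union block last with
// d=0, d_left=0, d_right=1. The final distance therefore ends up in virtual destination 0,
// which the scheduler maps to physical register 0x07 (which is why the mock test reads
// the result from sim.reg[0x07]).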
181 | { 182 | assert(node); 183 | 184 | frep_mat3_t R_root_to_this; 185 | frep_vec3_t T_this_rel_root; 186 | frep_get_global_transform(node, &R_root_to_this, &T_this_rel_root, R_root_to_parent, T_parent_rel_root); 187 | 188 | if (frep_is_boolean(node)) 189 | { 190 | assert(node->left); 191 | assert(node->right); 192 | 193 | int d_left = destination; 194 | int d_right = destination+1; 195 | _generate_blocks(s, node->left, d_left, R_root_to_this, T_this_rel_root); 196 | _generate_blocks(s, node->right, d_right, R_root_to_this, T_this_rel_root); 197 | 198 | instruction_block_t *b = &s->blocks[s->num_blocks++]; 199 | b->num_instructions = 0; 200 | b->d_left = d_left; 201 | b->d_right = d_right; 202 | b->d = destination; 203 | if (node->opcode == FREP_UNION) emit_union(b); 204 | else if (node->opcode == FREP_INTERSECT) emit_intersect(b); 205 | else if (node->opcode == FREP_SUBTRACT) emit_subtract(b); 206 | else if (node->opcode == FREP_BLEND) emit_blend(b, node->blend.alpha); 207 | assert(s->num_blocks <= MAX_INSTRUCTION_BLOCKS); 208 | } 209 | else if (frep_is_primitive(node)) 210 | { 211 | instruction_block_t *b = &s->blocks[s->num_blocks++]; 212 | b->num_instructions = 0; 213 | frep_mat3_t R = R_root_to_this; 214 | frep_vec3_t T = T_this_rel_root; 215 | b->d = destination; 216 | if (node->opcode == FREP_BOX) emit_box(b, R, T, node->box.width, node->box.height, node->box.depth); 217 | else if (node->opcode == FREP_BOX_CHEAP) emit_box_cheap(b, R, T, node->box.width, node->box.height, node->box.depth); 218 | else if (node->opcode == FREP_SPHERE) emit_sphere(b, R, T, node->sphere.radius); 219 | else if (node->opcode == FREP_CYLINDER) emit_cylinder(b, R, T, node->cylinder.radius, node->cylinder.height); 220 | else if (node->opcode == FREP_PLANE) emit_plane(b, R, T, node->plane.offset); 221 | assert(s->num_blocks <= MAX_INSTRUCTION_BLOCKS); 222 | } 223 | else 224 | { 225 | assert(false && "Unexpected node type"); 226 | } 227 | } 228 | 229 | instruction_blocks_t generate_blocks(frep_t *node) 230 | // This function generates a list of instruction blocks that evaluates the 231 | // tree and stores the resulting distance value in register[0]. Each block 232 | // is assigned registers during the recursive tree parsing. 
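// Typical usage (a minimal sketch mirroring test/backend_sass_6_x_mock.cpp):
//   instruction_blocks_t blocks = generate_blocks(tree);
//   int num_instructions;
//   instruction_t *instructions = schedule_blocks(blocks, &num_instructions);
//   // ...each instruction can then be encoded into 64-bit SASS words (see sass_6_x/bytecode.h)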
233 | { 234 | assert(node); 235 | 236 | static instruction_block_t _blocks[MAX_INSTRUCTION_BLOCKS]; 237 | instruction_blocks_t s = {0}; 238 | s.blocks = _blocks; 239 | s.num_blocks = 0; 240 | 241 | _generate_blocks(&s, node); 242 | 243 | return s; 244 | } 245 | 246 | } 247 | -------------------------------------------------------------------------------- /src/sass_6_x/bytecode.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace backend_sass { 4 | 5 | // 6 | // Instruction flags 7 | // 8 | // Meaning: 9 | // FTZ = Flush to zero 10 | // NEG_A = Use negated value of a operand 11 | // NEG_B = Use negated value of b operand 12 | // ABS_A = Use absolute value of a operand 13 | // ABS_B = Use absolute value of b operand 14 | uint64_t FADD_FTZ = 0x0000100000000000; 15 | uint64_t FADD_NEG_A = 0x0001000000000000; 16 | uint64_t FADD_NEG_B = 0x0000200000000000; 17 | uint64_t FADD_ABS_A = 0x0000400000000000; 18 | uint64_t FADD_ABS_B = 0x0002000000000000; 19 | uint64_t FADD32I_FTZ = 0x0080000000000000; 20 | uint64_t FADD32I_ABS_A = 0x0040000000000000; 21 | uint64_t FMUL_FTZ = 0x0000100000000000; 22 | uint64_t FMUL_NEG_B = 0x0001000000000000; 23 | uint64_t FMUL32I_FTZ = 0x0020000000000000; 24 | uint64_t FMNMX_FTZ = 0x0000100000000000; 25 | uint64_t FMNMX_NEG_A = 0x0001000000000000; 26 | uint64_t FMNMX_NEG_B = 0x0000200000000000; 27 | uint64_t FMNMX_ABS_A = 0x0000400000000000; 28 | uint64_t FMNMX_ABS_B = 0x0002000000000000; 29 | uint64_t FFMA_FTZ = 0x0020000000000000; 30 | uint64_t FFMA_NEG_B = 0x0001000000000000; 31 | uint64_t FFMA_NEG_C = 0x0002000000000000; 32 | 33 | // FADD d, a, b 34 | // d = a+b 35 | uint64_t FADD(uint8_t d, uint8_t a, uint8_t b, uint64_t flags) { 36 | uint64_t RD = (uint64_t)(d) << 0; 37 | uint64_t RA = (uint64_t)(a) << 8; 38 | uint64_t RB = (uint64_t)(b) << 20; 39 | return 0x5c58000000070000 | flags | RB | RA | RD; 40 | } 41 | 42 | // FADD d, -a, -RZ 43 | // d = a+b 44 | uint64_t NEG(uint8_t d, uint8_t a, uint64_t flags) { 45 | uint64_t RD = (uint64_t)(d) << 0; 46 | uint64_t RA = (uint64_t)(a) << 8; 47 | // todo: why is NEG_B flag set? 48 | return 0x5c5930000ff70000 | flags | RA | RD; 49 | } 50 | 51 | // FADD d, a, b immediate 52 | // d = a+b 53 | uint64_t FADD20I(uint8_t d, uint8_t a, float b, uint64_t flags) { 54 | uint64_t b_u64 = *(uint64_t*)&b; 55 | uint64_t sgn_b = b_u64 & 0x0000000080000000; 56 | uint64_t NEG_B = sgn_b ? 0x0100000000000000 : 0x0; 57 | uint64_t B = ((b_u64 & 0x000000007FFFF000) >> 12) << 20; 58 | uint64_t RA = (uint64_t)(a) << 8; 59 | uint64_t RD = (uint64_t)(d) << 0; 60 | return 0x3858000000070000 | flags | NEG_B | B | RA | RD; 61 | } 62 | 63 | // FADD32I d, a, b immediate 64 | // d = a+b 65 | uint64_t FADD32I(uint8_t d, uint8_t a, float b, uint64_t flags) { 66 | uint64_t b_u64 = *(uint64_t*)&b; 67 | uint64_t sgn_b = b_u64 & 0x0000000080000000; 68 | uint64_t NEG_B = sgn_b ? 
0x0008000000000000 : 0x0; 69 | uint64_t B = (b_u64 & 0x000000007FFFFFFF) << 20; 70 | uint64_t RA = (uint64_t)(a) << 8; 71 | uint64_t RD = (uint64_t)(d) << 0; 72 | return 0x0880000000070000 | flags | NEG_B | B | RA | RD; 73 | } 74 | 75 | // FTF.FTZ.F32.F32.FLOOR d, b 76 | // d = floor(b) 77 | uint64_t FLOOR32F(uint8_t d, uint8_t b) { 78 | uint64_t RB = (uint64_t)(b) << 20; 79 | uint64_t RD = (uint64_t)(d) << 0; 80 | return 0x5ca8148000070a00 | RB | RD; 81 | } 82 | 83 | // FMUL32I d, a, b immediate 84 | // d = a*b 85 | uint64_t FMUL32I(uint8_t d, uint8_t a, float b, uint64_t flags) { 86 | uint64_t b_u64 = *(uint64_t*)&b; 87 | uint64_t sgn_b = b_u64 & 0x0000000080000000; 88 | uint64_t NEG_B = sgn_b ? 0x0008000000000000 : 0x0; 89 | uint64_t B = (b_u64 & 0x000000007FFFFFFF) << 20; 90 | uint64_t RA = (uint64_t)(a) << 8; 91 | uint64_t RD = (uint64_t)(d) << 0; 92 | return 0x1e00000000070000 | flags | NEG_B | B | RA | RD; 93 | } 94 | 95 | // FMUL d, a, b immediate 96 | // d = a*b 97 | uint64_t FMUL20I(uint8_t d, uint8_t a, float b, uint64_t flags) { 98 | uint64_t b_u64 = *(uint64_t*)&b; 99 | uint64_t sgn_b = b_u64 & 0x0000000080000000; 100 | uint64_t NEG_B = sgn_b ? 0x0100000000000000 : 0x0; 101 | uint64_t B = ((b_u64 & 0x000000007FFFF000) >> 12) << 20; 102 | uint64_t RA = (uint64_t)(a) << 8; 103 | uint64_t RD = (uint64_t)(d) << 0; 104 | return 0x3868000000070000 | flags | NEG_B | B | RA | RD; 105 | } 106 | 107 | // FMUL d, a, b 108 | // d = a*b 109 | uint64_t FMUL(uint8_t d, uint8_t a, uint8_t b, uint64_t flags) { 110 | uint64_t RD = (uint64_t)(d) << 0; 111 | uint64_t RA = (uint64_t)(a) << 8; 112 | uint64_t RB = (uint64_t)(b) << 20; 113 | return 0x5c68000000070000 | flags | RB | RA | RD; 114 | } 115 | 116 | // FFMA d, a, b, c 117 | // d = a*b + c 118 | uint64_t FFMA(uint8_t d, uint8_t a, uint8_t b, uint8_t c, uint64_t flags) { 119 | uint64_t RD = (uint64_t)(d) << 0; 120 | uint64_t RA = (uint64_t)(a) << 8; 121 | uint64_t RB = (uint64_t)(b) << 20; 122 | uint64_t RC = (uint64_t)(c) << 39; 123 | return 0x5980000000070000 | flags | RC | RB | RA | RD; 124 | } 125 | 126 | // FFMA d, a, b immediate, c 127 | // d = a*b + c 128 | uint64_t FFMA20I(uint8_t d, uint8_t a, float b, uint8_t c, uint64_t flags) { 129 | uint64_t b_u64 = *(uint64_t*)&b; 130 | uint64_t sgn_b = b_u64 & 0x0000000080000000; 131 | uint64_t NEG_B = sgn_b ? 0x0100000000000000 : 0x0; 132 | uint64_t B = ((b_u64 & 0x000000007FFFF000) >> 12) << 20; 133 | uint64_t RC = (uint64_t)(c) << 39; 134 | uint64_t RA = (uint64_t)(a) << 8; 135 | uint64_t RD = (uint64_t)(d) << 0; 136 | return 0x3280000000070000 | flags | NEG_B | RC | B | RA | RD; 137 | } 138 | 139 | // FMNMX d, a, b, !PT 140 | // d = max(a,b) 141 | uint64_t FMAX(uint8_t d, uint8_t a, uint8_t b, uint64_t flags) { 142 | uint64_t RD = (uint64_t)(d) << 0; 143 | uint64_t RA = (uint64_t)(a) << 8; 144 | uint64_t RB = (uint64_t)(b) << 20; 145 | return 0x5c60078000070000 | flags | RB | RA | RD; 146 | } 147 | 148 | // FMNMX d, a, b, PT 149 | // d = min(a,b) 150 | uint64_t FMIN(uint8_t d, uint8_t a, uint8_t b, uint64_t flags) { 151 | uint64_t RD = (uint64_t)(d) << 0; 152 | uint64_t RA = (uint64_t)(a) << 8; 153 | uint64_t RB = (uint64_t)(b) << 20; 154 | return 0x5c60038000070000 | flags | RB | RA | RD; 155 | } 156 | 157 | // FMNMX d, a, b immediate, !PT 158 | // d = min(a,b) 159 | uint64_t FMAX20I(uint8_t d, uint8_t a, float b, uint64_t flags) { 160 | uint64_t b_u64 = *(uint64_t*)&b; 161 | uint64_t sgn_b = b_u64 & 0x0000000080000000; 162 | uint64_t NEG_B = sgn_b ? 
0x0100000000000000 : 0x0; 163 | uint64_t B = ((b_u64 & 0x000000007FFFF000) >> 12) << 20; 164 | uint64_t RA = (uint64_t)(a) << 8; 165 | uint64_t RD = (uint64_t)(d) << 0; 166 | return 0x3860078000070000 | NEG_B | flags | B | RA | RD; 167 | } 168 | 169 | // FMNMX d, a, b immediate, PT 170 | // d = min(a,b) 171 | uint64_t FMIN20I(uint8_t d, uint8_t a, float b, uint64_t flags) { 172 | uint64_t b_u64 = *(uint64_t*)&b; 173 | uint64_t sgn_b = b_u64 & 0x0000000080000000; 174 | uint64_t NEG_B = sgn_b ? 0x0100000000000000 : 0x0; 175 | uint64_t B = ((b_u64 & 0x000000007FFFF000) >> 12) << 20; 176 | uint64_t RA = (uint64_t)(a) << 8; 177 | uint64_t RD = (uint64_t)(d) << 0; 178 | return 0x3860038000070000 | NEG_B | flags | B | RA | RD; 179 | } 180 | 181 | // MUFU.SQRT d, a 182 | // d = sqrt(a) 183 | uint64_t MUFU_SQRT(uint8_t d, uint8_t a) { 184 | uint64_t RD = (uint64_t)(d) << 0; 185 | uint64_t RA = (uint64_t)(a) << 8; 186 | return 0x5080000000870000 | RA | RD; 187 | } 188 | 189 | // NOP should be issued along with --:-:-:Y:0 control codes 190 | uint64_t NOP() { return 0x50b0000000070f00; } 191 | // RET should be issued along with --:-:-:-:f control codes 192 | uint64_t RET() { return 0xe32000000007000f; } 193 | 194 | struct control_flags_t 195 | { 196 | uint8_t reuse; 197 | uint8_t yield; 198 | uint8_t stall; 199 | uint8_t wrtdb; 200 | uint8_t readb; 201 | uint8_t watdb; 202 | }; 203 | 204 | static control_flags_t ctrl[3]; 205 | 206 | // watdb:readb:wrtdb:yield:stall [reuse] 207 | // read and write barriers are numbered 1...6 208 | void wait_on_barrier(uint8_t op, uint8_t barrier_number) { 209 | ctrl[op].watdb |= (1 << (barrier_number-1)); 210 | } 211 | void set_write_barrier(uint8_t op, uint8_t barrier_number) { 212 | ctrl[op].wrtdb = barrier_number-1; 213 | } 214 | void set_read_barrier(uint8_t op, uint8_t barrier_number) { 215 | ctrl[op].readb = barrier_number-1; 216 | } 217 | void yield(uint8_t op) { // enables yield on instruction number op 218 | ctrl[op].yield = 0; // zero means enable 219 | } 220 | void stall(uint8_t op, uint8_t count) { 221 | ctrl[op].stall = count; 222 | } 223 | void reuse(uint8_t op, bool ra, bool rb, bool rc, bool rd) { 224 | ctrl[op].reuse = 0; 225 | if (ra) ctrl[op].reuse |= 0x1; 226 | if (rb) ctrl[op].reuse |= 0x2; 227 | if (rc) ctrl[op].reuse |= 0x4; 228 | if (rd) ctrl[op].reuse |= 0x8; 229 | } 230 | void reset_ctrl() { 231 | for (int op = 0; op < 3; op++) 232 | { 233 | ctrl[op].watdb = 0x00; 234 | ctrl[op].readb = 7; 235 | ctrl[op].wrtdb = 7; 236 | ctrl[op].yield = 1; 237 | ctrl[op].stall = 0; 238 | } 239 | } 240 | uint64_t CTRL() { 241 | uint64_t ret = 0; 242 | for (int op = 0; op < 3; op++) { 243 | uint64_t stall = (((uint64_t)ctrl[op].stall) & 0x0f) << 0; 244 | uint64_t yield = (((uint64_t)ctrl[op].yield) & 0x01) << 4; 245 | uint64_t wrtdb = (((uint64_t)ctrl[op].wrtdb) & 0x07) << 5; 246 | uint64_t readb = (((uint64_t)ctrl[op].readb) & 0x07) << 8; 247 | uint64_t watdb = (((uint64_t)ctrl[op].watdb) & 0x3f) << 11; 248 | uint64_t reuse = (((uint64_t)ctrl[op].reuse) & 0x0f) << 17; 249 | uint64_t ctrl = reuse|watdb|readb|wrtdb|yield|stall; 250 | ret |= ctrl << (op*21); 251 | } 252 | return ret; 253 | } 254 | 255 | void print_ctrl_segment(uint64_t x) { 256 | uint8_t stall = (uint8_t)((x & 0x0000f) >> 0); 257 | uint8_t yield = (uint8_t)((x & 0x00010) >> 4); 258 | uint8_t wrtdb = (uint8_t)((x & 0x000e0) >> 5); // 7 = no dependency 259 | uint8_t readb = (uint8_t)((x & 0x00700) >> 8); // 7 = no dependency 260 | uint8_t watdb = (uint8_t)((x & 0x1f800) >> 11); 261 | if (watdb) 
printf("%02x:", watdb); else printf("--:"); 262 | if (readb==7) printf("-:"); else printf("%d:", readb+1); 263 | if (wrtdb==7) printf("-:"); else printf("%d:", wrtdb+1); 264 | if (yield) printf("-:"); else printf("Y:"); 265 | printf("%x", stall); 266 | } 267 | 268 | void print_ctrl(uint64_t x) { 269 | uint64_t ctrl1 = (x & 0x000000000001ffff) >> 0; 270 | uint64_t ctrl2 = (x & 0x0000003fffe00000) >> 21; 271 | uint64_t ctrl3 = (x & 0x07fffc0000000000) >> 42; 272 | uint64_t reuse1 = (x & 0x00000000001e0000) >> 17; 273 | uint64_t reuse2 = (x & 0x000003c000000000) >> 38; 274 | uint64_t reuse3 = (x & 0x7800000000000000) >> 59; 275 | print_ctrl_segment(ctrl1); printf(" | "); 276 | print_ctrl_segment(ctrl2); printf(" | "); 277 | print_ctrl_segment(ctrl3); 278 | } 279 | 280 | } 281 | 282 | /* 283 | Notes 284 | 285 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 286 | IMMEDIATE VALUES 287 | 288 | FADD20I, FMUL20I and FFMA20I are immediate versions of their respective instructions, 289 | except the rightmost 12 bits of the single-precision mantissa are masked to zero. If 290 | you need full 23-bit mantissa precision you can use FADD32I and FMUL32I, which encode 291 | the entire float. FFMA does not have a 32-bit immediate version, but it can load from 292 | constant memory. 293 | 294 | *20I appear to be treated the same (flag-wise) as their non-immediate counterparts. 295 | 296 | FMNMX d, a, b, !PT -> MAX(a,b) 297 | FMNMX d, a, b, PT -> MIN(a,b) 298 | 299 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 300 | REGISTER BANKS 301 | 302 | Maxwell has four register banks per thread. The assignment of registers to banks is easy: 303 | Bank = Register number mod 4 (e.g. R0 and R4 are bank0, R3 and R7 are bank3) 304 | On Maxwell and Pascal, instructions can only access one value from each memory bank? 305 | 306 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 307 | REGISTER REUSE 308 | 309 | Maxwell and Pascal have 4 register reuse caches and 4 source operand slots. Each of the 310 | 4 reuse flag bits correspond to one of the 8-byte slots. The LSB in reuse flags controls 311 | the cache for the first source operand slot (a?), while the MSB is for the fourth. 312 | e.g. instruction dst, op0 ("first"), op1, op2, op3 ("last") 313 | e.g. FFMA.FTZ R3, R4, R4, R0.reuse -> has reuse flag 0100 314 | e.g. 
FFMA.FTZ R3, R4.reuse, R4, R0 -> has reuse flag 0001 315 | */ 316 | 317 | -------------------------------------------------------------------------------- /src/sass_6_x/cubin.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | struct elf64_hdr_t 7 | { 8 | uint8_t magic[4]; 9 | uint8_t fileClass; 10 | uint8_t encoding; 11 | uint8_t fileVersion; 12 | uint8_t padding[9]; 13 | uint16_t type; 14 | uint16_t machine; 15 | uint32_t version; 16 | uint64_t entry; 17 | uint64_t phOffset; 18 | uint64_t shOffset; 19 | uint32_t flags; 20 | uint16_t ehSize; 21 | uint16_t phEntSize; 22 | uint16_t phNum; 23 | uint16_t shEntSize; 24 | uint16_t shNum; 25 | uint16_t shStrIndx; 26 | }; 27 | 28 | struct elf64_prg_hdr_t 29 | { 30 | uint32_t type; 31 | uint32_t flags; 32 | uint64_t offset; 33 | uint64_t vaddr; 34 | uint64_t paddr; 35 | uint64_t fileSize; 36 | uint64_t memSize; 37 | uint64_t align; 38 | }; 39 | 40 | struct elf64_sec_hdr_t 41 | { 42 | uint32_t name; 43 | uint32_t type; 44 | uint64_t flags; 45 | uint64_t addr; 46 | uint64_t offset; 47 | uint64_t size; 48 | uint32_t link; 49 | uint32_t info; 50 | uint64_t align; 51 | uint64_t entSize; 52 | }; 53 | 54 | struct elf64_sym_ent_t 55 | { 56 | uint32_t name; 57 | uint8_t info; 58 | uint8_t other; 59 | uint16_t shIndx; 60 | uint64_t value; 61 | uint64_t size; 62 | }; 63 | 64 | struct cubin_function_t 65 | { 66 | char *name; 67 | char *b; 68 | elf64_sec_hdr_t *h; 69 | elf64_sym_ent_t *e; 70 | 71 | #if 0 72 | uint64_t *instructions() { return (uint64_t*)(b + h->offset); } 73 | int num_instructions() { return (int)(h->size / sizeof(uint64_t)); } 74 | void set_num_instructions(int n) { assert(n >= 0); h->size = n*sizeof(uint64_t); } 75 | #else 76 | // e->value is non-zero if the function is inlined, in which case it describe the 77 | // byte offset of the first instruction in the containing function's instructions. 78 | uint64_t *instructions() { return (uint64_t*)(b + h->offset + e->value); } 79 | int num_instructions() { return (int)(e->size/sizeof(uint64_t)); } 80 | void set_num_instructions(int n) 81 | { 82 | assert(n >= 0); 83 | assert(e->size == h->size && "The function appears to be an inline function. Changing the size of these is beyond the scope of this program."); 84 | e->size = ((uint64_t)n)*sizeof(uint64_t); 85 | h->size = ((uint64_t)n)*sizeof(uint64_t); 86 | } 87 | #endif 88 | 89 | uint8_t register_count() { return (h->info & 0xff000000)>>24; } 90 | void set_register_count(uint8_t n) { h->info = (h->info & 0x00ffffff) | (n<<24); } 91 | }; 92 | 93 | enum { cubin_max_prg_hdrs = 1024 }; 94 | enum { cubin_max_sec_hdrs = 1024 }; 95 | enum { cubin_max_functions = 1024 }; 96 | struct cubin_t 97 | { 98 | int sizeof_binary; 99 | char *binary; 100 | elf64_prg_hdr_t *prg_hdrs[cubin_max_prg_hdrs]; 101 | int num_prg_hdrs; 102 | 103 | elf64_sec_hdr_t *sec_hdrs[cubin_max_sec_hdrs]; 104 | int num_sec_hdrs; 105 | 106 | cubin_function_t functions[cubin_max_functions]; 107 | int num_functions; 108 | 109 | cubin_function_t *get_function(const char *name) 110 | { 111 | for (int i = 0; i < num_functions; i++) 112 | if (strcmp(functions[i].name, name) == 0) 113 | return functions + i; 114 | return NULL; 115 | } 116 | }; 117 | 118 | cubin_t read_cubin(const char *filename) 119 | { 120 | { 121 | uint16_t x = 0xaabb; 122 | uint8_t *p = (uint8_t*)&x; 123 | assert(p[0] == 0xbb && "machine is not little (?) 
endian"); 124 | } 125 | 126 | cubin_t cubin = {0}; 127 | { 128 | FILE *f = fopen(filename, "rb"); 129 | assert(f); 130 | fseek(f, 0, SEEK_END); 131 | long size = ftell(f); 132 | rewind(f); 133 | char *data = new char[size + 1]; 134 | int ok = fread(data, 1, size, f); 135 | assert(ok); 136 | data[size] = 0; 137 | fclose(f); 138 | 139 | cubin.binary = data; 140 | cubin.sizeof_binary = size; 141 | } 142 | assert(cubin.binary); 143 | assert(cubin.sizeof_binary); 144 | 145 | elf64_hdr_t elf_hdr = *(elf64_hdr_t*)cubin.binary; 146 | assert(elf_hdr.fileClass == 2 && "assuming 64-bit ELF"); 147 | assert((elf_hdr.flags & 0xff) == 60 && "assuming sm_60 architecture"); 148 | assert(elf_hdr.flags & 0x400 && "assuming 64-bit addresses"); 149 | assert(elf_hdr.phNum <= cubin_max_prg_hdrs); 150 | assert(elf_hdr.shNum <= cubin_max_sec_hdrs); 151 | 152 | // read program headers 153 | { 154 | char *b = cubin.binary + elf_hdr.phOffset; 155 | for (int i = 0; i < elf_hdr.phNum; i++) 156 | { 157 | cubin.prg_hdrs[cubin.num_prg_hdrs++] = (elf64_prg_hdr_t*)b; 158 | b += elf_hdr.phEntSize; 159 | } 160 | } 161 | 162 | // read section headers 163 | { 164 | char *b = cubin.binary + elf_hdr.shOffset; 165 | for (int i = 0; i < elf_hdr.shNum; i++) 166 | { 167 | cubin.sec_hdrs[cubin.num_sec_hdrs++] = (elf64_sec_hdr_t*)b; 168 | b += elf_hdr.shEntSize; 169 | } 170 | } 171 | 172 | 173 | // find section headers called strtab and shstrtab 174 | char *strtab = NULL; 175 | char *shstrtab = NULL; 176 | for (int i = 0; i < cubin.num_sec_hdrs; i++) 177 | { 178 | elf64_sec_hdr_t *sh = (elf64_sec_hdr_t*)cubin.sec_hdrs[i]; 179 | if (sh->type == 3) 180 | { 181 | char *data = cubin.binary + sh->offset; 182 | char *name = data + sh->name; 183 | if (strcmp(name, ".strtab") == 0) strtab = data; 184 | else if (strcmp(name, ".shstrtab") == 0) shstrtab = data; 185 | 186 | printf("found section \"%s\"\ndata (%d bytes): ", name, sh->size); 187 | for (int j = 0; j < sh->size; j++) 188 | printf("%c", data[j] ? 
data[j] : ' '); 189 | printf("\n\n"); 190 | } 191 | #if 0 192 | else 193 | { 194 | char *name = shstrtab + sh->name; 195 | uint8_t *data = (uint8_t*)(cubin.binary + sh->offset); 196 | printf("found section \"%s\" (type=%x)\ndata(%d bytes):", name, sh->type, sh->size); 197 | for (int j = 0; j < sh->size; j++) 198 | printf("%02x ", data[j]); 199 | printf("\n\n"); 200 | } 201 | #endif 202 | } 203 | assert(strtab); 204 | assert(shstrtab); 205 | 206 | for (int i = 0; i < cubin.num_sec_hdrs; i++) 207 | { 208 | elf64_sec_hdr_t *sh = cubin.sec_hdrs[i]; 209 | if (sh->type == 2) // look for symbol table 210 | { 211 | printf("found symbol table section with these symbols:\n"); 212 | char *data = cubin.binary + sh->offset; 213 | uint64_t offset = 0; 214 | while (offset < sh->size) // go through each symbol entry 215 | { 216 | elf64_sym_ent_t *ent = (elf64_sym_ent_t*)(data + offset); 217 | offset += sh->entSize; 218 | char *name = strtab + ent->name; 219 | 220 | if ((ent->info & 0x0f) == 0x02) // look for symbols tagged FUNC 221 | { 222 | printf("(function) \"%s\"\n", name); 223 | assert(cubin.num_functions < cubin_max_functions); 224 | cubin_function_t func = {0}; 225 | func.name = name; 226 | func.h = cubin.sec_hdrs[ent->shIndx]; 227 | func.b = cubin.binary; 228 | func.e = ent; 229 | cubin.functions[cubin.num_functions++] = func; 230 | 231 | // elf64_sec_hdr_t *ent_sh = cubin.sec_hdrs[ent->shIndx]; 232 | // printf("section header \"%s\"\n", strtab + ent_sh->name); 233 | } 234 | else 235 | { 236 | printf("(other) \"%s\"\n", name); 237 | } 238 | 239 | #if 0 240 | printf("\tinfo:0x%x\n", ent->info); 241 | printf("\tother:0x%x\n", ent->other); 242 | printf("\tvalue:0x%llx\n", ent->value); 243 | printf("\tsize:0x%llx (%llu)\n", ent->size, ent->size); 244 | #endif 245 | } 246 | } 247 | } 248 | 249 | printf("\nfound %d functions\n", cubin.num_functions); 250 | for (int i = 0; i < cubin.num_functions; i++) 251 | { 252 | printf("\"%s\"\n", cubin.functions[i].name); 253 | printf("\tRegister count: %d\n", cubin.functions[i].register_count()); 254 | printf("\tInstructions:\n"); 255 | uint64_t *in = cubin.functions[i].instructions(); 256 | int num_instructions = cubin.functions[i].num_instructions(); 257 | for (int j = 0; j < 10 && j < num_instructions; j++) 258 | printf("\t0x%016llx\n", in[j]); 259 | if (num_instructions > 10) 260 | printf("\t... (%d more instructions)\n", num_instructions - 10); 261 | } 262 | return cubin; 263 | } 264 | 265 | void save_cubin(cubin_t *cubin, const char *filename) 266 | { 267 | FILE *f = fopen(filename, "wb+"); 268 | assert(f); 269 | fwrite(cubin->binary, 1, cubin->sizeof_binary, f); 270 | fclose(f); 271 | } 272 | -------------------------------------------------------------------------------- /src/sass_6_x/instruction.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace backend_sass { 4 | 5 | enum latency_constants_ 6 | { 7 | // All the 32-bit floating point instructions (except sqrt) take exactly 8 | // 6 cycles before the result is written to and valid. Subsequent instructions 9 | // that read from this result must therefore be executed atleast six cycles 10 | // after the first one began. The scheduler tries to fill the gap between one 11 | // instruction and one that depends on its results by looking for others that 12 | // do not depend on its results. 
We conveniently structure our input code into 13 | // 'blocks' that are entirely independent from other blocks, but the instructions 14 | // within a block cannot be reordered. If the scheduler can't find enough 15 | // instructions to fill the pipeline, it will have to insert 'stalls', which 16 | // do nothing for a given number of clock cycles. 17 | LATENCY_X32T = 6, 18 | 19 | // sqrt is a variable latency instruction and needs to set a write barrier 20 | // which dependent instructions must wait on. The later that instruction 21 | // actually does the wait, the more likely it is that the sqrt is finished, 22 | // and the barrier does not incur a stall. We work under the assumption that 23 | // sqrt finishes after 'LATENCY_SQRT' cycles. 24 | LATENCY_SQRT = 8, 25 | 26 | // Setting the write barrier takes non-zero clock cycles. 27 | LATENCY_WRTDB = 1, 28 | }; 29 | 30 | enum instruction_type_t 31 | { 32 | INSTRUCTION_FFMA, 33 | INSTRUCTION_FMUL, 34 | INSTRUCTION_FADD, 35 | INSTRUCTION_FFMA20I, 36 | INSTRUCTION_FMUL20I, 37 | INSTRUCTION_FADD20I, 38 | INSTRUCTION_FADD20I_ABS_A, 39 | INSTRUCTION_FMIN, 40 | INSTRUCTION_FMAX, 41 | INSTRUCTION_FMAX_NEG_B, 42 | INSTRUCTION_SQRT 43 | }; 44 | 45 | struct instruction_t 46 | { 47 | instruction_type_t type; 48 | named_register_t a,b,c; // source registers ("operands") 49 | named_register_t d; // destination register 50 | float imm_b; // immediate value in b-slot 51 | 52 | // filled in by scheduler 53 | uint8_t ra,rb,rc,rd; 54 | uint8_t reuse; // register reuse flags 55 | uint8_t yield; // can relinquish control to other warp or not 56 | uint8_t stall; // number of cycles to wait before continuing 57 | uint8_t wrtdb; // write dependencies 58 | uint8_t readb; // read dependencies 59 | uint8_t watdb; // wait dependencies 60 | }; 61 | 62 | enum { MAX_INSTRUCTIONS_PER_BLOCK = 64 }; 63 | struct instruction_block_t 64 | // An instruction block is a list of instructions that implements a single basic 65 | // AST opcode, either a primitive or an operator. During code generation (parsing 66 | // the AST), we create a list of instruction blocks, evaluating the AST bottom-up. 67 | // During this, we assign to each block up to three register addresses. 68 | // A destination register, where the output of the block is to be stored, and 69 | // a left- and right-child register (for boolean operators). 
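// For example, the union-of-three-primitives case in test/backend_sass_6_x_mock.cpp
// (fOpUnion(fOpUnion(d1, d2), d3)) yields five blocks in emission order:
// box, sphere, union, cylinder, union, with destinations d = 0, 1, 0, 1, 0 respectively.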
70 | { 71 | instruction_t instructions[MAX_INSTRUCTIONS_PER_BLOCK]; 72 | int num_instructions; 73 | int d,d_left,d_right; 74 | }; 75 | 76 | enum { MAX_INSTRUCTION_BLOCKS = 128 }; 77 | struct instruction_blocks_t 78 | { 79 | instruction_block_t *blocks; 80 | int num_blocks; 81 | }; 82 | 83 | void print_instruction(instruction_t in) 84 | { 85 | int n = 0; 86 | if (in.type==INSTRUCTION_FFMA) n+=printf("FFMA r%d, r%d , r%d, r%d", in.rd, in.ra, in.rb, in.rc); 87 | else if (in.type==INSTRUCTION_FMUL) n+=printf("FMUL r%d, r%d , r%d", in.rd, in.ra, in.rb); 88 | else if (in.type==INSTRUCTION_FADD) n+=printf("FADD r%d, r%d , r%d", in.rd, in.ra, in.rb); 89 | else if (in.type==INSTRUCTION_FFMA20I) n+=printf("FFMA r%d, r%d , %5.2ff, r%d", in.rd, in.ra, in.imm_b, in.rc); 90 | else if (in.type==INSTRUCTION_FMUL20I) n+=printf("FMUL r%d, r%d , %5.2ff", in.rd, in.ra, in.imm_b); 91 | else if (in.type==INSTRUCTION_FADD20I) n+=printf("FADD r%d, r%d , %5.2ff", in.rd, in.ra, in.imm_b); 92 | else if (in.type==INSTRUCTION_FADD20I_ABS_A) n+=printf("FADD r%d, |r%d|, %5.2ff", in.rd, in.ra, in.imm_b); 93 | else if (in.type==INSTRUCTION_FMIN) n+=printf("FMIN r%d, r%d , r%d", in.rd, in.ra, in.rb); 94 | else if (in.type==INSTRUCTION_FMAX) n+=printf("FMAX r%d, r%d , r%d", in.rd, in.ra, in.rb); 95 | else if (in.type==INSTRUCTION_FMAX_NEG_B) n+=printf("FMAX r%d, -r%d , r%d", in.rd, in.ra, in.rb); 96 | else if (in.type==INSTRUCTION_SQRT) n+=printf("SQRT r%d, r%d", in.rd, in.ra); 97 | else assert(false); 98 | 99 | for (int i = n; i < 30; i++) 100 | printf(" "); 101 | 102 | if (in.watdb) printf("%02x:", in.watdb); else printf("--:"); 103 | if (in.readb==7) printf("-:"); else printf("%d:", in.readb+1); 104 | if (in.wrtdb==7) printf("-:"); else printf("%d:", in.wrtdb+1); 105 | if (in.yield) printf("-:"); else printf("Y:"); 106 | printf("%x", in.stall); 107 | if (in.reuse) 108 | printf(" reuse: %s%s%s", 109 | (in.reuse & 1) ? "a" : " ", 110 | (in.reuse & 2) ? "b" : " ", 111 | (in.reuse & 4) ? "c" : " "); 112 | printf("\n"); 113 | } 114 | 115 | } 116 | -------------------------------------------------------------------------------- /src/sass_6_x/registers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace backend_sass { 4 | 5 | enum named_register_t 6 | { 7 | // This is used to indicate immediate values 8 | // Note: this enum must be 0 because we use memset to clear instructions 9 | NO_REGISTER=0, 10 | 11 | // Input position coordinates 12 | REGISTER_X0, 13 | REGISTER_Y0, 14 | REGISTER_Z0, 15 | 16 | // Temporary calculations 17 | REGISTER_X, 18 | REGISTER_Y, 19 | REGISTER_Z, 20 | REGISTER_W, 21 | 22 | // Result registers (e.g. f(p)) 23 | REGISTER_D, // result is to be stored here 24 | REGISTER_D_LEFT, // result from left child in tree is stored here 25 | REGISTER_D_RIGHT, // result from right child in tree is stored here 26 | 27 | // constant zero 28 | REGISTER_RZ, 29 | NUM_NAMED_REGISTERS 30 | }; 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/sass_6_x/scheduler.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace backend_sass { 4 | 5 | instruction_t * 6 | schedule_blocks(instruction_blocks_t blocks, int *return_num_instructions) 7 | // This function performs physical register allocation and instruction scheduling. 
8 | // Register allocation maps the virtual register names used by each instruction to 9 | // physical register addresses (0 to 255). Instruction scheduling makes sure that 10 | // enough clock cycles passes between instructions so that the results are ready. 11 | { 12 | enum { max_instructions = 1024 }; 13 | static instruction_t out[max_instructions]; 14 | int num_out = 0; 15 | 16 | enum { max_registers = 256 }; 17 | enum { num_wait_barriers = 6 }; 18 | enum { max_temp_registers = 24 }; 19 | 20 | struct wait_barrier_t 21 | { 22 | uint8_t barrier_on_register[max_registers]; 23 | bool is_barrier_active[num_wait_barriers]; 24 | void init() 25 | { 26 | for (int i = 0; i < num_wait_barriers; i++) 27 | is_barrier_active[i] = false; 28 | for (int i = 0; i < max_registers; i++) 29 | barrier_on_register[i] = 7; 30 | } 31 | bool is_set(uint8_t reg) { return barrier_on_register[reg] != 7; } 32 | uint8_t set(uint8_t reg) // return wrtdb flag 33 | { 34 | for (int i = 0; i < num_wait_barriers; i++) 35 | { 36 | if (!is_barrier_active[i]) 37 | { 38 | uint8_t barrier = (uint8_t)(i); 39 | barrier_on_register[reg] = barrier; 40 | is_barrier_active[i] = true; 41 | return barrier; 42 | } 43 | } 44 | assert(false && "Ran out of wait barriers"); 45 | return 7; 46 | } 47 | uint8_t wait(uint8_t reg) // return watdb flag (to be OR'd with current flag) 48 | { 49 | uint8_t barrier = barrier_on_register[reg]; 50 | assert(barrier != 7 && "Tried to wait on a register that had no wait barrier set."); 51 | uint8_t watdb = 1 << barrier; 52 | is_barrier_active[barrier] = false; 53 | barrier_on_register[reg] = 7; 54 | return watdb; 55 | } 56 | }; 57 | 58 | static wait_barrier_t wait_barrier; 59 | wait_barrier.init(); 60 | 61 | for (int i = 0; i < blocks.num_blocks; i++) 62 | { 63 | int d = blocks.blocks[i].d; 64 | assert(d < max_temp_registers); 65 | int d_left = blocks.blocks[i].d_left; 66 | int d_right = blocks.blocks[i].d_right; 67 | 68 | static uint8_t register_map[NUM_NAMED_REGISTERS] = {0}; 69 | register_map[NO_REGISTER] = 0xff; 70 | register_map[REGISTER_X0] = 0x00; 71 | register_map[REGISTER_Y0] = 0x01; 72 | register_map[REGISTER_Z0] = 0x02; 73 | register_map[REGISTER_X] = 0x03; 74 | register_map[REGISTER_Y] = 0x04; 75 | register_map[REGISTER_Z] = 0x05; 76 | register_map[REGISTER_W] = 0x06; 77 | register_map[REGISTER_D] = 0x07 + d; 78 | register_map[REGISTER_D_LEFT] = 0x07 + d_left; 79 | register_map[REGISTER_D_RIGHT] = 0x07 + d_right; 80 | register_map[REGISTER_RZ] = 0xff; 81 | 82 | for (int j = 0; j < blocks.blocks[i].num_instructions; j++) 83 | { 84 | instruction_t *in = &blocks.blocks[i].instructions[j]; 85 | in->ra = register_map[in->a]; 86 | in->rb = register_map[in->b]; 87 | in->rc = register_map[in->c]; 88 | in->rd = register_map[in->d]; 89 | in->reuse = 0; 90 | in->watdb = 0; 91 | in->readb = 7; 92 | in->wrtdb = 7; 93 | in->yield = 0; 94 | if (in->a != NO_REGISTER && wait_barrier.is_set(in->ra)) { in->watdb |= wait_barrier.wait(in->ra); } 95 | if (in->b != NO_REGISTER && wait_barrier.is_set(in->rb)) { in->watdb |= wait_barrier.wait(in->rb); } 96 | if (in->c != NO_REGISTER && wait_barrier.is_set(in->rc)) { in->watdb |= wait_barrier.wait(in->rc); } 97 | 98 | // if we the instruction doesn't have a stall count set already 99 | // we set it to the latency of the instruction. 
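// (Blocks may pre-set smaller stalls where the following instructions in the block are
//  independent: e.g. emit_sphere in backend_sass.h issues its three FADDs with stalls
//  1, 1 and 4, so the full six cycles still elapse before the FMUL that reads x.)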
100 | if (in->stall == 0) 101 | { 102 | if (in->type == INSTRUCTION_SQRT) in->stall = 1+LATENCY_WRTDB; 103 | else in->stall = LATENCY_X32T; 104 | } 105 | 106 | if (in->type == INSTRUCTION_SQRT) in->wrtdb = wait_barrier.set(in->rd); 107 | 108 | // simple reuse tactic 109 | #if 1 110 | if (j > 0) 111 | { 112 | instruction_t *last = &blocks.blocks[i].instructions[j-1]; 113 | if (last->a != NO_REGISTER && last->ra == in->ra && last->rd != in->ra) in->reuse |= 1 << 0; 114 | if (last->b != NO_REGISTER && last->rb == in->rb && last->rd != in->rb) in->reuse |= 1 << 1; 115 | if (last->c != NO_REGISTER && last->rc == in->rc && last->rd != in->rc) in->reuse |= 1 << 2; 116 | } 117 | #endif 118 | 119 | out[num_out++] = *in; 120 | assert(num_out <= max_instructions); 121 | } 122 | } 123 | 124 | *return_num_instructions = num_out; 125 | return out; 126 | } 127 | 128 | } 129 | -------------------------------------------------------------------------------- /src/sass_6_x/simulator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace backend_sass { 4 | 5 | struct sass_simulator_t 6 | { 7 | bool debug; 8 | int t; 9 | float reg[256]; 10 | 11 | // writes in progress 12 | struct job_t 13 | { 14 | uint8_t dst; 15 | float val; 16 | int t_write; 17 | }; 18 | enum { max_write_jobs = 1024 }; 19 | job_t writes[max_write_jobs]; 20 | int num_writes_waiting; 21 | 22 | // barriers 23 | enum { num_write_barriers = 6 }; 24 | int register_on_barrier[num_write_barriers]; 25 | 26 | void init(bool _debug) 27 | { 28 | reg[REGISTER_RZ] = 0.0f; 29 | num_writes_waiting = 0; 30 | t = 0; 31 | debug = _debug; 32 | for (int i = 0; i < num_write_barriers; i++) 33 | register_on_barrier[i] = -1; 34 | } 35 | void _step(int cycles) 36 | { 37 | t += cycles; 38 | for (int i = 0; i < num_writes_waiting; i++) 39 | { 40 | if (t >= writes[i].t_write) 41 | { 42 | reg[writes[i].dst] = writes[i].val; 43 | 44 | // if a write barrier was set on the register we can take it down 45 | for (int j = 0; j < 6; j++) 46 | { 47 | if (register_on_barrier[j] == writes[i].dst) 48 | register_on_barrier[j] = -1; 49 | } 50 | 51 | writes[i] = writes[--num_writes_waiting]; 52 | i--; 53 | } 54 | } 55 | } 56 | void _set_write_barrier(uint8_t reg, uint8_t barrier) 57 | { 58 | assert(barrier >= 0 && barrier <= num_write_barriers-1); 59 | assert(register_on_barrier[barrier] == -1 && "overwrote an existing write barrier."); 60 | register_on_barrier[barrier] = reg; 61 | } 62 | void _wait_on_barrier(uint8_t barrier) 63 | { 64 | if (register_on_barrier[barrier] == -1) 65 | return; 66 | assert(barrier >= 0 && barrier <= num_write_barriers-1); 67 | bool resolved = false; 68 | for (int i = 0; i < num_writes_waiting; i++) 69 | { 70 | if (writes[i].dst == (uint8_t)register_on_barrier[barrier]) 71 | { 72 | int t_to_wait = writes[i].t_write - t; 73 | if (t_to_wait > 0) 74 | { 75 | if (debug) printf("waited %d cycles on barrier\n", t_to_wait); 76 | _step(t_to_wait); 77 | } 78 | resolved = true; 79 | register_on_barrier[barrier] = -1; 80 | } 81 | } 82 | assert(resolved && "waited on a barrier which is not resolved by any on-going writes."); 83 | } 84 | float _read_reg(uint8_t src) 85 | { 86 | for (int i = 0; i < num_writes_waiting; i++) 87 | if (writes[i].dst == src && debug) 88 | printf("read-before-write conflict on r%d\n", src); 89 | return reg[src]; 90 | } 91 | void _write_reg(uint8_t dst, float val, int latency) 92 | { 93 | assert(num_writes_waiting+1 <= max_write_jobs); 94 | writes[num_writes_waiting].dst = dst; 
95 | writes[num_writes_waiting].val = val; 96 | writes[num_writes_waiting].t_write = t + latency; 97 | num_writes_waiting++; 98 | } 99 | void execute(instruction_t in) 100 | { 101 | using namespace backend_sass; 102 | bool is_immediate = 103 | in.type == INSTRUCTION_FFMA20I || 104 | in.type == INSTRUCTION_FMUL20I || 105 | in.type == INSTRUCTION_FADD20I || 106 | in.type == INSTRUCTION_FADD20I_ABS_A; 107 | 108 | if (in.watdb) 109 | { 110 | if (in.watdb & 1) _wait_on_barrier(0); 111 | if (in.watdb & 2) _wait_on_barrier(1); 112 | if (in.watdb & 4) _wait_on_barrier(2); 113 | if (in.watdb & 8) _wait_on_barrier(3); 114 | if (in.watdb & 16) _wait_on_barrier(4); 115 | if (in.watdb & 32) _wait_on_barrier(5); 116 | } 117 | 118 | if (in.wrtdb != 7) _set_write_barrier(in.rd, in.wrtdb); 119 | 120 | float a = _read_reg(in.ra); 121 | float b = is_immediate ? in.imm_b : _read_reg(in.rb); 122 | float c = _read_reg(in.rc); 123 | 124 | float d; 125 | int lat; 126 | if (in.type==INSTRUCTION_FFMA) { lat = LATENCY_X32T; d = a*b + c; } 127 | else if (in.type==INSTRUCTION_FMUL) { lat = LATENCY_X32T; d = a*b; } 128 | else if (in.type==INSTRUCTION_FADD) { lat = LATENCY_X32T; d = a + b; } 129 | else if (in.type==INSTRUCTION_FFMA20I) { lat = LATENCY_X32T; d = a*b + c; } 130 | else if (in.type==INSTRUCTION_FMUL20I) { lat = LATENCY_X32T; d = a*b; } 131 | else if (in.type==INSTRUCTION_FADD20I) { lat = LATENCY_X32T; d = a + b; } 132 | else if (in.type==INSTRUCTION_FADD20I_ABS_A) { lat = LATENCY_X32T; d = fabsf(a) + b; } 133 | else if (in.type==INSTRUCTION_FMIN) { lat = LATENCY_X32T; d = (a < b) ? a : b; } 134 | else if (in.type==INSTRUCTION_FMAX) { lat = LATENCY_X32T; d = (a > b) ? a : b; } 135 | else if (in.type==INSTRUCTION_FMAX_NEG_B) { lat = LATENCY_X32T; d = (a > -b) ? a : -b; } 136 | else if (in.type==INSTRUCTION_SQRT) { lat = LATENCY_SQRT; d = sqrtf(a); } 137 | else assert(false && "unhandled instruction"); 138 | 139 | _write_reg(in.rd, d, lat); 140 | _step(in.stall); 141 | 142 | if (debug) print_instruction(in); 143 | } 144 | }; 145 | 146 | } 147 | -------------------------------------------------------------------------------- /test/backend_glsl.cpp: -------------------------------------------------------------------------------- 1 | #include "../src/backend_glsl.h" 2 | #include "../src/frep_builder.h" 3 | 4 | int main() { 5 | frep_t *f = fBoxCheap(1.0f, 0.5f, 0.25f); 6 | f = fOpUnion(f, fBox(2.0f, 1.0f, 1.0f)); 7 | char *s = frep_compile_to_glsl(f); 8 | printf("%s\n", s); 9 | } 10 | -------------------------------------------------------------------------------- /test/backend_ptx.cpp: -------------------------------------------------------------------------------- 1 | // Example compilation instructions for Linux, g++: 2 | // (Replace include directory with your installation and version of CUDA) 3 | // $ g++ -std=c++11 backend_ptx.cpp -I/usr/local/cuda-10.1/include -lcuda 4 | 5 | #include 6 | #include 7 | #include 8 | #include "util/cuda_error.h" 9 | #include "util/init_cuda.h" 10 | 11 | #define PTX_FP20_IMMEDIATE 12 | #include "../src/frep.h" 13 | #include "../src/frep_eval.h" 14 | #include "../src/frep_builder.h" 15 | #include "../src/backend_ptx.h" 16 | 17 | // This generates a PTX program equivalent to: 18 | // float tree(float x, float y, float z) { 19 | // // generated PTX instructions 20 | // } 21 | // void main(vec4 *input, float *output) { 22 | // int tid = threadIdx.x + blockDim.x*blockIdx.x; 23 | // vec4 p = input[tid]; 24 | // output[tid] = tree(p.x, p.y, p.z); 25 | // } 26 | // Note: out_length 
_DOES NOT_ include the null-terminator. 27 | char *generate_ptx_program(frep_t *f, size_t *out_length) 28 | { 29 | const char *ptx_template = R"str( 30 | .version 6.0 31 | .target sm_60 32 | .address_size 64 33 | .func (.reg.f32 f%d) tree(.reg.f32 x0, .reg.f32 y0, .reg.f32 z0) { 34 | .reg.f32 f<%d>; 35 | %s 36 | ret.uni; 37 | } 38 | .visible.entry main(.param.u64 param0, .param.u64 param1) { 39 | .reg.f32 x0; 40 | .reg.f32 y0; 41 | .reg.f32 z0; 42 | .reg.f32 w0; 43 | .reg.b32 r<5>; 44 | .reg.b64 rd<9>; 45 | .reg.f32 d; 46 | ld.param.u64 rd1, [param0]; 47 | ld.param.u64 rd2, [param1]; 48 | cvta.to.global.u64 rd3, rd2; 49 | cvta.to.global.u64 rd4, rd1; 50 | mov.u32 r1, %%tid.x; // threadIdx.x 51 | mov.u32 r2, %%ctaid.x; // blockIdx.x 52 | mov.u32 r3, %%ntid.x; // blockDim.x 53 | mad.lo.s32 r4, r3, r2, r1; // blockDim.x*blockIdx.x + threadIdx.x 54 | mul.wide.s32 rd5, r4, 16; // sizeof(vec4)*(blockDim.x*blockIdx.x + threadIdx.x) 55 | add.s64 rd6, rd4, rd5; // param0 + sizeof(vec4)*(blockDim.x*blockIdx.x + threadIdx.x) 56 | ld.global.v4.f32 {x0, y0, z0, w0}, [rd6]; 57 | mul.wide.s32 rd7, r4, 4; // sizeof(float)*(blockDim.x*blockIdx.x + threadIdx.x) 58 | add.s64 rd8, rd3, rd7; // param1 + sizeof(float)*(blockDim.x*blockIdx.x + threadIdx.x) 59 | call.uni (d), tree, (x0,y0,z0); 60 | st.global.f32 [rd8], d; 61 | ret; 62 | } 63 | )str"; 64 | 65 | static char buffer[10*1024*1024]; 66 | char *stream = buffer; 67 | int result_register; 68 | char *ptx = frep_compile_to_ptx(f, &result_register); 69 | stream += sprintf(stream, ptx_template, result_register, result_register, ptx); 70 | *out_length = (stream - buffer); 71 | return buffer; 72 | } 73 | 74 | CUmodule load_ptx_program( 75 | const char *ptx_source, size_t ptx_source_length, 76 | int jit_optimization_level) 77 | { 78 | CUmodule module; 79 | void *cubin; size_t cubin_size; 80 | CUlinkState link_state; 81 | enum { num_options = 8 }; 82 | CUjit_option options[num_options]; 83 | void *option_values[num_options]; 84 | float walltime; 85 | char error_log[8192], info_log[8192]; 86 | 87 | assert(jit_optimization_level >= 0 && jit_optimization_level <= 4); 88 | 89 | // see CUDA Driver API manual for these options (look up cuLinkCreate) 90 | options[0] = CU_JIT_WALL_TIME; option_values[0] = (void *) &walltime; 91 | options[1] = CU_JIT_INFO_LOG_BUFFER; option_values[1] = (void *) info_log; 92 | options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; option_values[2] = (void *) (long)sizeof(info_log); 93 | options[3] = CU_JIT_ERROR_LOG_BUFFER; option_values[3] = (void *) error_log; 94 | options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; option_values[4] = (void *) (long)sizeof(error_log); 95 | options[5] = CU_JIT_LOG_VERBOSE; option_values[5] = (void *) 1; 96 | options[6] = CU_JIT_TARGET; option_values[6] = (void *) CU_TARGET_COMPUTE_60; 97 | options[7] = CU_JIT_OPTIMIZATION_LEVEL; option_values[7] = (void *) (long)jit_optimization_level; 98 | cudaCheckError(cuLinkCreate(num_options, options, option_values, &link_state)); 99 | 100 | int err = cuLinkAddData(link_state, CU_JIT_INPUT_PTX, (void *)ptx_source, ptx_source_length+1, 0, 0, 0, 0); 101 | if (err != CUDA_SUCCESS) 102 | fprintf(stderr, "PTX Linker Error:\n%s\n", error_log); 103 | cudaCheckError(cuLinkComplete(link_state, &cubin, &cubin_size)); 104 | printf("Linking done in %fms. 
Linker Output:\n%s\n", walltime, info_log); 105 | 106 | cudaCheckError(cuModuleLoadData(&module, cubin)); assert(module); 107 | cudaCheckError(cuLinkDestroy(link_state)); 108 | return module; 109 | } 110 | 111 | void run_ptx_program( 112 | void *input, size_t sizeof_input, 113 | void *output, size_t sizeof_output, 114 | const char *ptx_source, size_t ptx_source_length, const char *entry_name, 115 | int num_blocks, int threads_per_block, int shared_memory_bytes=1024, 116 | int jit_optimization_level=1 /*allowed values = 0,1,2,3,4*/) 117 | { 118 | CUdeviceptr dev_input; 119 | CUdeviceptr dev_output; 120 | cudaCheckError(cuMemAlloc(&dev_input, sizeof_input)); assert(dev_input); 121 | cudaCheckError(cuMemAlloc(&dev_output, sizeof_output)); assert(dev_output); 122 | cudaCheckError(cuMemcpyHtoD(dev_input, input, sizeof_input)); 123 | CUmodule module = load_ptx_program(ptx_source, ptx_source_length, jit_optimization_level); 124 | CUfunction kernel = 0; 125 | cudaCheckError(cuModuleGetFunction(&kernel, module, entry_name)); 126 | uint64_t param0 = (uint64_t)(dev_input); 127 | uint64_t param1 = (uint64_t)(dev_output); 128 | void *kernel_params[] = { (void*)¶m0, (void*)¶m1 }; 129 | cuLaunchKernel(kernel, num_blocks,1,1, threads_per_block,1,1, shared_memory_bytes, NULL, kernel_params, NULL); 130 | cudaCheckError(cuCtxSynchronize()); 131 | cudaCheckError(cuMemcpyDtoH(output, dev_output, sizeof_output)); 132 | cudaCheckError(cuMemFree(dev_output)); 133 | cudaCheckError(cuMemFree(dev_input)); 134 | cudaCheckError(cuModuleUnload(module)); 135 | } 136 | 137 | void run_test(int test_number, frep_t *f) 138 | { 139 | printf("///////////////////////////////////////////////////\n"); 140 | printf(" running test number %d\n", test_number); 141 | 142 | const int num_points_x = 4; 143 | const int num_points_y = 4; 144 | const int num_points_z = 4; 145 | const int num_threads = 32; 146 | const int num_points = num_points_x*num_points_y*num_points_z; 147 | const int num_blocks = num_points/num_threads; 148 | const int sizeof_input = num_points*4*sizeof(float); 149 | const int sizeof_output = num_points*1*sizeof(float); 150 | 151 | float *output = (float*)malloc(sizeof_output); assert(output); 152 | float *cpu_output = (float*)malloc(sizeof_output); assert(cpu_output); 153 | float *input = (float*)malloc(num_points*4*sizeof(float)); 154 | 155 | // generate input array data (points sampled in regular grid) 156 | { 157 | float *p = input; 158 | for (int zi = 0; zi < num_points_z; zi++) 159 | for (int yi = 0; yi < num_points_y; yi++) 160 | for (int xi = 0; xi < num_points_x; xi++) 161 | { 162 | p[0] = (-1.0f + 2.0f*xi/num_points_x); 163 | p[1] = (-1.0f + 2.0f*yi/num_points_y); 164 | p[2] = (-1.0f + 2.0f*zi/num_points_z); 165 | p[3] = 0.0f; 166 | p += 4; 167 | } 168 | } 169 | 170 | // compute expected output using CPU-based evaluator 171 | { 172 | for (int i = 0; i < num_points; i++) 173 | { 174 | float x = input[4*i + 0]; 175 | float y = input[4*i + 1]; 176 | float z = input[4*i + 2]; 177 | cpu_output[i] = frep_eval(f, x, y, z); 178 | } 179 | } 180 | 181 | // compute output using GPU 182 | { 183 | size_t ptx_length; 184 | char *ptx_source = generate_ptx_program(f, &ptx_length); 185 | run_ptx_program( 186 | input, sizeof_input, 187 | output, sizeof_output, 188 | ptx_source, ptx_length, 189 | "main", 190 | num_blocks, num_threads); 191 | } 192 | 193 | // verify that GPU output matches CPU output 194 | for (int i = 0; i < num_points; i++) 195 | { 196 | float d_cpu = cpu_output[i]; 197 | float d_ptx = output[i]; 198 | if 
(fabsf(d_cpu - d_ptx) > 0.01f) 199 | { 200 | float x = input[4*i + 0]; 201 | float y = input[4*i + 1]; 202 | float z = input[4*i + 2]; 203 | printf("\nEvaluation mismatch!\n"); 204 | printf("cpu: f(%.2f,%.2f,%.2f) = %f\n", x, y, z, d_cpu); 205 | printf("ptx: f(%.2f,%.2f,%.2f) = %f\n", x, y, z, d_ptx); 206 | exit(1); 207 | } 208 | } 209 | 210 | free(output); 211 | free(cpu_output); 212 | free(input); 213 | } 214 | 215 | int main(int argc, char **argv) 216 | { 217 | init_cuda(); 218 | 219 | frep_t *f = fBoxCheap(1.0f, 0.5f, 0.25f); 220 | run_test(1, f); 221 | 222 | return 0; 223 | } 224 | -------------------------------------------------------------------------------- /test/backend_sass_6_x.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "util/cuda_error.h" 5 | #include "util/init_cuda.h" 6 | #include "../src/frep.h" 7 | #include "../src/frep_eval.h" 8 | #include "../src/frep_builder.h" 9 | #include "../src/backend_sass.h" 10 | 11 | CUmodule link_sass(CUmodule *module, 12 | void *cubin1, size_t sizeof_cubin1, 13 | void *cubin2, size_t sizeof_cubin2); 14 | 15 | int main(int argc, char **argv) 16 | { 17 | setenv("CUDA_CACHE_DISABLE", "1", 1); 18 | init_cuda(); 19 | 20 | system("/usr/local/cuda-10.1/bin/nvcc " 21 | "--gpu-architecture=sm_60 " 22 | "--cubin " 23 | "--relocatable-device-code=true " 24 | "main.cu " 25 | "--output-file main.cubin"); 26 | 27 | size_t sizeof_cubin_main; 28 | void *cubin_main = read_file("main.cubin", &sizeof_cubin_main); 29 | 30 | frep_t *tree = fBoxCheap(1.0f, 0.5f, 0.25f); 31 | 32 | size_t sizeof_cubin_tree; 33 | void *cubin_tree = frep_compile_to_sass(tree, &sizeof_cubin_tree); 34 | 35 | CUmodule module = 0; 36 | link_sass(&module, cubin_main, sizeof_cubin_main, cubin_tree, sizeof_cubin_tree); 37 | 38 | CUfunction kernel; 39 | cudaCheckError(cuModuleGetFunction(&kernel, module, "main")); assert(kernel); 40 | 41 | // 42 | // finally we run the thing to make sure that it actually works. 
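// (With the fBoxCheap(1.0f, 0.5f, 0.25f) tree above and every input point at (1,0,0),
//  each printed value should come out as max(|1|-1, |0|-0.5, |0|-0.25) = 0.0,
//  i.e. the same distance frep_eval would return on the CPU.)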
43 | // 44 | int N = 32; 45 | size_t sizeof_input = 4*N*sizeof(float); 46 | size_t sizeof_output = N*sizeof(float); 47 | float *input = (float*)malloc(sizeof_input); 48 | float *output = (float*)malloc(sizeof_output); 49 | 50 | for (int i = 0; i < N; i++) 51 | { 52 | input[4*i + 0] = 1.0f; 53 | input[4*i + 1] = 0.0f; 54 | input[4*i + 2] = 0.0f; 55 | input[4*i + 3] = 0.0f; 56 | } 57 | 58 | int num_blocks = 8; 59 | int num_threads = 4; 60 | int shared_memory_bytes = 1024; 61 | CUdeviceptr dev_input; 62 | CUdeviceptr dev_output; 63 | cudaCheckError(cuMemAlloc(&dev_input, sizeof_input)); assert(dev_input); 64 | cudaCheckError(cuMemAlloc(&dev_output, sizeof_output)); assert(dev_output); 65 | cudaCheckError(cuMemcpyHtoD(dev_input, input, sizeof_input)); 66 | uint64_t param0 = (uint64_t)(dev_input); 67 | uint64_t param1 = (uint64_t)(dev_output); 68 | void *kernel_params[] = { (void*)&param0, (void*)&param1 }; 69 | cuLaunchKernel(kernel, num_blocks,1,1, num_threads,1,1, shared_memory_bytes, NULL, kernel_params, NULL); 70 | cudaCheckError(cuCtxSynchronize()); 71 | cudaCheckError(cuMemcpyDtoH(output, dev_output, sizeof_output)); 72 | cudaCheckError(cuMemFree(dev_output)); 73 | cudaCheckError(cuMemFree(dev_input)); 74 | 75 | cudaCheckError(cuModuleUnload(module)); 76 | 77 | printf("output:\n"); 78 | for (int i = 0; i < N; i++) 79 | printf("%f ", output[i]); 80 | 81 | return 0; 82 | } 83 | 84 | CUmodule link_sass(CUmodule *module, 85 | void *cubin1, size_t sizeof_cubin1, 86 | void *cubin2, size_t sizeof_cubin2) 87 | { 88 | enum { num_options = 6 }; 89 | CUjit_option options[num_options]; 90 | void *option_values[num_options]; 91 | char error_log[8192]; 92 | char info_log[8192]; 93 | options[0] = CU_JIT_INFO_LOG_BUFFER; option_values[0] = (void *) info_log; 94 | options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; option_values[1] = (void *) (long)sizeof(info_log); 95 | options[2] = CU_JIT_ERROR_LOG_BUFFER; option_values[2] = (void *) error_log; 96 | options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; option_values[3] = (void *) (long)sizeof(error_log); 97 | options[4] = CU_JIT_LOG_VERBOSE; option_values[4] = (void *) 1; 98 | options[5] = CU_JIT_TARGET; option_values[5] = (void *) CU_TARGET_COMPUTE_60; 99 | CUlinkState link_state; 100 | cudaCheckError(cuLinkCreate(num_options, options, option_values, &link_state)); 101 | 102 | if (CUDA_SUCCESS != cuLinkAddData(link_state, CU_JIT_INPUT_CUBIN, 103 | (void *)cubin1, sizeof_cubin1, 0,0,0,0)) 104 | fprintf(stderr, "nvlink error:\n%s\n", error_log); 105 | 106 | if (CUDA_SUCCESS != cuLinkAddData(link_state, CU_JIT_INPUT_CUBIN, 107 | (void *)cubin2, sizeof_cubin2, 0,0,0,0)) 108 | fprintf(stderr, "nvlink error:\n%s\n", error_log); 109 | 110 | void *cubin; 111 | size_t cubin_size; 112 | cudaCheckError(cuLinkComplete(link_state, &cubin, &cubin_size)); 113 | cudaCheckError(cuModuleLoadData(module, cubin)); assert(module); 114 | cudaCheckError(cuLinkDestroy(link_state)); return *module; 115 | } 116 | -------------------------------------------------------------------------------- /test/backend_sass_6_x_mock.cpp: -------------------------------------------------------------------------------- 1 | #define COMPUTE_CAPABILITY_6_X 2 | #include 3 | #include 4 | #include 5 | #include "../src/frep.h" 6 | #include "../src/frep_builder.h" 7 | #include "../src/frep_eval.h" 8 | #include "../src/backend_sass.h" 9 | #include "../src/sass_6_x/simulator.h" 10 | 11 | using namespace backend_sass; 12 | 13 | float frep_eval_sass( 14 | float x0, float y0, float z0, 15 | instruction_t *instructions, int 
num_instructions, 16 | bool debug=false) 17 | { 18 | static sass_simulator_t sim = {0}; 19 | sim.init(debug); 20 | sim.reg[0x00] = x0; 21 | sim.reg[0x01] = y0; 22 | sim.reg[0x02] = z0; 23 | for (int i = 0; i < num_instructions; i++) 24 | sim.execute(instructions[i]); 25 | return sim.reg[0x07]; 26 | } 27 | 28 | void run_test(int test_number, frep_t *tree) 29 | { 30 | instruction_blocks_t blocks = generate_blocks(tree); 31 | 32 | int num_instructions; 33 | instruction_t *instructions = schedule_blocks(blocks, &num_instructions); 34 | 35 | printf("///////////////////////////////////////////////////\n"); 36 | printf(" test number %d\n", test_number); 37 | 38 | frep_eval_sass(0.0f,0.0f,0.0f, instructions, num_instructions, true); 39 | 40 | for (int i = -4; i <= 4; i++) 41 | for (int j = -4; j <= 4; j++) 42 | for (int k = -4; k <= 4; k++) 43 | { 44 | float x0 = i/4.0f; 45 | float y0 = j/4.0f; 46 | float z0 = k/4.0f; 47 | float f_sass = frep_eval_sass(x0,y0,z0, instructions, num_instructions); 48 | float f_true = frep_eval(tree, x0,y0,z0); 49 | if (fabsf(f_sass - f_true) > 0.00001f) 50 | { 51 | printf("\nEvaluation mismatch!\n"); 52 | printf("true: f(%.2f,%.2f,%.2f) = %f\n", x0,y0,z0,f_true); 53 | printf("sass: f(%.2f,%.2f,%.2f) = %f\n", x0,y0,z0,f_sass); 54 | exit(1); 55 | } 56 | } 57 | printf("ok!\n"); 58 | } 59 | 60 | int main() 61 | { 62 | frep_t *tree; 63 | 64 | tree = fBoxCheap(0.9f,0.6f,0.3f); 65 | run_test(0, tree); 66 | 67 | tree = fSphere(0.3f); 68 | run_test(1, tree); 69 | 70 | tree = fCylinder(0.6f,0.3f); 71 | run_test(2, tree); 72 | 73 | tree = fPlane(1.0f, 0.3f); 74 | pOpRotate(tree, 0.3f,0.5f,0.4f); 75 | pOpTranslate(tree, 0.2f,0.5f,0.4f); 76 | run_test(3, tree); 77 | 78 | frep_t *d1 = fBoxCheap(1.0f,0.5f,0.25f); 79 | pOpRotate(d1, 0.1f,0.4f,0.3f); 80 | pOpTranslate(d1, 0.5f,0.25f,0.25f); 81 | frep_t *d2 = fSphere(0.8f); 82 | pOpTranslate(d2, 1.0f,0,0); 83 | frep_t *d3 = fCylinder(0.4f, 0.2f); 84 | pOpTranslate(d3, 1.0f, 1.0f, 0.3f); 85 | tree = fOpUnion(fOpUnion(d1, d2), d3); 86 | run_test(4, tree); 87 | } 88 | -------------------------------------------------------------------------------- /test/linker.cpp: -------------------------------------------------------------------------------- 1 | // This file tests the use of seperate compilation to link together 2 | // pre-existing (relocatable) Cubin files. This is useful because we 3 | // can use the CUDA Driver API to generate an executable Cubin from 4 | // the output of our SASS backend and a user-provided Cubin containing 5 | // the entrypoint. 6 | // 7 | // To compile this file on Linux using g++: 8 | // $ g++ -std=c++11 linker.cpp -I/usr/local/cuda-10.1/include -lcuda 9 | // 10 | #include 11 | #include 12 | #include 13 | #include "util/cuda_error.h" 14 | #include "util/init_cuda.h" 15 | #include "util/read_file.h" 16 | #define ENABLE_TIMING 17 | #include "util/profiler.h" 18 | 19 | int main() { 20 | init_cuda(); 21 | 22 | // 23 | // Generate relocatable SASS binaries by invoking the PTX assembler 24 | // on our two test files. Neither of these can be executed on their 25 | // own, so we will link them together into an actual executable using 26 | // the CUDA linker in the Driver API. 
27 | // 28 | system("/usr/local/cuda-10.1/bin/ptxas --opt-level 1 --compile-only --gpu-name sm_60 test1.ptx --output-file test1.cubin"); 29 | system("/usr/local/cuda-10.1/bin/ptxas --opt-level 1 --compile-only --gpu-name sm_60 test2.ptx --output-file test2.cubin"); 30 | 31 | int sizeof_cubin1 = 0; 32 | void *cubin1 = (void*)read_file("test1.cubin", &sizeof_cubin1); 33 | assert(cubin1); 34 | 35 | int sizeof_cubin2 = 0; 36 | void *cubin2 = (void*)read_file("test2.cubin", &sizeof_cubin2); 37 | assert(cubin2); 38 | 39 | CUfunction kernel; 40 | CUmodule module; 41 | const char *entry_name = "main"; 42 | 43 | // We do this 100 times and measure the time it takes the driver to 44 | // link together the Cubin file, and report the average in ms. 45 | for (int i = 0; i < 100; i++) 46 | { 47 | TIMING("linker"); 48 | 49 | // 50 | // initialize the linker. note: CU_JIT_TARGET must match compute mode 51 | // specified in test1.ptx and test2.ptx, and the --gpu-name argument 52 | // passed to ptxas above. 53 | // 54 | enum { num_options = 6 }; 55 | CUjit_option options[num_options]; 56 | void *option_values[num_options]; 57 | char error_log[8192]; 58 | char info_log[8192]; 59 | options[0] = CU_JIT_INFO_LOG_BUFFER; option_values[0] = (void *) info_log; 60 | options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; option_values[1] = (void *) (long)sizeof(info_log); 61 | options[2] = CU_JIT_ERROR_LOG_BUFFER; option_values[2] = (void *) error_log; 62 | options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; option_values[3] = (void *) (long)sizeof(error_log); 63 | options[4] = CU_JIT_LOG_VERBOSE; option_values[4] = (void *) 1; 64 | options[5] = CU_JIT_TARGET; option_values[5] = (void *) CU_TARGET_COMPUTE_60; 65 | CUlinkState link_state; 66 | cudaCheckError(cuLinkCreate(num_options, options, option_values, &link_state)); 67 | 68 | if (CUDA_SUCCESS != cuLinkAddData(link_state, CU_JIT_INPUT_CUBIN, 69 | (void *)cubin1, sizeof_cubin1, 0,0,0,0)) 70 | fprintf(stderr, "nvlink error:\n%s\n", error_log); 71 | 72 | if (CUDA_SUCCESS != cuLinkAddData(link_state, CU_JIT_INPUT_CUBIN, 73 | (void *)cubin2, sizeof_cubin2, 0,0,0,0)) 74 | fprintf(stderr, "nvlink error:\n%s\n", error_log); 75 | 76 | void *cubin; 77 | size_t cubin_size; 78 | cudaCheckError(cuLinkComplete(link_state, &cubin, &cubin_size)); 79 | 80 | cudaCheckError(cuModuleLoadData(&module, cubin)); assert(module); 81 | cudaCheckError(cuLinkDestroy(link_state)); 82 | cudaCheckError(cuModuleGetFunction(&kernel, module, entry_name)); assert(kernel); 83 | 84 | TIMING("linker"); 85 | } 86 | assert(kernel); 87 | 88 | // Print the average linking time in milliseconds 89 | TIMING_SUMMARY(); 90 | 91 | // 92 | // finally we run the thing to make sure that it actually works. 
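//
// The kernel interface matches test2.ptx: param0 is a device pointer to an
// array of N float4 inputs (16 bytes per thread) and param1 is a device
// pointer to an array of N float outputs. Each thread calls the externally
// linked "tree" function from test1.cubin, which computes the box distance
// max(|x|-1.0, |y|-0.5, |z|-0.25), so for the input (1.1, 0, 0, 0) used below
// every element of the output should come back as roughly 0.1.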
93 | // 94 | int N = 32; 95 | size_t sizeof_input = 4*N*sizeof(float); 96 | size_t sizeof_output = N*sizeof(float); 97 | float *input = (float*)malloc(sizeof_input); 98 | float *output = (float*)malloc(sizeof_output); 99 | 100 | for (int i = 0; i < N; i++) 101 | { 102 | input[4*i + 0] = 1.1f; 103 | input[4*i + 1] = 0.0f; 104 | input[4*i + 2] = 0.0f; 105 | input[4*i + 3] = 0.0f; 106 | } 107 | 108 | int num_blocks = 8; 109 | int num_threads = 4; 110 | int shared_memory_bytes = 1024; 111 | CUdeviceptr dev_input; 112 | CUdeviceptr dev_output; 113 | cudaCheckError(cuMemAlloc(&dev_input, sizeof_input)); assert(dev_input); 114 | cudaCheckError(cuMemAlloc(&dev_output, sizeof_output)); assert(dev_output); 115 | cudaCheckError(cuMemcpyHtoD(dev_input, input, sizeof_input)); 116 | uint64_t param0 = (uint64_t)(dev_input); 117 | uint64_t param1 = (uint64_t)(dev_output); 118 | void *kernel_params[] = { (void*)&param0, (void*)&param1 }; 119 | cuLaunchKernel(kernel, num_blocks,1,1, num_threads,1,1, shared_memory_bytes, NULL, kernel_params, NULL); 120 | cudaCheckError(cuCtxSynchronize()); 121 | cudaCheckError(cuMemcpyDtoH(output, dev_output, sizeof_output)); 122 | cudaCheckError(cuMemFree(dev_output)); 123 | cudaCheckError(cuMemFree(dev_input)); 124 | cudaCheckError(cuModuleUnload(module)); 125 | 126 | printf("output:\n"); 127 | for (int i = 0; i < N; i++) 128 | printf("%f ", output[i]); 129 | } 130 | -------------------------------------------------------------------------------- /test/test1.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightbits/fast-csg/53b14f651e9544580ba31ba0f157221a77ba44fe/test/test1.cubin -------------------------------------------------------------------------------- /test/test1.ptx: -------------------------------------------------------------------------------- 1 | .version 6.0 2 | .target sm_60 3 | .address_size 64 4 | 5 | .visible .func (.reg.f32 f1) tree(.reg.f32 x0, .reg.f32 y0, .reg.f32 z0) { 6 | .reg.f32 x; 7 | .reg.f32 y; 8 | .reg.f32 z; 9 | abs.f32.ftz x, x0; 10 | abs.f32.ftz y, y0; 11 | abs.f32.ftz z, z0; 12 | sub.f32.ftz x,x,1.0; 13 | sub.f32.ftz y,y,0.5; 14 | sub.f32.ftz z,z,0.25; 15 | max.f32.ftz f1,x,y; 16 | max.f32.ftz f1,f1,z; 17 | ret.uni; 18 | } 19 | -------------------------------------------------------------------------------- /test/test2.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightbits/fast-csg/53b14f651e9544580ba31ba0f157221a77ba44fe/test/test2.cubin -------------------------------------------------------------------------------- /test/test2.ptx: -------------------------------------------------------------------------------- 1 | .version 6.0 2 | .target sm_60 3 | .address_size 64 4 | 5 | .extern .func (.reg.f32 f1) tree(.reg.f32 x0, .reg.f32 y0, .reg.f32 z0) 6 | 7 | .visible .entry main(.param.u64 param0, .param.u64 param1) { 8 | .reg.f32 x0; 9 | .reg.f32 y0; 10 | .reg.f32 z0; 11 | .reg.f32 w0; 12 | .reg.b32 r<5>; 13 | .reg.b64 rd<9>; 14 | .reg.f32 d; 15 | ld.param.u64 rd1, [param0]; 16 | ld.param.u64 rd2, [param1]; 17 | cvta.to.global.u64 rd3, rd2; 18 | cvta.to.global.u64 rd4, rd1; 19 | mov.u32 r1, %tid.x; // threadIdx.x 20 | mov.u32 r2, %ctaid.x; // blockIdx.x 21 | mov.u32 r3, %ntid.x; // blockDim.x 22 | mad.lo.s32 r4, r3, r2, r1; // blockDim.x*blockIdx.x + threadIdx.x 23 | mul.wide.s32 rd5, r4, 16; // sizeof(vec4)*(blockDim.x*blockIdx.x + threadIdx.x) 24 | add.s64 rd6, rd4, rd5; // param0 +
sizeof(vec4)*(blockDim.x*blockIdx.x + threadIdx.x) 25 | ld.global.v4.f32 {x0, y0, z0, w0}, [rd6]; 26 | mul.wide.s32 rd7, r4, 4; // sizeof(float)*(blockDim.x*blockIdx.x + threadIdx.x) 27 | add.s64 rd8, rd3, rd7; // param1 + sizeof(float)*(blockDim.x*blockIdx.x + threadIdx.x) 28 | call.uni (d), tree, (x0,y0,z0); 29 | st.global.f32 [rd8], d; 30 | ret; 31 | } 32 | -------------------------------------------------------------------------------- /test/util/cuda_error.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | static const char *cudaErrorToString(CUresult error) 5 | { 6 | switch (error) 7 | { 8 | case CUDA_SUCCESS: 9 | return "CUDA_SUCCESS"; 10 | 11 | case CUDA_ERROR_INVALID_VALUE: 12 | return "CUDA_ERROR_INVALID_VALUE"; 13 | 14 | case CUDA_ERROR_OUT_OF_MEMORY: 15 | return "CUDA_ERROR_OUT_OF_MEMORY"; 16 | 17 | case CUDA_ERROR_NOT_INITIALIZED: 18 | return "CUDA_ERROR_NOT_INITIALIZED"; 19 | 20 | case CUDA_ERROR_DEINITIALIZED: 21 | return "CUDA_ERROR_DEINITIALIZED"; 22 | 23 | case CUDA_ERROR_PROFILER_DISABLED: 24 | return "CUDA_ERROR_PROFILER_DISABLED"; 25 | 26 | case CUDA_ERROR_PROFILER_NOT_INITIALIZED: 27 | return "CUDA_ERROR_PROFILER_NOT_INITIALIZED"; 28 | 29 | case CUDA_ERROR_PROFILER_ALREADY_STARTED: 30 | return "CUDA_ERROR_PROFILER_ALREADY_STARTED"; 31 | 32 | case CUDA_ERROR_PROFILER_ALREADY_STOPPED: 33 | return "CUDA_ERROR_PROFILER_ALREADY_STOPPED"; 34 | 35 | case CUDA_ERROR_NO_DEVICE: 36 | return "CUDA_ERROR_NO_DEVICE"; 37 | 38 | case CUDA_ERROR_INVALID_DEVICE: 39 | return "CUDA_ERROR_INVALID_DEVICE"; 40 | 41 | case CUDA_ERROR_INVALID_IMAGE: 42 | return "CUDA_ERROR_INVALID_IMAGE"; 43 | 44 | case CUDA_ERROR_INVALID_CONTEXT: 45 | return "CUDA_ERROR_INVALID_CONTEXT"; 46 | 47 | case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: 48 | return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; 49 | 50 | case CUDA_ERROR_MAP_FAILED: 51 | return "CUDA_ERROR_MAP_FAILED"; 52 | 53 | case CUDA_ERROR_UNMAP_FAILED: 54 | return "CUDA_ERROR_UNMAP_FAILED"; 55 | 56 | case CUDA_ERROR_ARRAY_IS_MAPPED: 57 | return "CUDA_ERROR_ARRAY_IS_MAPPED"; 58 | 59 | case CUDA_ERROR_ALREADY_MAPPED: 60 | return "CUDA_ERROR_ALREADY_MAPPED"; 61 | 62 | case CUDA_ERROR_NO_BINARY_FOR_GPU: 63 | return "CUDA_ERROR_NO_BINARY_FOR_GPU"; 64 | 65 | case CUDA_ERROR_ALREADY_ACQUIRED: 66 | return "CUDA_ERROR_ALREADY_ACQUIRED"; 67 | 68 | case CUDA_ERROR_NOT_MAPPED: 69 | return "CUDA_ERROR_NOT_MAPPED"; 70 | 71 | case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: 72 | return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; 73 | 74 | case CUDA_ERROR_NOT_MAPPED_AS_POINTER: 75 | return "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; 76 | 77 | case CUDA_ERROR_ECC_UNCORRECTABLE: 78 | return "CUDA_ERROR_ECC_UNCORRECTABLE"; 79 | 80 | case CUDA_ERROR_UNSUPPORTED_LIMIT: 81 | return "CUDA_ERROR_UNSUPPORTED_LIMIT"; 82 | 83 | case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: 84 | return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"; 85 | 86 | case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: 87 | return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED"; 88 | 89 | case CUDA_ERROR_INVALID_PTX: 90 | return "CUDA_ERROR_INVALID_PTX"; 91 | 92 | case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: 93 | return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT"; 94 | 95 | case CUDA_ERROR_NVLINK_UNCORRECTABLE: 96 | return "CUDA_ERROR_NVLINK_UNCORRECTABLE"; 97 | 98 | case CUDA_ERROR_JIT_COMPILER_NOT_FOUND: 99 | return "CUDA_ERROR_JIT_COMPILER_NOT_FOUND"; 100 | 101 | case CUDA_ERROR_INVALID_SOURCE: 102 | return "CUDA_ERROR_INVALID_SOURCE"; 103 | 104 | case CUDA_ERROR_FILE_NOT_FOUND: 105 | return 
"CUDA_ERROR_FILE_NOT_FOUND"; 106 | 107 | case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: 108 | return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; 109 | 110 | case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: 111 | return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; 112 | 113 | case CUDA_ERROR_OPERATING_SYSTEM: 114 | return "CUDA_ERROR_OPERATING_SYSTEM"; 115 | 116 | case CUDA_ERROR_INVALID_HANDLE: 117 | return "CUDA_ERROR_INVALID_HANDLE"; 118 | 119 | case CUDA_ERROR_NOT_FOUND: 120 | return "CUDA_ERROR_NOT_FOUND"; 121 | 122 | case CUDA_ERROR_NOT_READY: 123 | return "CUDA_ERROR_NOT_READY"; 124 | 125 | case CUDA_ERROR_ILLEGAL_ADDRESS: 126 | return "CUDA_ERROR_ILLEGAL_ADDRESS"; 127 | 128 | case CUDA_ERROR_LAUNCH_FAILED: 129 | return "CUDA_ERROR_LAUNCH_FAILED"; 130 | 131 | case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: 132 | return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; 133 | 134 | case CUDA_ERROR_LAUNCH_TIMEOUT: 135 | return "CUDA_ERROR_LAUNCH_TIMEOUT"; 136 | 137 | case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: 138 | return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; 139 | 140 | case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: 141 | return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; 142 | 143 | case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: 144 | return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; 145 | 146 | case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: 147 | return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; 148 | 149 | case CUDA_ERROR_CONTEXT_IS_DESTROYED: 150 | return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; 151 | 152 | case CUDA_ERROR_ASSERT: 153 | return "CUDA_ERROR_ASSERT"; 154 | 155 | case CUDA_ERROR_TOO_MANY_PEERS: 156 | return "CUDA_ERROR_TOO_MANY_PEERS"; 157 | 158 | case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: 159 | return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; 160 | 161 | case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: 162 | return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; 163 | 164 | case CUDA_ERROR_HARDWARE_STACK_ERROR: 165 | return "CUDA_ERROR_HARDWARE_STACK_ERROR"; 166 | 167 | case CUDA_ERROR_ILLEGAL_INSTRUCTION: 168 | return "CUDA_ERROR_ILLEGAL_INSTRUCTION"; 169 | 170 | case CUDA_ERROR_MISALIGNED_ADDRESS: 171 | return "CUDA_ERROR_MISALIGNED_ADDRESS"; 172 | 173 | case CUDA_ERROR_INVALID_ADDRESS_SPACE: 174 | return "CUDA_ERROR_INVALID_ADDRESS_SPACE"; 175 | 176 | case CUDA_ERROR_INVALID_PC: 177 | return "CUDA_ERROR_INVALID_PC"; 178 | 179 | case CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE: 180 | return "CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE"; 181 | 182 | case CUDA_ERROR_NOT_PERMITTED: 183 | return "CUDA_ERROR_NOT_PERMITTED"; 184 | 185 | case CUDA_ERROR_NOT_SUPPORTED: 186 | return "CUDA_ERROR_NOT_SUPPORTED"; 187 | 188 | case CUDA_ERROR_UNKNOWN: 189 | return "CUDA_ERROR_UNKNOWN"; 190 | } 191 | 192 | return ""; 193 | } 194 | 195 | static const char *cudaErrorToString(cudaError_t error) 196 | { 197 | switch (error) 198 | { 199 | case cudaSuccess: 200 | return "cudaSuccess"; 201 | 202 | case cudaErrorMissingConfiguration: 203 | return "cudaErrorMissingConfiguration"; 204 | 205 | case cudaErrorMemoryAllocation: 206 | return "cudaErrorMemoryAllocation"; 207 | 208 | case cudaErrorInitializationError: 209 | return "cudaErrorInitializationError"; 210 | 211 | case cudaErrorLaunchFailure: 212 | return "cudaErrorLaunchFailure"; 213 | 214 | case cudaErrorPriorLaunchFailure: 215 | return "cudaErrorPriorLaunchFailure"; 216 | 217 | case cudaErrorLaunchTimeout: 218 | return "cudaErrorLaunchTimeout"; 219 | 220 | case cudaErrorLaunchOutOfResources: 221 | return "cudaErrorLaunchOutOfResources"; 222 | 223 | case cudaErrorInvalidDeviceFunction: 224 | return 
"cudaErrorInvalidDeviceFunction"; 225 | 226 | case cudaErrorInvalidConfiguration: 227 | return "cudaErrorInvalidConfiguration"; 228 | 229 | case cudaErrorInvalidDevice: 230 | return "cudaErrorInvalidDevice"; 231 | 232 | case cudaErrorInvalidValue: 233 | return "cudaErrorInvalidValue"; 234 | 235 | case cudaErrorInvalidPitchValue: 236 | return "cudaErrorInvalidPitchValue"; 237 | 238 | case cudaErrorInvalidSymbol: 239 | return "cudaErrorInvalidSymbol"; 240 | 241 | case cudaErrorMapBufferObjectFailed: 242 | return "cudaErrorMapBufferObjectFailed"; 243 | 244 | case cudaErrorUnmapBufferObjectFailed: 245 | return "cudaErrorUnmapBufferObjectFailed"; 246 | 247 | case cudaErrorInvalidHostPointer: 248 | return "cudaErrorInvalidHostPointer"; 249 | 250 | case cudaErrorInvalidDevicePointer: 251 | return "cudaErrorInvalidDevicePointer"; 252 | 253 | case cudaErrorInvalidTexture: 254 | return "cudaErrorInvalidTexture"; 255 | 256 | case cudaErrorInvalidTextureBinding: 257 | return "cudaErrorInvalidTextureBinding"; 258 | 259 | case cudaErrorInvalidChannelDescriptor: 260 | return "cudaErrorInvalidChannelDescriptor"; 261 | 262 | case cudaErrorInvalidMemcpyDirection: 263 | return "cudaErrorInvalidMemcpyDirection"; 264 | 265 | case cudaErrorAddressOfConstant: 266 | return "cudaErrorAddressOfConstant"; 267 | 268 | case cudaErrorTextureFetchFailed: 269 | return "cudaErrorTextureFetchFailed"; 270 | 271 | case cudaErrorTextureNotBound: 272 | return "cudaErrorTextureNotBound"; 273 | 274 | case cudaErrorSynchronizationError: 275 | return "cudaErrorSynchronizationError"; 276 | 277 | case cudaErrorInvalidFilterSetting: 278 | return "cudaErrorInvalidFilterSetting"; 279 | 280 | case cudaErrorInvalidNormSetting: 281 | return "cudaErrorInvalidNormSetting"; 282 | 283 | case cudaErrorMixedDeviceExecution: 284 | return "cudaErrorMixedDeviceExecution"; 285 | 286 | case cudaErrorCudartUnloading: 287 | return "cudaErrorCudartUnloading"; 288 | 289 | case cudaErrorUnknown: 290 | return "cudaErrorUnknown"; 291 | 292 | case cudaErrorNotYetImplemented: 293 | return "cudaErrorNotYetImplemented"; 294 | 295 | case cudaErrorMemoryValueTooLarge: 296 | return "cudaErrorMemoryValueTooLarge"; 297 | 298 | case cudaErrorInvalidResourceHandle: 299 | return "cudaErrorInvalidResourceHandle"; 300 | 301 | case cudaErrorNotReady: 302 | return "cudaErrorNotReady"; 303 | 304 | case cudaErrorInsufficientDriver: 305 | return "cudaErrorInsufficientDriver"; 306 | 307 | case cudaErrorSetOnActiveProcess: 308 | return "cudaErrorSetOnActiveProcess"; 309 | 310 | case cudaErrorInvalidSurface: 311 | return "cudaErrorInvalidSurface"; 312 | 313 | case cudaErrorNoDevice: 314 | return "cudaErrorNoDevice"; 315 | 316 | case cudaErrorECCUncorrectable: 317 | return "cudaErrorECCUncorrectable"; 318 | 319 | case cudaErrorSharedObjectSymbolNotFound: 320 | return "cudaErrorSharedObjectSymbolNotFound"; 321 | 322 | case cudaErrorSharedObjectInitFailed: 323 | return "cudaErrorSharedObjectInitFailed"; 324 | 325 | case cudaErrorUnsupportedLimit: 326 | return "cudaErrorUnsupportedLimit"; 327 | 328 | case cudaErrorDuplicateVariableName: 329 | return "cudaErrorDuplicateVariableName"; 330 | 331 | case cudaErrorDuplicateTextureName: 332 | return "cudaErrorDuplicateTextureName"; 333 | 334 | case cudaErrorDuplicateSurfaceName: 335 | return "cudaErrorDuplicateSurfaceName"; 336 | 337 | case cudaErrorDevicesUnavailable: 338 | return "cudaErrorDevicesUnavailable"; 339 | 340 | case cudaErrorInvalidKernelImage: 341 | return "cudaErrorInvalidKernelImage"; 342 | 343 | case 
cudaErrorNoKernelImageForDevice: 344 | return "cudaErrorNoKernelImageForDevice"; 345 | 346 | case cudaErrorIncompatibleDriverContext: 347 | return "cudaErrorIncompatibleDriverContext"; 348 | 349 | case cudaErrorPeerAccessAlreadyEnabled: 350 | return "cudaErrorPeerAccessAlreadyEnabled"; 351 | 352 | case cudaErrorPeerAccessNotEnabled: 353 | return "cudaErrorPeerAccessNotEnabled"; 354 | 355 | case cudaErrorDeviceAlreadyInUse: 356 | return "cudaErrorDeviceAlreadyInUse"; 357 | 358 | case cudaErrorProfilerDisabled: 359 | return "cudaErrorProfilerDisabled"; 360 | 361 | case cudaErrorProfilerNotInitialized: 362 | return "cudaErrorProfilerNotInitialized"; 363 | 364 | case cudaErrorProfilerAlreadyStarted: 365 | return "cudaErrorProfilerAlreadyStarted"; 366 | 367 | case cudaErrorProfilerAlreadyStopped: 368 | return "cudaErrorProfilerAlreadyStopped"; 369 | 370 | /* Since CUDA 4.0*/ 371 | case cudaErrorAssert: 372 | return "cudaErrorAssert"; 373 | 374 | case cudaErrorTooManyPeers: 375 | return "cudaErrorTooManyPeers"; 376 | 377 | case cudaErrorHostMemoryAlreadyRegistered: 378 | return "cudaErrorHostMemoryAlreadyRegistered"; 379 | 380 | case cudaErrorHostMemoryNotRegistered: 381 | return "cudaErrorHostMemoryNotRegistered"; 382 | 383 | /* Since CUDA 5.0 */ 384 | case cudaErrorOperatingSystem: 385 | return "cudaErrorOperatingSystem"; 386 | 387 | case cudaErrorPeerAccessUnsupported: 388 | return "cudaErrorPeerAccessUnsupported"; 389 | 390 | case cudaErrorLaunchMaxDepthExceeded: 391 | return "cudaErrorLaunchMaxDepthExceeded"; 392 | 393 | case cudaErrorLaunchFileScopedTex: 394 | return "cudaErrorLaunchFileScopedTex"; 395 | 396 | case cudaErrorLaunchFileScopedSurf: 397 | return "cudaErrorLaunchFileScopedSurf"; 398 | 399 | case cudaErrorSyncDepthExceeded: 400 | return "cudaErrorSyncDepthExceeded"; 401 | 402 | case cudaErrorLaunchPendingCountExceeded: 403 | return "cudaErrorLaunchPendingCountExceeded"; 404 | 405 | case cudaErrorNotPermitted: 406 | return "cudaErrorNotPermitted"; 407 | 408 | case cudaErrorNotSupported: 409 | return "cudaErrorNotSupported"; 410 | 411 | /* Since CUDA 6.0 */ 412 | case cudaErrorHardwareStackError: 413 | return "cudaErrorHardwareStackError"; 414 | 415 | case cudaErrorIllegalInstruction: 416 | return "cudaErrorIllegalInstruction"; 417 | 418 | case cudaErrorMisalignedAddress: 419 | return "cudaErrorMisalignedAddress"; 420 | 421 | case cudaErrorInvalidAddressSpace: 422 | return "cudaErrorInvalidAddressSpace"; 423 | 424 | case cudaErrorInvalidPc: 425 | return "cudaErrorInvalidPc"; 426 | 427 | case cudaErrorIllegalAddress: 428 | return "cudaErrorIllegalAddress"; 429 | 430 | /* Since CUDA 6.5*/ 431 | case cudaErrorInvalidPtx: 432 | return "cudaErrorInvalidPtx"; 433 | 434 | case cudaErrorInvalidGraphicsContext: 435 | return "cudaErrorInvalidGraphicsContext"; 436 | 437 | case cudaErrorStartupFailure: 438 | return "cudaErrorStartupFailure"; 439 | 440 | case cudaErrorApiFailureBase: 441 | return "cudaErrorApiFailureBase"; 442 | 443 | /* Since CUDA 8.0*/ 444 | case cudaErrorNvlinkUncorrectable : 445 | return "cudaErrorNvlinkUncorrectable"; 446 | 447 | /* Since CUDA 8.5*/ 448 | case cudaErrorJitCompilerNotFound : 449 | return "cudaErrorJitCompilerNotFound"; 450 | 451 | /* Since CUDA 9.0*/ 452 | case cudaErrorCooperativeLaunchTooLarge : 453 | return "cudaErrorCooperativeLaunchTooLarge"; 454 | 455 | } 456 | 457 | return ""; 458 | } 459 | 460 | template< typename T > 461 | void _cudaCheckError(T result, char const *const func, const char *const file, int const line) 462 | { 463 | if (result) 464 | 
{ 465 | fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", 466 | file, line, static_cast<int>(result), cudaErrorToString(result), func); 467 | CUcontext ctx; 468 | cuCtxGetCurrent(&ctx); 469 | cuCtxDestroy(ctx); 470 | exit(EXIT_FAILURE); 471 | } 472 | } 473 | #define cudaCheckError(val) _cudaCheckError ( (val), #val, __FILE__, __LINE__ ) 474 | -------------------------------------------------------------------------------- /test/util/init_cuda.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "cuda_error.h" 6 | void init_cuda() 7 | { 8 | // disable CUDA from caching SASS programs 9 | setenv("CUDA_CACHE_DISABLE", "1", 1); 10 | 11 | CUcontext context; 12 | CUdevice device; 13 | cudaCheckError(cuInit(0)); 14 | cudaCheckError(cuDeviceGet(&device, 0)); 15 | cudaCheckError(cuCtxCreate(&context, 0, device)); 16 | 17 | char name[256]; 18 | int major = 0, minor = 0; 19 | int compute_mode = -1; 20 | cudaCheckError(cuDeviceGetName(name, 100, device)); 21 | cudaCheckError(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)); 22 | cudaCheckError(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)); 23 | cudaCheckError(cuDeviceGetAttribute(&compute_mode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, device)); 24 | assert(compute_mode != CU_COMPUTEMODE_PROHIBITED && "Device is running in Compute Mode Prohibited"); 25 | printf("Using CUDA device %s: Compute SM %d.%d\n", name, major, minor); 26 | } 27 | -------------------------------------------------------------------------------- /test/util/profiler.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include <stdio.h> 3 | #include <string.h> 4 | 5 | #ifdef _WIN32 6 | #ifndef WIN32_LEAN_AND_MEAN 7 | #define WIN32_LEAN_AND_MEAN 8 | #endif 9 | #include <windows.h> 10 | 11 | LARGE_INTEGER perf_get_tick() 12 | { 13 | LARGE_INTEGER result; 14 | QueryPerformanceCounter(&result); 15 | return result; 16 | } 17 | 18 | float perf_seconds_elapsed(LARGE_INTEGER begin, LARGE_INTEGER end) 19 | { 20 | LARGE_INTEGER frequency; 21 | QueryPerformanceFrequency(&frequency); 22 | return (float)(end.QuadPart - begin.QuadPart) / 23 | (float)frequency.QuadPart; 24 | } 25 | 26 | struct perf_TimingInfo 27 | { 28 | const char *label; 29 | LARGE_INTEGER begin; 30 | LARGE_INTEGER end; 31 | bool counting; 32 | float t_sum; 33 | float t_last; 34 | int hits; 35 | }; 36 | 37 | #else // ifdef _WIN32 38 | #include <time.h> 39 | 40 | timespec perf_get_tick() 41 | { 42 | timespec result; 43 | clock_gettime(CLOCK_REALTIME, &result); 44 | return result; 45 | } 46 | 47 | float perf_seconds_elapsed(timespec begin, timespec end) 48 | { 49 | time_t dsec = end.tv_sec - begin.tv_sec; 50 | long dnsec = end.tv_nsec - begin.tv_nsec; 51 | double result = (double)dsec + (double)dnsec / 1000000000.0; 52 | return (float)result; 53 | } 54 | 55 | struct perf_TimingInfo 56 | { 57 | const char *label; 58 | timespec begin; 59 | timespec end; 60 | bool counting; 61 | float t_sum; 62 | float t_last; 63 | int hits; 64 | }; 65 | 66 | #endif 67 | 68 | #ifdef ENABLE_TIMING 69 | static perf_TimingInfo perf_timing_blocks[1024]; 70 | static int perf_count = 0; 71 | 72 | void TIMING(const char *label) 73 | { 74 | perf_TimingInfo *block = 0; 75 | for (int i = 0; i < perf_count; i++) 76 | { 77 | if (strcmp(label, perf_timing_blocks[i].label) == 0) 78 | { 79 | block = &perf_timing_blocks[i]; 80 | break; 81 | } 82 | } 83 | if (!block) 84 | { 85 | block =
&perf_timing_blocks[perf_count]; 86 | perf_count++; 87 | block->hits = 0; 88 | block->t_sum = 0.0f; 89 | block->t_last = 0.0f; 90 | block->label = label; 91 | } 92 | if (block->counting) 93 | { 94 | block->hits++; 95 | block->end = perf_get_tick(); 96 | float elapsed = perf_seconds_elapsed(block->begin, block->end); 97 | block->t_sum += elapsed; 98 | block->t_last = elapsed; 99 | block->counting = false; 100 | } 101 | else 102 | { 103 | block->counting = true; 104 | block->begin = perf_get_tick(); 105 | } 106 | } 107 | 108 | void TIMING_CLEAR() { perf_count = 0; } 109 | 110 | void TIMING_SUMMARY() 111 | { 112 | printf("AVG \tLAST \tHITS\tNAME\n"); 113 | for (int i = 0; i < perf_count; i++) 114 | { 115 | perf_TimingInfo block = perf_timing_blocks[i]; 116 | int hits = block.hits; 117 | float avg = 1000.0f * block.t_sum / block.hits; 118 | float last = 1000.0f * block.t_last; 119 | printf("%.2f\t%.2f\t%04d\t%s\n", avg, last, hits, block.label); 120 | } 121 | } 122 | 123 | float TIMING_GET_AVG(const char *label) 124 | { 125 | perf_TimingInfo *block = 0; 126 | for (int i = 0; i < perf_count; i++) 127 | { 128 | if (strcmp(label, perf_timing_blocks[i].label) == 0) 129 | { 130 | block = &perf_timing_blocks[i]; 131 | break; 132 | } 133 | } 134 | if (!block) 135 | return -1.0f; 136 | return block->t_sum / block->hits; 137 | } 138 | 139 | #else 140 | void TIMING(const char *label) { } 141 | void TIMING_CLEAR() { } 142 | void TIMING_SUMMARY() { } 143 | void TIMING_GET_AVG(const char *label) { } 144 | #endif 145 | -------------------------------------------------------------------------------- /test/util/test_models.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "sdf_builder.h" 3 | 4 | sdf_node_t *model_simple01() { return sdf_box(1.0f, 0.5f, 0.25f); } 5 | sdf_node_t *model_simple02() { return sdf_cylinder(1.0f, 0.5f); } 6 | sdf_node_t *model_simple03() { return sdf_sphere(0.98f); } 7 | sdf_node_t *model_simple04() { return sdf_plane(0.98f); } 8 | sdf_node_t *model_simple05() { return sdf_rotate(sdf_translate(sdf_box(0.98f, 0.63f, 0.33f), 0.1f,-0.2f,0.3f), 0.1f,0.2f,-0.3f); } 9 | sdf_node_t *model_simple06() { return sdf_rotate(sdf_translate(sdf_sphere(0.98f), 0.1f,-0.2f,0.3f), 0.1f,0.2f,-0.3f); } 10 | sdf_node_t *model_simple07() { return sdf_rotate(sdf_translate(sdf_cylinder(0.98f, 0.63f), 0.1f,-0.2f,0.3f), 0.1f,0.2f,-0.3f); } 11 | sdf_node_t *model_simple08() { return sdf_rotate(sdf_translate(sdf_plane(0.98f), 0.1f,-0.2f,0.3f), 0.1f,0.2f,-0.3f); } 12 | sdf_node_t *model_simple09() { return sdf_blend(0.4f, sdf_sphere(1.0f), sdf_cylinder(0.3f,1.0f)); } 13 | sdf_node_t *model_simple10() { 14 | sdf_node_t *d1 = sdf_box(0.98f, 0.63f, 0.33f); 15 | sdf_rotate(d1, -0.3f, 0.2f, -0.1f); 16 | sdf_translate(d1, 0.3f, -0.5f, 0.3f); 17 | sdf_node_t *d2 = sdf_sphere(0.63f); 18 | sdf_rotate(d2, 0.7f, 0.8f, -0.3f); 19 | sdf_translate(d2, -0.6f, +0.5f, 0.2f); 20 | sdf_node_t *d = sdf_union(d1, d2); 21 | return d; 22 | } 23 | sdf_node_t *model_simple11() { return sdf_subtract(sdf_box(1.0f,1.0f,1.0f), sdf_translate(sdf_sphere(0.5f), 0,1.0f,0)); } 24 | sdf_node_t *model_simple12() { return sdf_subtract(sdf_rotate(sdf_box(1.0f,1.0f,1.0f), 0.77f,0.77f,0), sdf_sphere(0.5f)); } 25 | sdf_node_t *model_simple13() { return sdf_subtract(sdf_box(1.0f,1.0f,1.0f), sdf_cylinder(0.5f,2.0f)); } 26 | sdf_node_t *model_simple14() { return sdf_union(sdf_box(0.5f,0.5f,0.5f), sdf_translate(sdf_sphere(0.25f),0.5f,0,0)); } 27 | sdf_node_t *model_simple15() { return 
sdf_intersect(sdf_box(0.5f,0.5f,0.5f), sdf_translate(sdf_sphere(0.25f),0.5f,0,0)); } 28 | sdf_node_t *model_simple16() { return sdf_subtract(sdf_box(0.5f,0.5f,0.5f), sdf_translate(sdf_sphere(0.25f),0.5f,0,0)); } 29 | 30 | sdf_node_t *model_complex_2d_1() 31 | { 32 | auto *d1 = sdf_translate(sdf_box(0.9f, 0.1f, 0.5f), 0.0f, 0.5f, 0.0f); 33 | auto *d2 = sdf_translate(sdf_box(0.8f, 0.05f, 0.5f), 0.0f, -0.5f, 0.0f); 34 | auto *d3 = sdf_sphere(0.5f); 35 | auto *d4 = sdf_box(1.0f, 0.2f, 0.5f); 36 | return sdf_rotate(sdf_translate(sdf_union(sdf_union(d1, d2), sdf_subtract(d3, d4)), 0.1f, -0.2f, 0.0f), 0.0f, 0.0f, 0.2f); 37 | } 38 | 39 | sdf_node_t *model_complex02() 40 | { 41 | sdf_node_t *a1 = sdf_plane(0.3f); 42 | sdf_node_t *a2 = sdf_cylinder(0.2f, 0.3f); 43 | sdf_node_t *a3 = sdf_box(0.3f,0.3f,0.3f); 44 | sdf_node_t *a4 = sdf_sphere(0.5f); 45 | sdf_node_t *a5 = sdf_union(a1,a2); 46 | sdf_node_t *a6 = sdf_subtract(a3,a4); 47 | sdf_node_t *a7 = sdf_union(a5,a6); 48 | sdf_node_t *b1 = sdf_plane(0.3f); 49 | sdf_node_t *b2 = sdf_cylinder(0.2f, 0.3f); 50 | sdf_node_t *b3 = sdf_box(0.3f,0.3f,0.3f); 51 | sdf_node_t *b4 = sdf_sphere(0.5f); 52 | sdf_node_t *b5 = sdf_union(b1,b2); 53 | sdf_node_t *b6 = sdf_subtract(b3,b4); 54 | sdf_node_t *b7 = sdf_union(b5,b6); 55 | sdf_node_t *d = sdf_union(a7,b7); 56 | return d; 57 | } 58 | 59 | sdf_node_t *model_complex03() 60 | { 61 | float s = 0.3f; 62 | sdf_node_t *d1 = sdf_sphere(1.0f*s); 63 | sdf_node_t *c1 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(c1, 0,0,0); 64 | sdf_node_t *c2 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(c2, 1.54f,0,0); 65 | sdf_node_t *c3 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(c3, 0,0,1.54f); 66 | sdf_node_t *c12 = sdf_union(c1,c2); 67 | sdf_node_t *c123 = sdf_union(c12,c3); 68 | sdf_node_t *d2 = sdf_subtract(d1,c123); 69 | 70 | sdf_node_t *b1 = sdf_box(0.74f*s,0.74f*s,0.74f*s); 71 | sdf_node_t *d3 = sdf_intersect(d2,b1); 72 | 73 | sdf_node_t *s2 = sdf_sphere(0.3f*s); 74 | sdf_node_t *c5 = sdf_cylinder(0.1f*s, 0.8f*s); sdf_rotate(c5, 1.54f,0,0); 75 | sdf_node_t *c6 = sdf_cylinder(0.1f*s, 0.8f*s); sdf_rotate(c6, 0,0,0); 76 | sdf_node_t *c56 = sdf_union(c5,c6); sdf_rotate(c56, 0.7f, 0.0f, 0.0f); sdf_translate(c56, 1.0f*s, 0.0f, 0.0f); 77 | sdf_node_t *s2c56 = sdf_union(s2,c56); 78 | sdf_node_t *d4 = sdf_union(d3, s2c56); 79 | 80 | sdf_node_t *b2 = sdf_box(0.2f*s,0.2f*s,0.2f*s); sdf_translate(b2,-1.0f*s,0,0); sdf_rotate(b2,0.77f,0.77f,0); 81 | sdf_node_t *d5 = sdf_union(d4,b2); 82 | 83 | sdf_node_t *d = d5; 84 | return d; 85 | } 86 | 87 | sdf_node_t *model_complex04() 88 | { 89 | float s = 1.5f; 90 | sdf_node_t *d1 = sdf_sphere(1.0f*s); 91 | sdf_node_t *d2 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(d2, 0,0,0); 92 | sdf_node_t *d3 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(d3, 1.54f,0,0); 93 | sdf_node_t *d4 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(d4, 0,0,1.54f); 94 | sdf_node_t *d5 = sdf_subtract(d1,d2); 95 | sdf_node_t *d6 = sdf_subtract(d5,d3); 96 | sdf_node_t *d7 = sdf_subtract(d6,d4); 97 | sdf_node_t *d8 = sdf_plane(0.74f*s); sdf_rotate(d8, 0,0,1.54f); 98 | sdf_node_t *d9 = sdf_plane(0.74f*s); sdf_rotate(d9, 0,0,-1.54f); 99 | sdf_node_t *d10 = sdf_plane(0.74f*s); sdf_rotate(d10, 0,1.54f,0); 100 | sdf_node_t *d11 = sdf_plane(0.74f*s); sdf_rotate(d11, 0,-1.54f,0); 101 | sdf_node_t *d12 = sdf_plane(0.74f*s); sdf_rotate(d12, 0,0,0); 102 | sdf_node_t *d13 = sdf_plane(0.74f*s); sdf_rotate(d13, 0,0,3.14f); 103 | sdf_node_t *d14 = sdf_intersect(d7,d8); 104 | sdf_node_t *d15 = sdf_intersect(d14,d9); 105 | sdf_node_t *d16 = 
sdf_intersect(d15,d10); 106 | sdf_node_t *d17 = sdf_intersect(d16,d11); 107 | sdf_node_t *d18 = sdf_intersect(d17,d12); 108 | sdf_node_t *d19 = sdf_intersect(d18,d13); 109 | sdf_node_t *d20 = sdf_sphere(0.3f*s); 110 | sdf_node_t *d21 = sdf_union(d19, d20); 111 | sdf_node_t *d22 = sdf_cylinder(0.1f*s, 0.8f*s); sdf_rotate(d22, 1.54f,0,0); 112 | sdf_node_t *d23 = sdf_cylinder(0.1f*s, 0.8f*s); sdf_rotate(d23, 0,0,0); 113 | sdf_node_t *d24 = sdf_union(d21, d22); 114 | sdf_node_t *d25 = sdf_union(d24, d23); 115 | return d25; 116 | } 117 | 118 | sdf_node_t *model_complex05() 119 | { 120 | float s = 1.5f; 121 | sdf_node_t *d1 = sdf_sphere(1.0f*s); 122 | sdf_node_t *d2 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(d2, 0,0,0); 123 | sdf_node_t *d3 = sdf_subtract(d1,d2); 124 | sdf_node_t *d4 = sdf_plane(0.44f*s); sdf_rotate(d4, 0,0,1.54f); 125 | sdf_node_t *d5 = sdf_plane(0.44f*s); sdf_rotate(d5, 0,0,-1.54f); 126 | sdf_node_t *d6 = sdf_intersect(d3,d4); 127 | sdf_node_t *d7 = sdf_intersect(d6,d5); 128 | return d7; 129 | } 130 | 131 | sdf_node_t *model_chair1_2d() 132 | { 133 | float k = 0.5f; 134 | sdf_node_t *seat = sdf_box(1.0f*k, 0.1f*k, 1.0f); 135 | sdf_node_t *leg1 = sdf_rotate(sdf_translate(sdf_box(0.1f*k, 1.0f*k, 1.0f), -1.0f*k,0,0), 0,0,-0.2f); 136 | sdf_node_t *leg2 = sdf_rotate(sdf_translate(sdf_box(0.1f*k, 1.0f*k, 1.0f), +1.0f*k,0,0), 0,0,+0.1f); 137 | sdf_node_t *legs = sdf_translate(sdf_union(leg1, leg2), 0,-1.0f*k,0); 138 | sdf_node_t *back = sdf_rotate(sdf_translate(sdf_box(0.1f*k, 1.0f*k, 1.0f), 1.0f*k,1.0f*k,0), 0,0,-0.1f); 139 | sdf_node_t *seat_and_legs = sdf_union(seat, legs); 140 | sdf_node_t *chair = sdf_union(seat_and_legs, back); 141 | return chair; 142 | } 143 | 144 | sdf_node_t *model_chair2_2d() 145 | { 146 | float k = 0.5f; 147 | sdf_node_t *seat = sdf_rotate(sdf_box(0.8f*k, 0.15f*k, 1.0f), 0,0,0.2f); 148 | sdf_node_t *leg1 = sdf_rotate(sdf_translate(sdf_box(0.1f*k, 1.0f*k, 1.0f), -0.75f*k,0,0), 0,0,-0.05f); 149 | sdf_node_t *leg2 = sdf_rotate(sdf_translate(sdf_box(0.1f*k, 1.0f*k, 1.0f), +0.8f*k,0.05f*k,0), 0,0,0.1f); 150 | sdf_node_t *mid = sdf_translate(sdf_box(0.8f*k, 0.05f*k, 1.0f), 0,-1.0f*k,0); 151 | sdf_node_t *legs = sdf_intersect(sdf_translate(sdf_union(leg1, leg2), 0,-1.0f*k,0), 152 | sdf_rotate(sdf_plane(1.9f*k), 0,0,-3.14f/2.0f)); 153 | sdf_node_t *seat_and_legs = sdf_union(seat, legs); 154 | sdf_node_t *chair = sdf_union(seat_and_legs, mid); 155 | return chair; 156 | } 157 | 158 | sdf_node_t *model_translated_sphere() 159 | { 160 | return 161 | sdf_translate(sdf_sphere(1.0f), -0.5f,0.0f,0.0f); 162 | } 163 | 164 | sdf_node_t *model_intersection() 165 | { 166 | return 167 | sdf_intersect(sdf_translate(sdf_sphere(0.5f), -0.2f,0.0f,0.0f), 168 | sdf_translate(sdf_sphere(0.5f), +0.2f,0.0f,0.0f)); 169 | } 170 | 171 | sdf_node_t *model_two_spheres() 172 | { 173 | return 174 | sdf_union(sdf_translate(sdf_sphere(0.1f), -0.5f,0.0f,0.0f), 175 | sdf_translate(sdf_sphere(0.5f), +0.3f,0.0f,0.0f)); 176 | } 177 | 178 | sdf_node_t *model_two_spheres_equal() 179 | { 180 | return 181 | sdf_union(sdf_translate(sdf_sphere(0.3f), -0.4f,0.0f,0.0f), 182 | sdf_translate(sdf_sphere(0.3f), +0.4f,0.0f,0.0f)); 183 | } 184 | 185 | sdf_node_t *model_four_spheres() 186 | { 187 | return 188 | sdf_union( 189 | sdf_union( 190 | sdf_translate(sdf_sphere(0.2f), 0.0f,0.7f,0.0f), 191 | sdf_translate(sdf_sphere(0.2f), 0.0f,-0.7f,0.0f)), 192 | sdf_union( 193 | sdf_translate(sdf_sphere(0.4f), -0.5f,0.0f,0.0f), 194 | sdf_translate(sdf_sphere(0.4f), +0.5f,0.0f,0.0f))); 195 | } 196 | 197 | sdf_node_t 
*model_scissor() 198 | { 199 | return 200 | sdf_union( 201 | sdf_translate(sdf_sphere(0.4f), 0.0f,0.6f,0.0f), 202 | sdf_intersect( 203 | sdf_translate(sdf_sphere(0.8f), -0.5f,0.0f,0.0f), 204 | sdf_translate(sdf_sphere(0.8f), +0.5f,0.0f,0.0f))); 205 | } 206 | 207 | sdf_node_t *model_fillet() 208 | { 209 | return 210 | sdf_union 211 | ( 212 | sdf_translate(sdf_sphere(0.25f), 0.25f,0.25f,0.0f), 213 | sdf_intersect 214 | ( 215 | sdf_rotate(sdf_plane(0.53f), 0.0f,0.0f,3.1415f/4.0f), 216 | sdf_box(0.5f, 0.5f, 0.5f) 217 | ) 218 | ); 219 | } 220 | 221 | sdf_node_t *model_two_box() 222 | { 223 | return 224 | sdf_union 225 | ( 226 | sdf_translate(sdf_box(0.55f,0.05f,1.0f), 0.25f,0.5f,0.0f), 227 | sdf_translate(sdf_box(0.05f,0.55f,1.0f), -0.25f,0.0f,0.0f) 228 | ); 229 | } 230 | 231 | sdf_node_t *model_two_box_unequal() 232 | { 233 | return 234 | sdf_union 235 | ( 236 | sdf_translate(sdf_box(0.35f,0.05f,1.0f), 0.15f,0.5f,0.0f), 237 | sdf_translate(sdf_box(0.05f,0.55f,1.0f), -0.25f,0.0f,0.0f) 238 | ); 239 | } 240 | 241 | sdf_node_t *model_offset_box() 242 | { 243 | return sdf_rotate(sdf_translate(sdf_box(0.5f,0.5f,0.5f), 0.2f, -0.2f, 0.0f), 0.0f, 0.0f, -0.5f); 244 | } 245 | 246 | sdf_node_t *model_motion0(int which) 247 | { 248 | if (which == 0) { 249 | auto *d1 = sdf_box(0.3f, 0.3f, 0.3f); 250 | auto *d2 = sdf_box(0.2f, 0.2f, 0.2f); 251 | d2 = sdf_rotate(sdf_translate(d2, +0.3f, +0.2f, 0.0f), 0.0f, 0.0f, 0.3f); 252 | auto *d5 = sdf_union(d1, d2); 253 | d5 = sdf_rotate(d5, 0.0f, 0.0f, 0.2f); 254 | d5 = sdf_translate(d5, 0.45f, -0.5f, 0.0f); 255 | auto *d6 = sdf_sphere(0.3f); 256 | d6 = sdf_translate(d6, -0.4f, +0.2f, 0.0); 257 | return sdf_union(d5, d6); 258 | } else { 259 | auto *d1 = sdf_box(0.3f, 0.3f, 0.3f); 260 | auto *d2 = sdf_box(0.2f, 0.2f, 0.2f); 261 | d2 = sdf_rotate(sdf_translate(d2, +0.3f, +0.2f, 0.0f), 0.0f, 0.0f, 0.3f); 262 | auto *d5 = sdf_union(d1, d2); 263 | d5 = sdf_rotate(d5, 0.0f, 0.0f, 0.2f); 264 | d5 = sdf_rotate(sdf_translate(d5, 0.45f, -0.1f, 0.0f), 0.0f, 0.0f, -0.3f); 265 | auto *d6 = sdf_sphere(0.3f); 266 | d6 = sdf_translate(d6, -0.4f, +0.2f, 0.0); 267 | return sdf_union(d5, d6); 268 | } 269 | } 270 | 271 | sdf_node_t *model_motion1(int which) 272 | { 273 | if (which == 0) { 274 | auto *d1 = sdf_box(0.3f, 0.3f, 0.3f); 275 | auto *d2 = sdf_box(0.2f, 0.2f, 0.2f); 276 | auto *d3 = sdf_box(0.2f, 0.2f, 0.2f); 277 | d2 = sdf_rotate(sdf_translate(d2, +0.3f, +0.2f, 0.0f), 0.0f, 0.0f, 0.3f); 278 | d3 = sdf_rotate(sdf_translate(d3, -0.3f, -0.2f, 0.0f), 0.0f, 0.0f, 0.7f); 279 | auto *d4 = sdf_union(d2, d3); 280 | auto *d5 = sdf_union(d1, d4); 281 | d5 = sdf_rotate(d5, 0.0f, 0.0f, 0.2f); 282 | d5 = sdf_translate(d5, 0.4f, -0.2f, 0.0f); 283 | auto *d6 = sdf_sphere(0.3f); 284 | d6 = sdf_translate(d6, -0.3f, +0.2f, 0.0); 285 | return sdf_union(d5, d6); 286 | } else { 287 | auto *d1 = sdf_box(0.3f, 0.3f, 0.3f); 288 | auto *d2 = sdf_box(0.2f, 0.2f, 0.2f); 289 | auto *d3 = sdf_box(0.2f, 0.2f, 0.2f); 290 | d2 = sdf_rotate(sdf_translate(d2, +0.3f, +0.2f, 0.0f), 0.0f, 0.0f, 0.3f); 291 | d3 = sdf_rotate(sdf_translate(d3, -0.3f, -0.2f, 0.0f), 0.0f, 0.0f, 0.7f); 292 | auto *d4 = sdf_union(d2, d3); 293 | auto *d5 = sdf_union(d1, d4); 294 | d5 = sdf_rotate(d5, 0.0f, 0.0f, 0.2f); 295 | d5 = sdf_translate(d5, 0.45f, -0.1f, 0.0f); 296 | auto *d6 = sdf_sphere(0.3f); 297 | d6 = sdf_translate(d6, -0.3f, +0.2f, 0.0); 298 | return sdf_union(d5, d6); 299 | } 300 | } 301 | --------------------------------------------------------------------------------