├── LICENSE.txt
├── README.md
├── doc
└── overview-small.svg
├── src
├── backend_glsl.h
├── backend_ptx.h
├── backend_sass.h
├── frep.h
├── frep_builder.h
├── frep_eval.h
├── frep_serialize.h
└── sass_6_x
│ ├── backend.h
│ ├── blocks.h
│ ├── bytecode.h
│ ├── cubin.h
│ ├── instruction.h
│ ├── registers.h
│ ├── scheduler.h
│ └── simulator.h
└── test
├── backend_glsl.cpp
├── backend_ptx.cpp
├── backend_sass_6_x.cpp
├── backend_sass_6_x_mock.cpp
├── linker.cpp
├── test1.cubin
├── test1.ptx
├── test2.cubin
├── test2.ptx
└── util
├── cuda_error.h
├── init_cuda.h
├── profiler.h
└── test_models.h
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015-2019 Simen Haugo
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # fast-csg
2 |
3 | 
4 |
5 | A compiler for functional representations (see e.g. OpenSCAD, libfive, Hyperfun) that directly generates executable bytecode instructions for GPU architectures.
6 |
7 | It gives you the benefit of fast tree evaluation as the tree structure is compiled into optimized machine code instructions (which makes the program compute-limited, not memory bandwidth-limited), while avoiding the long compile times that you would get by compiling to an intermediate target, such as GLSL, PTX, CUDA or NVVM IR.
8 |
9 | Unlike NVIDIA's closed-source compiler chain, we focus on compilation speed, aiming for sub-millisecond time from compilation start to having the kernel uploaded to the GPU and ready to run.
10 |
11 | Possible applications:
12 |
13 | * Fast and parallelized hypothesis generation and testing, for e.g. program synthesis or 3D reconstruction.
14 | * GPU-accelerated visualization where you can live edit the CSG tree structure with instant feedback
15 |
16 | ## Project status
17 |
18 | This project is currently in limbo. I'm open-sourcing it in the event that anyone finds some parts of it useful. To that end, here's a list of stuff that's in here:
19 |
20 | * CSG tree grammar and interpreter (see [src/frep.h](src/frep.h))
21 | * Complete CSG->GLSL compiler (see [src/backend_glsl.h](src/backend_glsl.h))
22 | * Complete CSG->PTX compiler (see [src/backend_ptx.h](src/backend_ptx.h))
23 | * Partial CSG->SASS 6.x compiler (missing Cubin linking stage)
24 |
Naturally, implementing a custom SASS compiler is difficult, as NVIDIA does not publicly document the ISA and their PTX compiler is closed-source. With the help of Scott Gray's MaxAs (a reverse engineering of the Maxwell SASS), I was able to implement a rudimentary compiler for compute capability 6.x devices (Maxwell, Pascal families). Although the succeeding families Volta and Turing have not made huge changes to the ISA, it's a tedious task to implement backends for all of them.
26 |
27 | Nonetheless, you can find
28 | * Scheduler and register allocation ([src/sass_6_x/backend.h](src/sass_6_x/backend.h))
29 | * Bytecode generation ([src/sass_6_x/bytecode.h](src/sass_6_x/bytecode.h))
30 |
--------------------------------------------------------------------------------
/doc/overview-small.svg:
--------------------------------------------------------------------------------
1 |
2 |
68 |
--------------------------------------------------------------------------------
/src/backend_glsl.h:
--------------------------------------------------------------------------------
1 | // Developed by Simen Haugo.
2 | // See LICENSE.txt for copyright and licensing details (standard MIT License).
3 |
4 | // This is the code generation backend for GLSL (GL Shading Language).
5 | // The output is a stripped GLSL source code, meaning you must insert
6 | // it into a GLSL shader as necessary for your application.
7 |
8 | #pragma once
9 | #include "frep.h"
#include <stdio.h>
11 |
12 | // Generates a null-terminated string of GLSL code that computes
13 | //
14 | // Variables are expected to be defined:
15 | // vec3 p0;
16 | //
17 | // Output is stored in:
18 | // float d1 = f(p0.x, p0.y, p0.z);
19 | //
20 | // The following functions must be declared and linked into the GLSL:
21 | // float fBox(vec3 p, vec3 dim);
22 | // float fBoxCheap(vec3 p, vec3 dim);
23 | // float fCylinder(vec3 p, float r, float h);
// float fSphere(vec3 p, float r);
25 | //
26 | char *frep_compile_to_glsl(frep_t *f);
27 |
28 | //////////////////////////////////////////////////////////////////
29 | // Implementation
30 | //////////////////////////////////////////////////////////////////
31 |
32 | namespace backend_glsl {
33 |
// Code-generation state threaded through the recursive GLSL emitter.
struct glsl_t
{
    int destination;  // next unused index for naming emitted p%d / d%d variables
    char *stream;     // write cursor into the output buffer (advanced past each sprintf)
};
39 |
40 | int _frep_compile_to_glsl(frep_t *node,
41 | glsl_t &s,
42 | frep_mat3_t R_root_to_parent=frep_identity_3x3,
43 | frep_vec3_t T_parent_rel_root=frep_null_3x1)
44 | {
45 | assert(node);
46 |
47 | frep_mat3_t R_root_to_this;
48 | frep_vec3_t T_this_rel_root;
49 | frep_get_global_transform(node, &R_root_to_this, &T_this_rel_root, R_root_to_parent, T_parent_rel_root);
50 |
51 | int my_index = s.destination++;
52 |
53 | // p^this = R_root_to_this*(p^0 - T_this_rel_root)
54 | // = R_root_to_this*p^0 + (-R_root_to_this*T_this_rel_root)
55 | {
56 | #define R(row,col) R_root_to_this.at(row,col)
57 | #define T(i) T_this_rel_root[i]
58 | float dtx = -(R(0,0)*T(0) + R(0,1)*T(1) + R(0,2)*T(2));
59 | float dty = -(R(1,0)*T(0) + R(1,1)*T(1) + R(1,2)*T(2));
60 | float dtz = -(R(2,0)*T(0) + R(2,1)*T(1) + R(2,2)*T(2));
61 | s.stream += sprintf(s.stream,
62 | "vec3 p%d = "
63 | "vec3(%f,%f,%f)*p0.x + "
64 | "vec3(%f,%f,%f)*p0.y + "
65 | "vec3(%f,%f,%f)*p0.z + "
66 | "vec3(%f,%f,%f);\n",
67 | my_index,
68 | R(0,0), R(1,0), R(2,0),
69 | R(0,1), R(1,1), R(2,1),
70 | R(0,2), R(1,2), R(2,2),
71 | dtx, dty, dtz
72 | );
73 | #undef R
74 | #undef T
75 | }
76 |
77 | if (frep_is_boolean(node))
78 | {
79 | assert(node->left);
80 | assert(node->right);
81 |
82 | int i_left = _frep_compile_to_glsl(node->left, s, R_root_to_this, T_this_rel_root);
83 | int i_right = _frep_compile_to_glsl(node->right, s, R_root_to_this, T_this_rel_root);
84 |
85 | s.stream += sprintf(s.stream, "float d%d = ", my_index);
86 |
87 | switch (node->opcode)
88 | {
89 | case FREP_UNION: s.stream += sprintf(s.stream, "min(d%d,d%d);\n", i_left, i_right); break;
90 | case FREP_INTERSECT: s.stream += sprintf(s.stream, "max(d%d,d%d);\n", i_left, i_right); break;
91 | case FREP_SUBTRACT: s.stream += sprintf(s.stream, "max(d%d,-d%d);\n", i_left, i_right); break;
92 | case FREP_BLEND: s.stream += sprintf(s.stream, "%f*d%d + %f*d%d;\n", node->blend.alpha, i_left, 1.0f-node->blend.alpha, i_right); break;
93 | default: assert(false && "Unexpected opcode");
94 | }
95 | }
96 | else if (frep_is_primitive(node))
97 | {
98 | s.stream += sprintf(s.stream, "float d%d = ", my_index);
99 |
100 | switch (node->opcode)
101 | {
102 | case FREP_BOX: s.stream += sprintf(s.stream, "fBox(p%d, vec3(%f, %f, %f));\n", my_index, node->box.width, node->box.height, node->box.depth); break;
103 | case FREP_BOX_CHEAP: s.stream += sprintf(s.stream, "fBoxCheap(p%d, vec3(%f, %f, %f));\n", my_index, node->box.width, node->box.height, node->box.depth); break;
104 | case FREP_SPHERE: s.stream += sprintf(s.stream, "fSphere(p%d, %f);\n", my_index, node->sphere.radius); break;
105 | case FREP_CYLINDER: s.stream += sprintf(s.stream, "fCylinder(p%d, %f, %f);\n", my_index, node->cylinder.radius, node->cylinder.height); break;
106 | case FREP_PLANE: s.stream += sprintf(s.stream, "p%d.x - %f;\n", my_index, node->plane.offset); break;
107 | default: assert(false && "Unexpected opcode");
108 | }
109 | }
110 | else
111 | {
112 | assert(false && "Unexpected node type");
113 | }
114 | return my_index;
115 | }
116 |
117 | }
118 |
// Compiles the tree rooted at 'node' into a null-terminated GLSL fragment.
// Returns a pointer to a static 10 MiB buffer: the caller must NOT free it,
// the contents are overwritten by the next call, and the function is not
// thread-safe. NOTE(review): the emitted length is not bounds-checked
// against the buffer size — confirm upstream that trees stay small enough.
char *frep_compile_to_glsl(frep_t *node)
{
    using namespace backend_glsl;
    static char *buffer = (char*)malloc(10*1024*1024);
    assert(buffer && "Failed to allocate buffer to contain GLSL output");
    glsl_t s;
    s.stream = buffer;
    s.destination = 1; // index 0 is reserved for the caller-provided input point p0
    _frep_compile_to_glsl(node, s);
    return buffer;
}
130 |
--------------------------------------------------------------------------------
/src/backend_ptx.h:
--------------------------------------------------------------------------------
1 | // Developed by Simen Haugo.
2 | // See LICENSE.txt for copyright and licensing details (standard MIT License).
3 |
4 | /*
5 | This is the code generation backend for NVIDIA PTX, which is not
6 | a machine code target, but a fake assembly language (stored as text)
7 | which gets compiled into native target-architecture instructions by
8 | the CUDA driver. Note that this compilation can take a long time.
9 | If you need to be able to rapidly compile and upload trees to the
10 | GPU, look at the SASS backend, where we implement our own native
11 | machine code generation.
12 | */
13 |
14 | #pragma once
15 |
#include "frep.h"
#include <stdint.h>
#include <stdio.h>
#include <string.h>
19 |
20 | /*
Generates a string containing newline-separated PTX instructions
22 | which evaluate f(x0, y0, z0) and stores the result in a register
23 | named "f%d" % result_register (e.g. "f3"). The input coordinates
24 | are assumed to be in registers named "x0", "y0", and "z0".
25 |
26 | See test/backend_ptx.cpp for an example of a complete PTX program
27 | that uses the generated output.
28 | */
29 | char *frep_compile_to_ptx(frep_t *f, int *result_register);
30 |
31 | //////////////////////////////////////////////////////////////////
32 | // Implementation
33 | //////////////////////////////////////////////////////////////////
34 |
35 | namespace backend_ptx {
36 |
37 | /*
38 | Nodes in the FRep AST have constants (such as sphere radius) that
39 | are involved in the expression for that node's membership function.
40 | When generating code to execute the membership function, constants
41 | can either be placed in Constants Memory (and must be fetched with
42 | an additional load), or be baked directly into the instructions.
43 |
44 | For example, the PTX instruction
45 | add.ftz.f32 x, x, 0f3F000000; // x <- x + 0.5
46 | uses +0.5 as an immediate value. In the generated machine code for
47 | e.g. Maxwell architectures, this instruction may look like this:
48 | 0x3858503f00070409
49 | ^^^^^
50 | immediate value (note that last 12 bits are truncated).
51 |
52 | However, not all instructions can use full 32-bit floating point
53 | immediate values. Notably, min, max and fused-multiply-add (FFMA)
54 | on Maxwell/Pascal target architectures. But all do support 20-bit
55 | floating point immediates, where the last 12 bits of the mantissa
56 | are truncated (assumed to be zero).
57 |
58 | You can choose whether you want to preserve 32-bit floating point
59 | constants at the expense of speed, or if you want to truncate the
60 | last 12 bits and use 20-bit floating point constants.
61 | */
// Encodes a 32-bit float as the bit pattern used for PTX "0f%08x" immediates.
// With PTX_FP20_IMMEDIATE the low 12 mantissa bits are truncated (assumed
// zero) so the constant fits the 20-bit immediate form supported by
// min/max/FFMA on Maxwell/Pascal (see the discussion above).
uint32_t encode_f32(float x)
{
    // FIX: type-pun via memcpy instead of *(uint32_t*)&x — the pointer cast
    // violates strict aliasing (undefined behavior); memcpy compiles to the
    // same single register move.
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);
#if defined(PTX_FP32_IMMEDIATE)
    return bits;
#elif defined(PTX_FP20_IMMEDIATE)
    // Note: PTX immediate values preserve their sign bit, unlike
    // SASS immediate values, which encode the sign bit elsewhere
    // in the instruction.
    return bits & 0xFFFFF000;
#else
    #error "You must #define either PTX_FP32_IMMEDIATE or PTX_FP20_IMMEDIATE before including this file."
#endif
}
75 |
// Code-generation state threaded through the recursive PTX emitter.
struct ptx_t
{
    int next_register;  // next unused virtual register index (names f0, f1, ...)
    char *stream;       // write cursor into the output buffer (advanced past each sprintf)
};
81 |
// Emits PTX that transforms the root-frame point (x0,y0,z0) into this node's
// local frame. Allocates three consecutive registers; after the call they are
// s.next_register-3 (x), -2 (y), -1 (z).
void emit_transform(ptx_t &s, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/)
{
    // emit transform code: p_this = R_root_to_this*(p_root - T_this_rel_root)
    int x = s.next_register++;
    int y = s.next_register++;
    int z = s.next_register++;

    // compute R_root_to_this*(-T_this_rel_root)
    float tx = -(R.at(0,0)*T[0] + R.at(0,1)*T[1] + R.at(0,2)*T[2]);
    float ty = -(R.at(1,0)*T[0] + R.at(1,1)*T[1] + R.at(1,2)*T[2]);
    float tz = -(R.at(2,0)*T[0] + R.at(2,1)*T[1] + R.at(2,2)*T[2]);

    // emit instructions for R_root_to_this*p_root + R_root_to_this*(-T_this_rel_root)
    // Each component accumulates one column of R at a time via FFMA:
    // first the x0 column (seeded with the translation), then y0, then z0.
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, x0, 0f%08x, 0f%08x;\n", x, encode_f32(R.at(0,0)), encode_f32(tx));
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, x0, 0f%08x, 0f%08x;\n", y, encode_f32(R.at(1,0)), encode_f32(ty));
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, x0, 0f%08x, 0f%08x;\n", z, encode_f32(R.at(2,0)), encode_f32(tz));
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, y0, 0f%08x, f%d;\n", x, encode_f32(R.at(0,1)), x);
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, y0, 0f%08x, f%d;\n", y, encode_f32(R.at(1,1)), y);
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, y0, 0f%08x, f%d;\n", z, encode_f32(R.at(2,1)), z);
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, z0, 0f%08x, f%d;\n", x, encode_f32(R.at(0,2)), x);
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, z0, 0f%08x, f%d;\n", y, encode_f32(R.at(1,2)), y);
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, z0, 0f%08x, f%d;\n", z, encode_f32(R.at(2,2)), z);
}
105 |
// Placeholder: the exact (corner-rounded) box distance is not implemented in
// the PTX backend yet; use FREP_BOX_CHEAP or the GLSL backend instead.
int emit_box(ptx_t &s, frep_t *node, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/)
{
    assert(false && "Box is not implemented in PTX backend yet");
    return 0;
}
111 |
// Emits PTX for the "cheap" box distance (max of per-axis distances, no
// corner rounding). Returns the register index holding the result.
int emit_box_cheap(ptx_t &s, frep_t *node, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/)
{
    // mathematical expression: Box(p, width,height,depth)
    //   (x,y,z) = R*(p - T)
    //   d = max( |x|-width, |y|-height, |z|-depth )

    // ptx template:
    //
    //   abs.ftz.f32 x, x;
    //   abs.ftz.f32 y, y;
    //   abs.ftz.f32 z, z;
    //   sub.ftz.f32 x, x, (width);
    //   sub.ftz.f32 y, y, (height);
    //   sub.ftz.f32 z, z, (depth);
    //   max.ftz.f32 d, x, y;
    //   max.ftz.f32 d, d, z;

    // emitted instructions:
    emit_transform(s, R, T); // todo: inline here and optimize for each primitive
    int x = s.next_register - 3; // the three registers just allocated by emit_transform
    int y = s.next_register - 2;
    int z = s.next_register - 1;
    int d = s.next_register++;
    s.stream += sprintf(s.stream, "abs.ftz.f32 f%d,f%d;\n", x, x);
    s.stream += sprintf(s.stream, "abs.ftz.f32 f%d,f%d;\n", y, y);
    s.stream += sprintf(s.stream, "abs.ftz.f32 f%d,f%d;\n", z, z);
    // the template's "sub" is emitted as "add" with a negated immediate
    s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", x, x, encode_f32(-node->box.width));
    s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", y, y, encode_f32(-node->box.height));
    s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", z, z, encode_f32(-node->box.depth));
    s.stream += sprintf(s.stream, "max.ftz.f32 f%d,f%d,f%d;\n", d, x, y);
    s.stream += sprintf(s.stream, "max.ftz.f32 f%d,f%d,f%d;\n", d, d, z);
    return d;
}
145 |
// Emits PTX for a sphere. The rotation part of the transform is skipped
// entirely: length(R*(p - T)) == length(p - T) for any rotation R, so only
// the translation is applied. Returns the register index holding the result.
int emit_sphere(ptx_t &s, frep_t *node, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/)
{
    // mathematical expression:
    //   d = length(p_this) - r
    //     = length(R*(p_root - T)) - r
    //     = length(p_root - T) - r

    // ptx template:
    //   add.ftz.f32 x, x0, (-tx);
    //   add.ftz.f32 y, y0, (-ty);
    //   add.ftz.f32 z, z0, (-tz);
    //   mul.ftz.f32 d, x, x;
    //   fma.rn.ftz.f32 d, y, y, d;
    //   fma.rn.ftz.f32 d, z, z, d;
    //   sqrt.approx.ftz.f32 d, d;
    //   sub.f32 d, d, (r);

    // emitted instructions:
    int x = s.next_register++;
    int y = s.next_register++;
    int z = s.next_register++;
    int d = s.next_register++;
    s.stream += sprintf(s.stream, "add.ftz.f32 f%d,x0,0f%08x;\n", x, encode_f32(-T[0])); // x <- x0 - (Tx)
    s.stream += sprintf(s.stream, "add.ftz.f32 f%d,y0,0f%08x;\n", y, encode_f32(-T[1])); // y <- y0 - (Ty)
    s.stream += sprintf(s.stream, "add.ftz.f32 f%d,z0,0f%08x;\n", z, encode_f32(-T[2])); // z <- z0 - (Tz)
    s.stream += sprintf(s.stream, "mul.ftz.f32 f%d,f%d,f%d;\n", d, x, x); // d <- x*x
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,f%d,f%d,f%d;\n", d, y, y, d); // d <- y*y + d
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,f%d,f%d,f%d;\n", d, z, z, d); // d <- z*z + d
    s.stream += sprintf(s.stream, "sqrt.approx.ftz.f32 f%d,f%d;\n", d, d); // d <- sqrt(d)
    s.stream += sprintf(s.stream, "add.f32 f%d,f%d,0f%08x;\n", d, d, encode_f32(-node->sphere.radius)); // d <- d - (r)
    return d;
}
178 |
// Emits PTX for a capped cylinder whose axis is the local y axis.
// Returns the register index holding the result.
int emit_cylinder(ptx_t &s, frep_t *node, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/)
{
    // mathematical expression: cylinder(p, 2*height, radius)
    //   (x,y,z) = R*(p - T)
    //   d = max( sqrt(x*x + z*z) - radius, abs(y) - height )

    // ptx template
    //
    //   mul.ftz.f32 d, x, x;
    //   fma.rn.ftz.f32 d, z, z, d;
    //   sqrt.approx.ftz.f32 d, d;
    //   abs.ftz.f32 y, y;
    //   add.ftz.f32 y, y, (-height);
    //   add.ftz.f32 d, d, (-radius);
    //   max.ftz.f32 d, d, y;

    // emitted instructions:
    emit_transform(s, R, T); // todo: inline here and optimize for each primitive
    int x = s.next_register - 3; // the three registers just allocated by emit_transform
    int y = s.next_register - 2;
    int z = s.next_register - 1;
    int d = s.next_register++;
    s.stream += sprintf(s.stream, "mul.ftz.f32 f%d,f%d,f%d;\n", d, x, x);
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,f%d,f%d,f%d;\n", d, z, z, d);
    s.stream += sprintf(s.stream, "sqrt.approx.ftz.f32 f%d,f%d;\n", d, d);
    s.stream += sprintf(s.stream, "abs.ftz.f32 f%d,f%d;\n", y, y);
    s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", y, y, encode_f32(-node->cylinder.height));
    s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", d, d, encode_f32(-node->cylinder.radius));
    s.stream += sprintf(s.stream, "max.ftz.f32 f%d,f%d,f%d;\n", d, d, y);
    return d;
}
210 |
211 | int emit_plane(ptx_t &s, frep_t *node, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/)
212 | {
213 | // mathematical expression:
214 | // (x,y,z) = R*(p - T)
215 | // d = x - plane.x
216 | // = R00*(x0 - Tx) + R01*(y0 - Ty) + R02*(z0 - Tz) - plane.x
217 | // = R00*x0 + R01*y0 + R02*z0 + (-plane.x - R00*Tx - R01*Ty - R02*Tz)
218 | // = R00*x0 + R01*y0 + R02*z0 + k
219 |
220 | // ptx template:
221 | // mul.ftz.f32 d, x0, (R00);
222 | // fma.rn.ftz.f32 d, y0, (R01), d;
223 | // fma.rn.ftz.f32 d, z0, (R02), d;
224 | // add.ftz.f32 d, d, (k)
225 |
226 | // emitted instructions:
227 | float k = -(R.at(0,0)*T[0] + R.at(0,1)*T[1] + R.at(0,2)*T[2] + node->plane.offset);
228 | int d = s.next_register++;
229 | s.stream += sprintf(s.stream, "mul.ftz.f32 f%d,x0,0f%08x;\n", d, encode_f32(R.at(0,0)));
230 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,y0,0f%08x,f%d;\n", d, encode_f32(R.at(0,1)), d);
231 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,z0,0f%08x,f%d;\n", d, encode_f32(R.at(0,2)), d);
232 | s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", d, d, encode_f32(k));
233 | return d;
234 | }
235 |
236 | int emit_union(ptx_t &s, int left, int right)
237 | {
238 | int d = s.next_register++;
239 | s.stream += sprintf(s.stream, "min.ftz.f32 f%d,f%d,f%d;\n", d, left, right);
240 | return d;
241 | }
242 |
243 | int emit_intersect(ptx_t &s, int left, int right)
244 | {
245 | int d = s.next_register++;
246 | s.stream += sprintf(s.stream, "max.ftz.f32 f%d,f%d,f%d;\n", d, left, right);
247 | return d;
248 | }
249 |
// Subtraction A\B: d = max(d_left, -d_right). The right operand's register
// is negated IN PLACE; this is safe here because each subtree's result
// register is consumed by exactly one boolean node in this emitter.
int emit_subtract(ptx_t &s, int left, int right)
{
    int d = s.next_register++;
    s.stream += sprintf(s.stream, "neg.ftz.f32 f%d,f%d;\n", right, right);
    s.stream += sprintf(s.stream, "max.ftz.f32 f%d,f%d,f%d;\n", d, left, right);
    return d;
}
257 |
// Linear blend of two distance fields: d = alpha*d_left + (1-alpha)*d_right.
int emit_blend(ptx_t &s, int left, int right, float blend_alpha)
{
    int d = s.next_register++;
    s.stream += sprintf(s.stream, "mul.ftz.f32 f%d,f%d,0f%08x;\n", d, left, encode_f32(blend_alpha));
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,f%d,0f%08x,f%d;\n", d, right, encode_f32(1.0f-blend_alpha), d);
    return d;
}
265 |
// Recursively emits PTX for the subtree rooted at 'node' and returns the
// index of the register holding its distance value. R_root_to_parent /
// T_parent_rel_root accumulate the rigid-body transform down to this node's
// parent. Note: the switches intentionally have no default — any unhandled
// opcode falls through to the assert at the bottom.
int _frep_compile_to_ptx(
    frep_t *node,
    ptx_t &state,
    frep_mat3_t R_root_to_parent=frep_identity_3x3,
    frep_vec3_t T_parent_rel_root=frep_null_3x1)
{
    assert(node);

    frep_mat3_t R_root_to_this;
    frep_vec3_t T_this_rel_root;
    frep_get_global_transform(node, &R_root_to_this, &T_this_rel_root, R_root_to_parent, T_parent_rel_root);

    int result = -1;
    if (frep_is_boolean(node))
    {
        assert(node->left);
        assert(node->right);
        // Children first: their result registers must exist before combining.
        int left = _frep_compile_to_ptx(node->left, state, R_root_to_this, T_this_rel_root);
        int right = _frep_compile_to_ptx(node->right, state, R_root_to_this, T_this_rel_root);
        switch (node->opcode)
        {
            case FREP_UNION: return emit_union(state, left, right);
            case FREP_INTERSECT: return emit_intersect(state, left, right);
            case FREP_SUBTRACT: return emit_subtract(state, left, right);
            case FREP_BLEND: return emit_blend(state, left, right, node->blend.alpha);
        }
    }
    else if (frep_is_primitive(node))
    {
        switch (node->opcode)
        {
            case FREP_BOX: return emit_box(state, node, R_root_to_this, T_this_rel_root);
            case FREP_BOX_CHEAP: return emit_box_cheap(state, node, R_root_to_this, T_this_rel_root);
            case FREP_SPHERE: return emit_sphere(state, node, R_root_to_this, T_this_rel_root);
            case FREP_CYLINDER: return emit_cylinder(state, node, R_root_to_this, T_this_rel_root);
            case FREP_PLANE: return emit_plane(state, node, R_root_to_this, T_this_rel_root);
        }
    }

    assert(false && "Unexpected node opcode");
    return -1;
}
308 |
309 | }
310 |
// Compiles the tree to newline-separated PTX text; the register index
// holding the final value is stored in *result_register.
// Returns a pointer to a static 10 MiB buffer: the caller must NOT free it,
// the contents are overwritten by the next call, and the function is not
// thread-safe. NOTE(review): the emitted length is not bounds-checked
// against the buffer size — confirm upstream that trees stay small enough.
char *frep_compile_to_ptx(frep_t *node, int *result_register)
{
    using namespace backend_ptx;
    static char *buffer = (char*)malloc(10*1024*1024);
    assert(buffer && "Failed to allocate buffer to contain PTX output");
    ptx_t s;
    s.stream = buffer;
    s.next_register = 0;
    *result_register = _frep_compile_to_ptx(node, s);
    return buffer;
}
322 |
--------------------------------------------------------------------------------
/src/backend_sass.h:
--------------------------------------------------------------------------------
1 | #if defined(COMPUTE_CAPABILITY_3_X)
2 | // Kepler
3 | #error "Target devices of compute capability 3.x are not supported by the SASS backend."
4 |
5 | #elif defined(COMPUTE_CAPABILITY_5_X) || defined(COMPUTE_CAPABILITY_6_X)
6 | // Maxwell, Pascal (e.g. GTX 1080, Titan X)
7 | #include "sass_6_x/backend.h"
8 |
9 | #elif defined(COMPUTE_CAPABILITY_7_X)
10 | // Volta, Turing (e.g. RTX Titan, 2080)
11 | #error "Target devices of compute capability 7.x are not supported by the SASS backend."
12 |
13 | #else
14 | #error "Missing #define. Specify the compute capability target for the SASS backend."
15 | #endif
16 |
--------------------------------------------------------------------------------
/src/frep.h:
--------------------------------------------------------------------------------
1 | #pragma once
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
7 |
8 | typedef int frep_opcode_t;
9 | enum frep_opcode_ {
10 | FREP_INVALID = 0,
11 |
12 | FREP_BOX,
13 | FREP_BOX_CHEAP,
14 | FREP_SPHERE,
15 | FREP_CYLINDER,
16 | FREP_PLANE,
17 | FREP_UNION,
18 | FREP_INTERSECT,
19 | FREP_SUBTRACT,
20 | FREP_BLEND,
21 | };
22 |
23 | struct frep_box_t { float width, height, depth; };
24 | struct frep_sphere_t { float radius; };
25 | struct frep_cylinder_t { float radius, height; };
26 | struct frep_plane_t { float sign, offset; };
27 | struct frep_blend_t { float alpha; };
28 |
29 | /*
30 | Each frep node has a rigid-body transform associated with it.
31 | It can be the identity. If so, it gets optimized out in the
32 | backend. The transformation parameters relate the point argument
33 | of the child node to its parent node by:
34 |
35 | p^parent = Rx(rx)*Ry(ry)*Rz(rz)*p^child + (tx,ty,tz)
36 |
37 | */
struct frep_t {
    frep_opcode_t opcode;    // FREP_* tag; selects which union member below is valid
    frep_t *left;            // child operands; used by boolean/blend operators
    frep_t *right;
    float rx,ry,rz,tx,ty,tz; // rigid-body transform relative to the parent (see comment above)
    union {                  // per-node parameters, selected by opcode
        frep_box_t box;
        frep_sphere_t sphere;
        frep_cylinder_t cylinder;
        frep_plane_t plane;
        frep_blend_t blend;
    };
};
51 |
/*
    Node creation and deletion utilities
*/
// Allocates an UNINITIALIZED node; the caller must fill every field.
// Prefer frep_calloc for zero-initialized nodes. NOTE(review): allocation
// failure is not checked — callers dereference the result immediately.
frep_t *frep_malloc() {
    frep_t *f = (frep_t*)malloc(sizeof(frep_t));
    return f;
}
// Allocates a zero-initialized node (opcode FREP_INVALID, NULL children,
// all transform parameters zero).
frep_t *frep_calloc() {
    frep_t *f = (frep_t*)calloc(1, sizeof(frep_t));
    return f;
}
// Recursively frees an entire tree; safe to call with NULL.
void frep_free(frep_t *f) {
    if (!f) return;
    frep_free(f->left);
    frep_free(f->right);
    free(f);
}
// Deep-copies a tree (NULL in, NULL out). The caller owns the copy and
// releases it with frep_free.
frep_t *frep_copy(frep_t *f) {
    if (!f) return NULL;
    frep_t *f1 = frep_malloc();
    *f1 = *f;
    f1->left = frep_copy(f->left);
    f1->right = frep_copy(f->right);
    return f1;
}
77 |
78 | /*
79 | Other utilities
80 | */
81 | bool frep_is_primitive(frep_t *f) {
82 | return f->opcode == FREP_BOX ||
83 | f->opcode == FREP_BOX_CHEAP ||
84 | f->opcode == FREP_SPHERE ||
85 | f->opcode == FREP_CYLINDER ||
86 | f->opcode == FREP_PLANE;
87 | }
88 | bool frep_is_boolean(frep_t *f) {
89 | return f->opcode == FREP_UNION ||
90 | f->opcode == FREP_INTERSECT ||
91 | f->opcode == FREP_SUBTRACT;
92 | }
93 | int frep_get_num_nodes(frep_t *f) {
94 | if (!f) return 0;
95 | return 1 + frep_get_num_nodes(f->left) + frep_get_num_nodes(f->right);
96 | }
97 |
98 | int frep_get_depth(frep_t *f) {
99 | if (!f) return 0;
100 | int l = frep_get_depth(f->left);
101 | int r = frep_get_depth(f->right);
102 | int max_lr = (l > r ? l : r);
103 | return 1 + max_lr;
104 | }
105 | frep_t *frep_find_node(frep_t *a, int find_i, frep_t **out_parent, int *out_depth, frep_t *parent=NULL, int depth=0)
106 | {
107 | assert(a);
108 | assert(find_i >= 0);
109 |
110 | static int i = 0;
111 | if (!parent) i = 0;
112 | else i++;
113 |
114 | if (i == find_i)
115 | {
116 | *out_depth = depth;
117 | *out_parent = parent;
118 | return a;
119 | }
120 | else if (frep_is_boolean(a))
121 | {
122 | frep_t *left = frep_find_node(a->left, find_i, out_parent, out_depth, a, depth+1);
123 | if (left) return left;
124 | frep_t *right = frep_find_node(a->right, find_i, out_parent, out_depth, a, depth+1);
125 | if (right) return right;
126 | }
127 | return NULL;
128 | }
129 |
130 | /*
131 | Utility routines for computing rigid-body transform from root node to a specific child.
132 | */
133 | struct frep_mat3_t { float d[3*3]; float &at(int row, int col) { return d[col + row*3]; } };
134 | struct frep_vec3_t { float d[3]; float &operator[](int i) { return d[i]; } };
135 | static frep_mat3_t frep_identity_3x3 = { 1,0,0, 0,1,0, 0,0,1 };
136 | static frep_vec3_t frep_null_3x1 = { 0,0,0 };
137 |
138 | // d = a*b
139 | frep_mat3_t frep_mat_mul(frep_mat3_t a, frep_mat3_t b) {
140 | frep_mat3_t d = {0};
141 | for (int row = 0; row < 3; row++)
142 | for (int col = 0; col < 3; col++)
143 | {
144 | d.at(row,col) = 0.0f;
145 | for (int i = 0; i < 3; i++)
146 | d.at(row,col) += a.at(row,i)*b.at(i,col);
147 | }
148 | return d;
149 | }
150 |
151 | // d = transpose(a) * b
152 | frep_vec3_t frep_mat_mul_transpose(frep_mat3_t a, frep_vec3_t b) {
153 | frep_vec3_t d = {0};
154 | for (int row = 0; row < 3; row++)
155 | {
156 | d[row] = 0.0f;
157 | for (int i = 0; i < 3; i++)
158 | d[row] += a.at(i,row)*b[i];
159 | }
160 | return d;
161 | }
162 |
163 | frep_vec3_t frep_mat_add(frep_vec3_t a, frep_vec3_t b) {
164 | frep_vec3_t d = { a[0]+b[0], a[1]+b[1], a[2]+b[2] };
165 | return d;
166 | }
167 |
// Accumulates the rigid-body transform from the root frame to 'node', given
// the parent's accumulated transform. Outputs:
//   *R_root_to_this  — rotation taking root-frame vectors into this frame
//   *T_this_rel_root — this node's origin expressed in the root frame
// Convention (see frep_t): p^parent = Rx(rx)*Ry(ry)*Rz(rz)*p^child + (tx,ty,tz).
void frep_get_global_transform(frep_t *node,
    frep_mat3_t *R_root_to_this,
    frep_vec3_t *T_this_rel_root,
    frep_mat3_t R_root_to_parent,
    frep_vec3_t T_parent_rel_root) {
    // Angles are negated because we build the inverse (parent->this) rotation.
    float cx = cosf(-node->rx); float sx = sinf(-node->rx);
    float cy = cosf(-node->ry); float sy = sinf(-node->ry);
    float cz = cosf(-node->rz); float sz = sinf(-node->rz);

    // R_this_to_parent = Rx(rx)*Ry(ry)*Rz(rz)
    // -> R_parent_to_this = Rz(-rz)*Ry(-ry)*Rx(-rx)
    frep_mat3_t R_parent_to_this =
    {
        cy*cz, cz*sx*sy - cx*sz, sx*sz + cx*cz*sy,
        cy*sz, cx*cz + sx*sy*sz, cx*sy*sz - cz*sx,
        -sy, cy*sx, cx*cy
    };
    frep_vec3_t T_this_rel_parent = { node->tx, node->ty, node->tz };

    // Compose with the parent's accumulated transform:
    //   R_root_to_this = R_parent_to_this * R_root_to_parent
    *R_root_to_this = frep_mat_mul(R_parent_to_this,R_root_to_parent);
    // transpose(R_root_to_parent) = R_parent_to_root rotates the
    // parent-relative offset into the root frame before adding.
    *T_this_rel_root = frep_mat_add(T_parent_rel_root, frep_mat_mul_transpose(R_root_to_parent, T_this_rel_parent));
}
190 |
191 |
--------------------------------------------------------------------------------
/src/frep_builder.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "frep.h"
3 |
/*
    FRep primitives
    Each constructor allocates a zero-initialized node (identity transform,
    no children) and fills in the opcode plus shape parameters. The caller
    owns the returned tree and releases it with frep_free.
*/
// Exact box; width/height/depth are half-extents along x/y/z
// (frep_eval computes fabsf(coord) - extent).
frep_t *fBox(float width, float height, float depth) {
    frep_t *f = frep_calloc();
    f->opcode = FREP_BOX;
    f->box.width = width;
    f->box.height = height;
    f->box.depth = depth;
    return f;
}
// Cheap box: max of per-axis distances, no corner rounding.
frep_t *fBoxCheap(float width, float height, float depth) {
    frep_t *f = frep_calloc();
    f->opcode = FREP_BOX_CHEAP;
    f->box.width = width;
    f->box.height = height;
    f->box.depth = depth;
    return f;
}
// Sphere of the given radius centered at the local origin.
frep_t *fSphere(float radius) {
    frep_t *f = frep_calloc();
    f->opcode = FREP_SPHERE;
    f->sphere.radius = radius;
    return f;
}
// Cylinder with axis along local y; 'height' is the half-height
// (frep_eval computes fabsf(y) - height).
frep_t *fCylinder(float radius, float height) {
    frep_t *f = frep_calloc();
    f->opcode = FREP_CYLINDER;
    f->cylinder.radius = radius;
    f->cylinder.height = height;
    return f;
}
// Half-space: d = sign*x - offset (see frep_eval).
frep_t *fPlane(float sign, float offset) {
    frep_t *f = frep_calloc();
    f->opcode = FREP_PLANE;
    f->plane.sign = sign;
    f->plane.offset = offset;
    return f;
}
43 |
/*
    Function operators
    Boolean combinations of two subtrees. The returned node takes ownership
    of both children (frep_free releases them recursively).
*/
// Union: d = min(d_left, d_right).
frep_t *fOpUnion(frep_t *left, frep_t *right) {
    frep_t *f = frep_calloc();
    f->opcode = FREP_UNION;
    f->left = left;
    f->right = right;
    return f;
}
// Subtraction left\right: d = max(d_left, -d_right).
frep_t *fOpSubtract(frep_t *left, frep_t *right) {
    frep_t *f = frep_calloc();
    f->opcode = FREP_SUBTRACT;
    f->left = left;
    f->right = right;
    return f;
}
// Intersection: d = max(d_left, d_right).
frep_t *fOpIntersect(frep_t *left, frep_t *right) {
    frep_t *f = frep_calloc();
    f->opcode = FREP_INTERSECT;
    f->left = left;
    f->right = right;
    return f;
}
68 |
/*
    Spatial operators
    These mutate the node in place and return it so calls can be chained.
    The parameters define the child->parent transform (see frep_t):
        p^parent = Rx(rx)*Ry(ry)*Rz(rz)*p^child + (tx,ty,tz)
*/
// Sets the node's rotation (radians; cosf/sinf are applied in frep.h).
frep_t *pOpRotate(frep_t *f, float rx, float ry, float rz) {
    f->rx = rx;
    f->ry = ry;
    f->rz = rz;
    return f;
}
// Sets the node's translation relative to its parent frame.
frep_t *pOpTranslate(frep_t *f, float tx, float ty, float tz) {
    f->tx = tx;
    f->ty = ty;
    f->tz = tz;
    return f;
}
84 |
--------------------------------------------------------------------------------
/src/frep_eval.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "frep.h"
#include <assert.h>
#include <math.h>
5 |
// Evaluates the signed distance (or a distance bound, for the "cheap"
// primitives) of the frep tree rooted at f at the point (x, y, z).
// Recurses into left/right children for the boolean operators.
float frep_eval(frep_t *f, float x, float y, float z)
{
    assert(f);

    // Map the query point into the node's local frame: undo the node's
    // translation, then undo its rotation by rotating through the
    // negated angles, applied about x, then y, then z.
    x -= f->tx;
    y -= f->ty;
    z -= f->tz;

    if (f->rx != 0.0f)
    {
        // Inverse rotation about x (y and z change, x is fixed)
        float cx = cosf(-f->rx);
        float sx = sinf(-f->rx);
        float zz = cx*z + sx*y;
        y = cx*y - sx*z;
        z = zz;
    }
    if (f->ry != 0.0f)
    {
        // Inverse rotation about y (x and z change, y is fixed)
        float cy = cosf(-f->ry);
        float sy = sinf(-f->ry);
        float xx = cy*x + sy*z;
        z = cy*z - sy*x;
        x = xx;
    }
    if (f->rz != 0.0f)
    {
        // Inverse rotation about z (x and y change, z is fixed)
        float cz = cosf(-f->rz);
        float sz = sinf(-f->rz);
        float xx = cz*x - sz*y;
        y = cz*y + sz*x;
        x = xx;
    }

    switch (f->opcode)
    {
        case FREP_BOX:
        {
            // Exact box distance: outside contribution is the length of
            // the componentwise positive excess; inside contribution b is
            // the largest (closest to zero) of the negative components.
            float dx = fabsf(x) - f->box.width;
            float dy = fabsf(y) - f->box.height;
            float dz = fabsf(z) - f->box.depth;
            float dbx = (dx < 0.0f) ? dx : 0.0f; float b = dbx;
            float dby = (dy < 0.0f) ? dy : 0.0f; if (dby > b) b = dby;
            float dbz = (dz < 0.0f) ? dz : 0.0f; if (dbz > b) b = dbz;
            if (dx < 0.0f) dx = 0.0f;
            if (dy < 0.0f) dy = 0.0f;
            if (dz < 0.0f) dz = 0.0f;
            return sqrtf(dx*dx + dy*dy + dz*dz) + b;
        }
        case FREP_BOX_CHEAP:
        {
            // Cheap box bound: max of the per-axis distances. Not the
            // exact Euclidean distance outside edges/corners, but cheaper
            // to evaluate.
            float dx = fabsf(x) - f->box.width;
            float dy = fabsf(y) - f->box.height;
            float dz = fabsf(z) - f->box.depth;
            float d = dx;
            if (dy > d) d = dy;
            if (dz > d) d = dz;
            return d;
        }
        case FREP_SPHERE:
        {
            return sqrtf(x*x + y*y + z*z) - f->sphere.radius;
        }
        case FREP_CYLINDER:
        {
            // Cylinder with axis along local y: radial distance vs.
            // distance to the end caps, combined with max.
            float a = sqrtf(x*x + z*z) - f->cylinder.radius;
            float b = fabsf(y) - f->cylinder.height;
            return a > b ? a : b;
        }
        case FREP_PLANE:
        {
            // Half-space: signed distance along local x.
            return f->plane.sign*x - f->plane.offset;
        }
        case FREP_UNION:
        {
            // Union = min of child distances.
            float f1 = frep_eval(f->left, x, y, z);
            float f2 = frep_eval(f->right, x, y, z);
            return f1 < f2 ? f1 : f2;
        }
        case FREP_INTERSECT:
        {
            // Intersection = max of child distances.
            float f1 = frep_eval(f->left, x, y, z);
            float f2 = frep_eval(f->right, x, y, z);
            return f1 > f2 ? f1 : f2;
        }
        case FREP_SUBTRACT:
        {
            // Subtraction = max(left, -right).
            float f1 = frep_eval(f->left, x, y, z);
            float f2 = -frep_eval(f->right, x, y, z);
            return f1 > f2 ? f1 : f2;
        }
        #if 0
        case FREP_BLEND:
        {
            // Linear blend of the child distances (disabled).
            float f1 = frep_eval(f->left, x, y, z);
            float f2 = frep_eval(f->right, x, y, z);
            return f->blend.alpha*f1 + (1.0f - f->blend.alpha)*f2;
        }
        #endif
        default:
        {
            assert(false && "invalid node type");
        }
    }
    // Unreachable when the opcode is valid; keeps non-void return happy.
    return 0.0f;
}
111 |
--------------------------------------------------------------------------------
/src/frep_serialize.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "ast.h"
#include <stdio.h>
#include <assert.h>
4 |
5 | #ifdef _MSC_VER
6 | // Note: MSVC version returns -1 on overflow, but glibc returns total count (which may be >= buf_size)
7 | #define snprintf _snprintf
8 | #endif
9 |
10 | static char *ast__to_string(ast_t *a, char *stream, size_t sizeof_buffer)
11 | {
12 | if (!a) return stream;
13 | if (a->type == AST_BOX) stream += snprintf(stream, sizeof_buffer, "b[%g,%g,%g]", a->box.w, a->box.h, a->box.d);
14 | else if (a->type == AST_SPHERE) stream += snprintf(stream, sizeof_buffer, "s[%g]", a->sphere.r);
15 | else if (a->type == AST_CYLINDER) stream += snprintf(stream, sizeof_buffer, "c[%g,%g]", a->cylinder.r, a->cylinder.h);
16 | else if (a->type == AST_PLANE) stream += snprintf(stream, sizeof_buffer, "p[%g]", a->plane.x);
17 | else if (a->type == AST_UNION) stream += snprintf(stream, sizeof_buffer, "U");
18 | else if (a->type == AST_INTERSECT) stream += snprintf(stream, sizeof_buffer, "I");
19 | else if (a->type == AST_SUBTRACT) stream += snprintf(stream, sizeof_buffer, "S");
20 | else if (a->type == AST_BLEND) stream += snprintf(stream, sizeof_buffer, "B[%g]", a->blend.alpha);
21 | stream += snprintf(stream, sizeof_buffer, "[%g,%g,%g]", a->rx, a->ry, a->rz);
22 | stream += snprintf(stream, sizeof_buffer, "[%g,%g,%g]", a->tx, a->ty, a->tz);
23 | stream = ast__to_string(a->left, stream, sizeof_buffer);
24 | stream = ast__to_string(a->right, stream, sizeof_buffer);
25 | return stream;
26 | }
27 |
// Parses one node (and, recursively, its children) from the serialized
// form produced by ast__to_string, advancing *inout_stream past the
// consumed text. Malformed input is handled by assert, not by error
// returns — inputs are expected to come from ast_to_string.
static ast_t *ast__from_string(char **inout_stream)
{
    char *stream = *inout_stream;
    if (!stream) return NULL;
    if (*stream == '\0') return NULL; // end of stream: no more children

    ast_t *a = ast_new();

    // Advance the cursor just past the next '[' so sscanf can read the
    // comma-separated floats that follow it.
    #define next_bracket() { while (*stream && *stream != '[') stream++; assert(*stream); stream++; assert(*stream); }
    if (*stream == 'b') { a->type = AST_BOX; next_bracket(); assert(3 == sscanf(stream, "%f,%f,%f", &a->box.w, &a->box.h, &a->box.d)); next_bracket(); }
    else if (*stream == 's') { a->type = AST_SPHERE; next_bracket(); assert(1 == sscanf(stream, "%f", &a->sphere.r )); next_bracket(); }
    else if (*stream == 'c') { a->type = AST_CYLINDER; next_bracket(); assert(2 == sscanf(stream, "%f,%f", &a->cylinder.r, &a->cylinder.h )); next_bracket(); }
    else if (*stream == 'p') { a->type = AST_PLANE; next_bracket(); assert(1 == sscanf(stream, "%f", &a->plane.x )); next_bracket(); }
    else if (*stream == 'U') { a->type = AST_UNION; next_bracket(); }
    else if (*stream == 'I') { a->type = AST_INTERSECT; next_bracket(); }
    else if (*stream == 'S') { a->type = AST_SUBTRACT; next_bracket(); }
    else if (*stream == 'B') { a->type = AST_BLEND; next_bracket(); assert(1 == sscanf(stream, "%f", &a->blend.alpha )); next_bracket(); }
    else assert(false && "invalid node type");
    // Rotation block [rx,ry,rz], then translation block [tx,ty,tz].
    assert(3 == sscanf(stream, "%f,%f,%f", &a->rx, &a->ry, &a->rz));
    next_bracket();
    assert(3 == sscanf(stream, "%f,%f,%f", &a->tx, &a->ty, &a->tz));
    // Skip past the closing ']' of the translation block.
    while (*stream && *stream != ']') stream++;
    assert(*stream);
    stream++;
    #undef next_bracket

    // Children are serialized immediately after the node (left first);
    // a node with no serialized children gets NULL for both.
    a->left = ast__from_string(&stream);
    a->right = ast__from_string(&stream);
    *inout_stream = stream;
    return a;
}
59 |
// Serializes the whole tree and returns a pointer to a static,
// process-wide buffer (1 MB). NOTE: not reentrant or thread-safe —
// each call overwrites the previous result; do not free the pointer.
char *ast_to_string(ast_t *a)
{
    static char buffer[1024*1024];
    ast__to_string(a, buffer, sizeof(buffer));
    return buffer;
}
66 |
// Parses a tree from the format produced by ast_to_string. Takes a
// local copy of the stream pointer so the caller's pointer is not
// advanced. Returns NULL for an empty/NULL stream.
ast_t *ast_from_string(char *stream)
{
    return ast__from_string(&stream);
}
71 |
72 | #ifdef _MSC_VER
73 | #undef snprintf
74 | #endif
75 |
--------------------------------------------------------------------------------
/src/sass_6_x/backend.h:
--------------------------------------------------------------------------------
1 | // Developed by Simen Haugo.
2 | // See LICENSE.txt for copyright and licensing details (standard MIT License).
3 | //
4 | // This file contains the machine code generation backend for NVIDIA SASS (Shader
5 | // Assembly) ISA. Unlike the PTX backend, this directly outputs to binary code that
6 | // can be patched into a Cubin binary module and loaded immediately with the Cuda
7 | // Driver API (see NVRTC example in SDK). This avoids the slow PTX compiler provided
8 | // in CUDA.
9 | //
10 | // This backend is for devices of compute capability 6.x, such as the Maxwell and
11 | // Pascal GPU families. It does not support Volta or Turing families (which have
12 | // compute capability 7.x).
13 | //
14 | // SASS code generation consists of the following major steps
15 | //
16 | // 1. Generate instruction blocks
17 | // the input frep tree is parsed to produce independent sequences of temporary
18 | // SASS instructions (not binary). These are assigned virtual register names,
19 | // which must be assigned to physical registers in the next step.
20 | //
21 | // 2. Schedule instructions and assign physical registers
22 | //
23 | // 3. Generate SASS binary
24 | // With the physical registers assigned, we can now generate the actual binary
25 | // instructions that go into the final ELF executable.
26 | //
27 | // 4. Link SASS ELF executable (a "Cubin" module)
28 | //
29 |
30 | #pragma once
31 | #include "../frep.h"
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
36 | #include "registers.h"
37 | #include "instruction.h"
38 | #include "scheduler.h"
39 | #include "blocks.h"
40 | #include "bytecode.h"
41 |
42 | #if 0
// Sketch of the per-instruction control word (scheduling metadata that
// accompanies each SASS instruction on Maxwell/Pascal). The locals below
// are field placeholders only: nothing is packed or returned yet, and
// this whole region is compiled out by the surrounding #if 0.
uint64_t get_ctrl_segment(instruction_t i)
{
    uint8_t ra,rb,rc,rd;
    uint8_t reuse; // register reuse flags
    uint8_t yield; // can relinquish control to other warp or not
    uint8_t stall; // number of cycles to wait before continuing
    uint8_t wrtdb; // write dependencies
    uint8_t readb; // read dependencies
    uint8_t watdb; // wait dependencies
}
53 |
// Draft of the full frep -> cubin pipeline (compiled out by #if 0).
// Intent: emit the pre-baked ELF header, append the encoded SASS
// instructions, then the section-table footer, and return the buffer.
void *frep_compile_to_sass(frep_t *tree, size_t *length)
{
    // Pre-baked cubin ELF prefix captured from a compiled kernel:
    // ELF header, program header, string/symbol tables and .nv.info data
    // for a kernel named "tree".
    static const uint8_t header[] = {
        0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x33, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x01, 0x00, 0xbe, 0x00, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x3c, 0x05, 0x3c, 0x00, 0x40, 0x00, 0x38, 0x00, 0x00, 0x00, 0x40, 0x00, 0x09, 0x00, 0x01, 0x00,
        0x00, 0x2e, 0x73, 0x68, 0x73, 0x74, 0x72, 0x74, 0x61, 0x62, 0x00, 0x2e, 0x73, 0x74, 0x72, 0x74,
        0x61, 0x62, 0x00, 0x2e, 0x73, 0x79, 0x6d, 0x74, 0x61, 0x62, 0x00, 0x2e, 0x73, 0x79, 0x6d, 0x74,
        0x61, 0x62, 0x5f, 0x73, 0x68, 0x6e, 0x64, 0x78, 0x00, 0x2e, 0x6e, 0x76, 0x2e, 0x69, 0x6e, 0x66,
        0x6f, 0x00, 0x2e, 0x74, 0x65, 0x78, 0x74, 0x2e, 0x74, 0x72, 0x65, 0x65, 0x00, 0x2e, 0x6e, 0x76,
        0x2e, 0x69, 0x6e, 0x66, 0x6f, 0x2e, 0x74, 0x72, 0x65, 0x65, 0x00, 0x2e, 0x6e, 0x76, 0x2e, 0x63,
        0x61, 0x6c, 0x6c, 0x67, 0x72, 0x61, 0x70, 0x68, 0x00, 0x2e, 0x6e, 0x76, 0x2e, 0x70, 0x72, 0x6f,
        0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x00, 0x00, 0x2e, 0x73, 0x68, 0x73, 0x74, 0x72, 0x74, 0x61,
        0x62, 0x00, 0x2e, 0x73, 0x74, 0x72, 0x74, 0x61, 0x62, 0x00, 0x2e, 0x73, 0x79, 0x6d, 0x74, 0x61,
        0x62, 0x00, 0x2e, 0x73, 0x79, 0x6d, 0x74, 0x61, 0x62, 0x5f, 0x73, 0x68, 0x6e, 0x64, 0x78, 0x00,
        0x2e, 0x6e, 0x76, 0x2e, 0x69, 0x6e, 0x66, 0x6f, 0x00, 0x74, 0x72, 0x65, 0x65, 0x00, 0x2e, 0x74,
        0x65, 0x78, 0x74, 0x2e, 0x74, 0x72, 0x65, 0x65, 0x00, 0x2e, 0x6e, 0x76, 0x2e, 0x69, 0x6e, 0x66,
        0x6f, 0x2e, 0x74, 0x72, 0x65, 0x65, 0x00, 0x23, 0x66, 0x66, 0x66, 0x66, 0x00, 0x2e, 0x6e, 0x76,
        0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x67, 0x72, 0x61, 0x70, 0x68, 0x00, 0x2e, 0x6e, 0x76, 0x2e, 0x70,
        0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x03, 0x00, 0x08, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x56, 0x00, 0x00, 0x00, 0x03, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, 0x03, 0x00, 0x07, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x32, 0x00, 0x00, 0x00, 0x12, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x2f, 0x08, 0x00, 0x04, 0x00, 0x00, 0x00,
        0x07, 0x00, 0x00, 0x00, 0x04, 0x23, 0x08, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x04, 0x11, 0x08, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x30, 0x00, 0x00,
        0x01, 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
        0xfe, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0xfd, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
        0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    };

    // Pre-baked cubin ELF suffix (section header table and padding) that
    // follows the code section.
    static const uint8_t footer[] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x0b, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
        0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x3d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xbc, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
        0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x4b, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc4, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x32, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x07,
        0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    };

    using namespace backend_sass;
    instruction_blocks_t blocks = generate_sass_blocks(tree);

    int num_instructions;
    instruction_t *instructions = schedule_blocks(blocks, &num_instructions);

    // NOTE(review): 'sizeof_sass' is never defined in this draft —
    // presumably num_instructions*sizeof(uint64_t) plus control words.
    size_t sizeof_cubin = sizeof(header) + sizeof(footer) + sizeof_sass;
    uint8_t *cubin = (uint8_t*)malloc(sizeof_cubin);
    memcpy(cubin, header, sizeof(header));
    uint64_t *instruction_bin = cubin + sizeof(header);
    for (size_t i = 0; i < num_instructions; i++)
    {
        instruction_t i1 = instructions[i];
        instruction_t i2 = instructions[i];
        instruction_t i3 = instructions[i];
        instruction_t instruction = instructions[i];
        // NOTE(review): d, a, b, c and imm_b are never declared or loaded
        // from 'instruction', and the encoded words below are discarded
        // instead of being stored into instruction_bin — the encoding
        // loop is unfinished.
        switch (instruction.type)
        {
            case INSTRUCTION_FFMA: FFMA(d, a, b, c, FFMA_FTZ); break;
            case INSTRUCTION_FMUL: FMUL(d, a, b, FMUL_FTZ); break;
            case INSTRUCTION_FADD: FADD(d, a, b, FADD_FTZ); break;
            case INSTRUCTION_FFMA20I: FFMA20I(d, a, imm_b, c, FFMA_FTZ); break;
            case INSTRUCTION_FMUL20I: FMUL20I(d, a, imm_b, FMUL_FTZ); break;
            case INSTRUCTION_FADD20I: FADD20I(d, a, imm_b, FADD_FTZ); break;
            case INSTRUCTION_FADD20I_ABS_A: FADD20I(d, a, imm_b, FADD_FTZ|FADD_ABS_A); break;
            case INSTRUCTION_FMIN: FMIN(d, a, b, FMNMX_FTZ); break;
            case INSTRUCTION_FMAX: FMAX(d, a, b, FMNMX_FTZ); break;
            // NOTE(review): emit_subtract uses FMAX_NEG_B to compute
            // max(d_left, -d_right), but this encodes FMIN — looks like
            // it should be FMAX; confirm before enabling this path.
            case INSTRUCTION_FMAX_NEG_B: FMIN(d, a, b, FMNMX_FTZ|FMNMX_NEG_B); break;
            case INSTRUCTION_SQRT: MUFU_SQRT(d, a); break;
            default: assert(false && "Unknown instruction type");
        }
    }
    // NOTE(review): this copies the footer over the START of the buffer,
    // clobbering the header — presumably it was meant to go at
    // cubin + sizeof(header) + <size of emitted code>.
    memcpy(cubin, footer, sizeof(footer));

    assert(cubin);
    // NOTE(review): missing 'return cubin;' and *length is never written.
}
166 | #endif
167 |
--------------------------------------------------------------------------------
/src/sass_6_x/blocks.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | namespace backend_sass {
4 |
5 | #define CLEAR() memset(&block->instructions[block->num_instructions], 0, sizeof(instruction_t))
6 | #define TYPE(Expression) block->instructions[block->num_instructions].type = INSTRUCTION_##Expression
7 | #define RA(Expression) block->instructions[block->num_instructions].a = REGISTER_##Expression
8 | #define RB(Expression) block->instructions[block->num_instructions].b = REGISTER_##Expression
9 | #define RC(Expression) block->instructions[block->num_instructions].c = REGISTER_##Expression
10 | #define RD(Expression) block->instructions[block->num_instructions].d = REGISTER_##Expression
11 | #define STALL(Expression) block->instructions[block->num_instructions].stall = Expression;
12 | #define IMMB(Expression) block->instructions[block->num_instructions].imm_b = Expression;
13 | #define NEXT() block->num_instructions++; assert(block->num_instructions <= MAX_INSTRUCTIONS_PER_BLOCK);
14 |
15 | #if 0 // sequential transform code
16 | // (x,y,z) = R_root_to_this*((x0,y0,z0) - T_this_rel_root)
17 | // = Rz(rz)*Ry(ry)*Rx(rx)*((x0-tx, y0-ty, z0-tz))
// (x,y,z) = R_root_to_this*((x0,y0,z0) - T_this_rel_root)
//         = Rz(rz)*Ry(ry)*Rx(rx)*((x0-tx, y0-ty, z0-tz))
// Reference implementation kept under #if 0: translates, then applies
// the three Euler rotations one axis at a time, using W as scratch.
void emit_transform(instruction_block_t *block, frep_mat3_t R_root_to_this, frep_vec3_t T_this_rel_root)
{
    // Convert to final rotation into euler angles
    // (need less registers to do three sequential
    //  euler rotations, than a full 3x3 matrix multiply, I think...?)
    float rx,ry,rz;
    frep_so3_to_ypr(R_root_to_this, &rz, &ry, &rx);
    float tx = T_this_rel_root[0];
    float ty = T_this_rel_root[1];
    float tz = T_this_rel_root[2];
    float cx = cosf(rx); float sx = sinf(rx);
    float cy = cosf(ry); float sy = sinf(ry);
    float cz = cosf(rz); float sz = sinf(rz);
    // translate:
    CLEAR(); TYPE(FADD20I); RD(X); RA(X0); IMMB(-tx); NEXT(); // FADD x, x0, (-tx)
    CLEAR(); TYPE(FADD20I); RD(Y); RA(Y0); IMMB(-ty); NEXT(); // FADD y, y0, (-ty)
    CLEAR(); TYPE(FADD20I); RD(Z); RA(Z0); IMMB(-tz); NEXT(); // FADD z, z0, (-tz)
    // rotate_x: x=x, y=c*y - s*z, z=s*y + c*z
    CLEAR(); TYPE(FMUL20I); RD(W); RA(Y); IMMB(+sx); NEXT();          // FMUL w, y, (s)
    CLEAR(); TYPE(FMUL20I); RD(Y); RA(Y); IMMB(+cx); NEXT();          // FMUL y, y.reuse, (c)
    CLEAR(); TYPE(FFMA20I); RD(Y); RA(Z); IMMB(-sx); RC(Y); NEXT();   // FFMA y, z, (-s), y
    CLEAR(); TYPE(FFMA20I); RD(Z); RA(Z); IMMB(+cx); RC(W); NEXT();   // FFMA z, z.reuse, (c), w
    // rotate_y: x=c*x + s*z, y=y, z=-s*x + c*z
    CLEAR(); TYPE(FMUL20I); RD(W); RA(X); IMMB(-sy); NEXT();          // FMUL w, x, (-s)
    CLEAR(); TYPE(FMUL20I); RD(X); RA(X); IMMB(+cy); NEXT();          // FMUL x, x.reuse, (c)
    CLEAR(); TYPE(FFMA20I); RD(X); RA(Z); IMMB(+sy); RC(X); NEXT();   // FFMA x, z, (s), x
    CLEAR(); TYPE(FFMA20I); RD(Z); RA(Z); IMMB(+cy); RC(W); NEXT();   // FFMA z, z.reuse, (c), w
    // rotate_z: x=c*x - s*y, y=s*x + c*y, z=z
    CLEAR(); TYPE(FMUL20I); RD(W); RA(X); IMMB(+sz); NEXT();          // FMUL w, x, (s)
    CLEAR(); TYPE(FMUL20I); RD(X); RA(X); IMMB(+cz); NEXT();          // FMUL x, x.reuse, (c)
    CLEAR(); TYPE(FFMA20I); RD(X); RA(Y); IMMB(-sz); RC(X); NEXT();   // FFMA x, y, (-s), x
    CLEAR(); TYPE(FFMA20I); RD(Y); RA(Y); IMMB(+cz); RC(W); NEXT();   // FFMA y, y.reuse, (c), w
}
51 | #else
// Emits instructions that transform the root-frame point (x0,y0,z0)
// into this node's local frame, leaving the result in registers X,Y,Z.
void emit_transform(instruction_block_t *block, frep_mat3_t R/*_root_to_this*/, frep_vec3_t T/*_this_rel_root*/)
{
    // This path is a stall-count optimized version of the above.
    // The generated code computes the following:
    // (x,y,z) = R_root_to_this*((x0,y0,z0) - T_this_rel_root)
    //       x = R00*(x0-Tx) + R01*(y0-Ty) + R02*(z0-Tz)
    //         = R00*x0 + R01*y0 + R02*z0 + (-R00*Tx - R01*Ty - R02*Tz)
    //         = R00*x0 + R01*y0 + R02*z0 + dx
    // etc...

    // Fold the rotated translation into one constant per axis, so the
    // whole transform becomes 3 immediate loads + 9 FFMAs. The trailing
    // numbers in the comments are the STALL cycle counts.
    float dx = -(R.at(0,0)*T[0] + R.at(0,1)*T[1] + R.at(0,2)*T[2]);
    float dy = -(R.at(1,0)*T[0] + R.at(1,1)*T[1] + R.at(1,2)*T[2]);
    float dz = -(R.at(2,0)*T[0] + R.at(2,1)*T[1] + R.at(2,2)*T[2]);

    CLEAR(); TYPE(FADD20I); RD(X); RA(RZ); IMMB(dx); STALL(1); NEXT();               // 1 FADD x, RZ, dx
    CLEAR(); TYPE(FADD20I); RD(Y); RA(RZ); IMMB(dy); STALL(1); NEXT();               // 1 FADD y, RZ, dy
    CLEAR(); TYPE(FADD20I); RD(Z); RA(RZ); IMMB(dz); STALL(4); NEXT();               // 4 FADD z, RZ, dz
    CLEAR(); TYPE(FFMA20I); RD(X); RA(X0); IMMB(R.at(0,0)); RC(X); STALL(1); NEXT(); // 1 FFMA x, x0, (R00), x // Q) Why not have dx here?
    CLEAR(); TYPE(FFMA20I); RD(Y); RA(X0); IMMB(R.at(1,0)); RC(Y); STALL(1); NEXT(); // 1 FFMA y, x0, (R10), y
    CLEAR(); TYPE(FFMA20I); RD(Z); RA(X0); IMMB(R.at(2,0)); RC(Z); STALL(4); NEXT(); // 4 FFMA z, x0, (R20), z
    CLEAR(); TYPE(FFMA20I); RD(X); RA(Y0); IMMB(R.at(0,1)); RC(X); STALL(1); NEXT(); // 1 FFMA x, y0, (R01), x
    CLEAR(); TYPE(FFMA20I); RD(Y); RA(Y0); IMMB(R.at(1,1)); RC(Y); STALL(1); NEXT(); // 1 FFMA y, y0, (R11), y
    CLEAR(); TYPE(FFMA20I); RD(Z); RA(Y0); IMMB(R.at(2,1)); RC(Z); STALL(4); NEXT(); // 4 FFMA z, y0, (R21), z
    CLEAR(); TYPE(FFMA20I); RD(X); RA(Z0); IMMB(R.at(0,2)); RC(X); STALL(1); NEXT(); // 1 FFMA x, z0, (R02), x
    CLEAR(); TYPE(FFMA20I); RD(Y); RA(Z0); IMMB(R.at(1,2)); RC(Y); STALL(1); NEXT(); // 1 FFMA y, z0, (R12), y
    CLEAR(); TYPE(FFMA20I); RD(Z); RA(Z0); IMMB(R.at(2,2)); RC(Z); STALL(4); NEXT(); // 4 FFMA z, z0, (R22), z
}
79 | #endif
80 |
// cylinder: max(sqrt(x*x + z*z) - R, abs(y)-H)
// Emits the transform followed by the cylinder distance; result in D.
void emit_cylinder(instruction_block_t *block, frep_mat3_t R, frep_vec3_t T, float r, float h)
{
    emit_transform(block, R, T);
    CLEAR(); TYPE(FMUL); RD(W); RA(X); RB(X); NEXT();            // FMUL w, x, x
    CLEAR(); TYPE(FFMA); RD(W); RA(Z); RB(Z); RC(W); NEXT();     // FFMA w, z, z, w
    // NOTE(review): RB(W) is set on the SQRT although it reads one
    // operand — presumably ignored by the encoder; confirm.
    CLEAR(); TYPE(SQRT); RD(W); RA(W); RB(W); NEXT();            // SQRT w, w
    CLEAR(); TYPE(FADD20I_ABS_A); RD(Y); RA(Y); IMMB(-h); NEXT(); // FADD y, |y|, -H
    CLEAR(); TYPE(FADD20I); RD(W); RA(W); IMMB(-r); NEXT();      // FADD w, w, -R
    CLEAR(); TYPE(FMAX); RD(D); RA(W); RB(Y); NEXT();            // FMAX d, w, y
}
92 |
// sphere: sqrt(x*x + y*y + z*z) - R
// Emits the sphere distance; result in D.
void emit_sphere(instruction_block_t *block, frep_mat3_t R, frep_vec3_t T, float r)
{
#if 1
    // Fast path: applies only the translation part of the transform.
    // Skipping the rotation R is valid here because a sphere is
    // rotationally symmetric about its own center.
    CLEAR(); TYPE(FADD20I); RD(X); RA(X0); IMMB(-T[0]); STALL(1); NEXT(); // 1 FADD x, x0, (-tx)
    CLEAR(); TYPE(FADD20I); RD(Y); RA(Y0); IMMB(-T[1]); STALL(1); NEXT(); // 1 FADD y, y0, (-ty)
    CLEAR(); TYPE(FADD20I); RD(Z); RA(Z0); IMMB(-T[2]); STALL(4); NEXT(); // 4 FADD z, z0, (-tz)
    CLEAR(); TYPE(FMUL); RD(W); RA(X); RB(X); NEXT();                     // 6 FMUL w, x, x
    CLEAR(); TYPE(FFMA); RD(W); RA(Y); RB(Y); RC(W); NEXT();              // 6 FFMA w, y, y, w
    CLEAR(); TYPE(FFMA); RD(W); RA(Z); RB(Z); RC(W); NEXT();              // 6 FFMA w, z, z, w
    CLEAR(); TYPE(SQRT); RD(W); RA(W); RB(W); NEXT();                     // 8 SQRT w, w
    CLEAR(); TYPE(FADD20I); RD(D); RA(W); IMMB(-r); NEXT();               // 6 FADD d, w, -R
#else
    // General path: full transform (rotation + translation).
    emit_transform(block, R, T);
    CLEAR(); TYPE(FMUL); RD(W); RA(X); RB(X); NEXT();         // FMUL w, x, x
    CLEAR(); TYPE(FFMA); RD(W); RA(Y); RB(Y); RC(W); NEXT();  // FFMA w, y, y, w
    CLEAR(); TYPE(FFMA); RD(W); RA(Z); RB(Z); RC(W); NEXT();  // FFMA w, z, z, w
    CLEAR(); TYPE(SQRT); RD(W); RA(W); RB(W); NEXT();         // SQRT w, w
    CLEAR(); TYPE(FADD20I); RD(D); RA(W); IMMB(-r); NEXT();   // FADD d, w, -R
#endif
}
114 |
// Exact box distance (see FREP_BOX in frep_eval.h) — not implemented
// for this backend yet; use fBoxCheap / emit_box_cheap instead.
void emit_box(instruction_block_t *block, frep_mat3_t R, frep_vec3_t T, float bx, float by, float bz)
{
    assert(false && "fBox is not implemented yet");
}
119 |
// box: max(max(|x|-wx, |y|-wy), |z|-wz)
// Cheap box bound (matches FREP_BOX_CHEAP in frep_eval.h); result in D.
void emit_box_cheap(instruction_block_t *block, frep_mat3_t R, frep_vec3_t T, float bx, float by, float bz)
{
    emit_transform(block, R, T);
    CLEAR(); TYPE(FADD20I_ABS_A); RD(X); RA(X); IMMB(-bx); STALL(1); NEXT(); // 1 FADD x, |x|, -wx
    CLEAR(); TYPE(FADD20I_ABS_A); RD(Y); RA(Y); IMMB(-by); STALL(1); NEXT(); // 1 FADD y, |y|, -wy
    CLEAR(); TYPE(FADD20I_ABS_A); RD(Z); RA(Z); IMMB(-bz); STALL(5); NEXT(); // 5 FADD z, |z|, -wz
    CLEAR(); TYPE(FMAX); RD(W); RA(X); RB(Y); NEXT();                        // 6 FMAX w, x, y
    CLEAR(); TYPE(FMAX); RD(D); RA(W); RB(Z); NEXT();                        // 6 FMAX d, w, z
}
130 |
// Half-space along local x: d = x - px (after the transform); result in D.
void emit_plane(instruction_block_t *block, frep_mat3_t R, frep_vec3_t T, float px)
{
#if 0
    // optimized version: only the x row of the rotation is needed for a
    // plane, so the transform collapses to one FMUL + two FFMA + one FADD.
    float rx,ry,rz;
    frep_so3_to_ypr(R, &rz, &ry, &rx);
    float cx = cosf(rx); float sx = sinf(rx);
    float cy = cosf(ry); float sy = sinf(ry);
    float cz = cosf(rz); float sz = sinf(rz);
    float rtx = -((cy*cz)*T[0] + (cz*sx*sy - cx*sz)*T[1] + (sx*sz + cx*cz*sy)*T[2]);

    CLEAR(); TYPE(FMUL20I); RD(X); RA(X0); IMMB((cy*cz)); NEXT();                  // 6 FMUL x, x0, (cy*cz)
    CLEAR(); TYPE(FFMA20I); RD(X); RA(Y0); IMMB((cz*sx*sy-cx*sz)); RC(X); NEXT();  // 6 FFMA x, y0, (cz*sx*sy-cx*sz), x
    CLEAR(); TYPE(FFMA20I); RD(X); RA(Z0); IMMB((sx*sz + cx*cz*sy)); RC(X); NEXT(); // 6 FFMA x, z0, (sx*sz + cx*cz*sy), x
    CLEAR(); TYPE(FADD20I); RD(D); RA(X); IMMB(rtx-px); NEXT();                    // 6 FADD d, x, rtx-px
#else
    emit_transform(block, R, T);
    // plane: x - px
    CLEAR(); TYPE(FADD20I); RD(D); RA(X); IMMB(-px); NEXT(); // FADD d, x, -px
#endif
}
152 |
// Boolean combiners: each emits one instruction combining the child
// distances D_LEFT/D_RIGHT into D (registers assigned by _generate_blocks).
// union: min(l,r); intersect: max(l,r); subtract: max(l,-r) via negated b.
void emit_union(instruction_block_t *block) { CLEAR(); TYPE(FMIN); RD(D); RA(D_LEFT); RB(D_RIGHT); NEXT(); }
void emit_intersect(instruction_block_t *block) { CLEAR(); TYPE(FMAX); RD(D); RA(D_LEFT); RB(D_RIGHT); NEXT(); }
void emit_subtract(instruction_block_t *block) { CLEAR(); TYPE(FMAX_NEG_B); RD(D); RA(D_LEFT); RB(D_RIGHT); NEXT(); }
// Linear blend of the child distances; result in D.
void emit_blend(instruction_block_t *block, float alpha)
{
    // blend: alpha*d_left + (1-alpha)*d_right
    CLEAR(); TYPE(FMUL20I); RD(D); RA(D_LEFT); IMMB(alpha); NEXT();              // FMUL d, d_left, (alpha)
    CLEAR(); TYPE(FFMA20I); RD(D); RA(D_RIGHT); IMMB(1.0f-alpha); RC(D); NEXT(); // FFMA d, d_right, (1-alpha), d
}
162 |
163 | #undef TYPE
164 | #undef RA
165 | #undef RB
166 | #undef RC
167 | #undef RD
168 | #undef IMMB
169 | #undef NEXT
170 | #undef STALL
171 | #undef CLEAR
172 |
// Recursively walks the frep tree, appending one instruction block per
// node to s. 'destination' is the virtual register index that this
// node's distance value is written to; a boolean node evaluates its
// left child into 'destination' and its right child into 'destination+1',
// then combines them back into 'destination'. The accumulated rotation/
// translation are threaded down so each primitive gets its root-to-node
// transform.
void _generate_blocks(
    instruction_blocks_t *s,
    frep_t *node,
    int destination=0,
    frep_mat3_t R_root_to_parent=frep_identity_3x3,
    frep_vec3_t T_parent_rel_root=frep_null_3x1)
// You can do much smarter register allocation here. The register allocation
// may also need to change if we do smarter scheduling. E.g. block reordering.
{
    assert(node);

    frep_mat3_t R_root_to_this;
    frep_vec3_t T_this_rel_root;
    frep_get_global_transform(node, &R_root_to_this, &T_this_rel_root, R_root_to_parent, T_parent_rel_root);

    if (frep_is_boolean(node))
    {
        assert(node->left);
        assert(node->right);

        // Children first (post-order): their results land in d_left and
        // d_right, which the combiner block then merges into destination.
        int d_left = destination;
        int d_right = destination+1;
        _generate_blocks(s, node->left, d_left, R_root_to_this, T_this_rel_root);
        _generate_blocks(s, node->right, d_right, R_root_to_this, T_this_rel_root);

        instruction_block_t *b = &s->blocks[s->num_blocks++];
        b->num_instructions = 0;
        b->d_left = d_left;
        b->d_right = d_right;
        b->d = destination;
        if (node->opcode == FREP_UNION) emit_union(b);
        else if (node->opcode == FREP_INTERSECT) emit_intersect(b);
        else if (node->opcode == FREP_SUBTRACT) emit_subtract(b);
        else if (node->opcode == FREP_BLEND) emit_blend(b, node->blend.alpha);
        assert(s->num_blocks <= MAX_INSTRUCTION_BLOCKS);
    }
    else if (frep_is_primitive(node))
    {
        // Leaf: one block that evaluates the primitive into 'destination'.
        instruction_block_t *b = &s->blocks[s->num_blocks++];
        b->num_instructions = 0;
        frep_mat3_t R = R_root_to_this;
        frep_vec3_t T = T_this_rel_root;
        b->d = destination;
        if (node->opcode == FREP_BOX) emit_box(b, R, T, node->box.width, node->box.height, node->box.depth);
        else if (node->opcode == FREP_BOX_CHEAP) emit_box_cheap(b, R, T, node->box.width, node->box.height, node->box.depth);
        else if (node->opcode == FREP_SPHERE) emit_sphere(b, R, T, node->sphere.radius);
        else if (node->opcode == FREP_CYLINDER) emit_cylinder(b, R, T, node->cylinder.radius, node->cylinder.height);
        else if (node->opcode == FREP_PLANE) emit_plane(b, R, T, node->plane.offset);
        assert(s->num_blocks <= MAX_INSTRUCTION_BLOCKS);
    }
    else
    {
        assert(false && "Unexpected node type");
    }
}
228 |
instruction_blocks_t generate_blocks(frep_t *node)
// This function generates a list of instruction blocks that evaluates the
// tree and stores the resulting distance value in register[0]. Each block
// is assigned registers during the recursive tree parsing.
//
// NOTE: the returned blocks point into a function-static array, so the
// result is only valid until the next call and this is not thread-safe.
{
    assert(node);

    static instruction_block_t _blocks[MAX_INSTRUCTION_BLOCKS];
    instruction_blocks_t s = {0};
    s.blocks = _blocks;
    s.num_blocks = 0;

    _generate_blocks(&s, node);

    return s;
}
245 |
246 | }
247 |
--------------------------------------------------------------------------------
/src/sass_6_x/bytecode.h:
--------------------------------------------------------------------------------
#pragma once
#include <stdint.h> // uint8_t, uint32_t, uint64_t
#include <string.h> // memcpy (bit-exact float reinterpretation)

3 | namespace backend_sass {
4 |
//
// Instruction flags
//
// Meaning:
// FTZ   = Flush to zero
// NEG_A = Use negated value of a operand
// NEG_B = Use negated value of b operand
// ABS_A = Use absolute value of a operand
// ABS_B = Use absolute value of b operand
//
// Declared 'static const' so this header can be included from multiple
// translation units without multiple-definition/ODR errors (the original
// plain uint64_t definitions had external linkage).
static const uint64_t FADD_FTZ       = 0x0000100000000000;
static const uint64_t FADD_NEG_A     = 0x0001000000000000;
static const uint64_t FADD_NEG_B     = 0x0000200000000000;
static const uint64_t FADD_ABS_A     = 0x0000400000000000;
static const uint64_t FADD_ABS_B     = 0x0002000000000000;
static const uint64_t FADD32I_FTZ    = 0x0080000000000000;
static const uint64_t FADD32I_ABS_A  = 0x0040000000000000;
static const uint64_t FMUL_FTZ       = 0x0000100000000000;
static const uint64_t FMUL_NEG_B     = 0x0001000000000000;
static const uint64_t FMUL32I_FTZ    = 0x0020000000000000;
static const uint64_t FMNMX_FTZ      = 0x0000100000000000;
static const uint64_t FMNMX_NEG_A    = 0x0001000000000000;
static const uint64_t FMNMX_NEG_B    = 0x0000200000000000;
static const uint64_t FMNMX_ABS_A    = 0x0000400000000000;
static const uint64_t FMNMX_ABS_B    = 0x0002000000000000;
static const uint64_t FFMA_FTZ       = 0x0020000000000000;
static const uint64_t FFMA_NEG_B     = 0x0001000000000000;
static const uint64_t FFMA_NEG_C     = 0x0002000000000000;
32 |
// FADD d, a, b
// d = a+b (register + register)
// Returns the 64-bit encoded instruction word; 'flags' is an OR of the
// FADD_* modifier bits. 'static inline' so the header-defined function
// does not produce duplicate symbols across translation units.
static inline uint64_t FADD(uint8_t d, uint8_t a, uint8_t b, uint64_t flags) {
    uint64_t RD = (uint64_t)(d) << 0;  // destination register field
    uint64_t RA = (uint64_t)(a) << 8;  // first source register field
    uint64_t RB = (uint64_t)(b) << 20; // second source register field
    return 0x5c58000000070000 | flags | RB | RA | RD;
}
41 |
// FADD d, -a, -RZ
// d = -a (negation encoded as an FADD against the zero register)
uint64_t NEG(uint8_t d, uint8_t a, uint64_t flags) {
    // The base word carries RZ (0xff) in the b slot plus the NEG modifiers.
    // todo: why is NEG_B flag set? (-RZ is still zero, so result is unaffected)
    uint64_t word = 0x5c5930000ff70000;
    word |= flags | ((uint64_t)(a) << 8) | (uint64_t)(d);
    return word;
}
50 |
// FADD d, a, b immediate
// d = a+b
// Encodes b's sign bit separately and keeps only the upper 19 bits of the
// exponent/mantissa field (the low 12 mantissa bits are truncated).
uint64_t FADD20I(uint8_t d, uint8_t a, float b, uint64_t flags) {
    // memcpy the 4-byte float bit pattern. The previous *(uint64_t*)&b
    // read 8 bytes from a 4-byte object: out-of-bounds and a strict-aliasing
    // violation (undefined behavior).
    uint32_t b_bits;
    memcpy(&b_bits, &b, sizeof(b_bits));
    uint64_t NEG_B = (b_bits & 0x80000000u) ? 0x0100000000000000 : 0x0;
    uint64_t B = ((uint64_t)((b_bits & 0x7FFFF000u) >> 12)) << 20;
    uint64_t RA = (uint64_t)(a) << 8;
    uint64_t RD = (uint64_t)(d) << 0;
    return 0x3858000000070000 | flags | NEG_B | B | RA | RD;
}
62 |
// FADD32I d, a, b immediate
// d = a+b
// Full 32-bit immediate form: the entire float (minus sign, encoded separately)
// is placed in the instruction word, so no mantissa precision is lost.
uint64_t FADD32I(uint8_t d, uint8_t a, float b, uint64_t flags) {
    // memcpy the float's bits; *(uint64_t*)&b was an 8-byte read of a
    // 4-byte object (undefined behavior).
    uint32_t b_bits;
    memcpy(&b_bits, &b, sizeof(b_bits));
    uint64_t NEG_B = (b_bits & 0x80000000u) ? 0x0008000000000000 : 0x0;
    uint64_t B = ((uint64_t)(b_bits & 0x7FFFFFFFu)) << 20;
    uint64_t RA = (uint64_t)(a) << 8;
    uint64_t RD = (uint64_t)(d) << 0;
    return 0x0880000000070000 | flags | NEG_B | B | RA | RD;
}
74 |
// FTF.FTZ.F32.F32.FLOOR d, b
// d = floor(b)
uint64_t FLOOR32F(uint8_t d, uint8_t b) {
    return 0x5ca8148000070a00 | ((uint64_t)(b) << 20) | (uint64_t)(d);
}
82 |
// FMUL32I d, a, b immediate
// d = a*b
// Full 32-bit immediate form (no mantissa truncation); sign encoded separately.
uint64_t FMUL32I(uint8_t d, uint8_t a, float b, uint64_t flags) {
    // memcpy the float's bits; *(uint64_t*)&b was an 8-byte read of a
    // 4-byte object (undefined behavior).
    uint32_t b_bits;
    memcpy(&b_bits, &b, sizeof(b_bits));
    uint64_t NEG_B = (b_bits & 0x80000000u) ? 0x0008000000000000 : 0x0;
    uint64_t B = ((uint64_t)(b_bits & 0x7FFFFFFFu)) << 20;
    uint64_t RA = (uint64_t)(a) << 8;
    uint64_t RD = (uint64_t)(d) << 0;
    return 0x1e00000000070000 | flags | NEG_B | B | RA | RD;
}
94 |
// FMUL d, a, b immediate
// d = a*b
// 20-bit immediate form: sign bit encoded separately, low 12 mantissa bits truncated.
uint64_t FMUL20I(uint8_t d, uint8_t a, float b, uint64_t flags) {
    // memcpy the float's bits; *(uint64_t*)&b was an 8-byte read of a
    // 4-byte object (undefined behavior).
    uint32_t b_bits;
    memcpy(&b_bits, &b, sizeof(b_bits));
    uint64_t NEG_B = (b_bits & 0x80000000u) ? 0x0100000000000000 : 0x0;
    uint64_t B = ((uint64_t)((b_bits & 0x7FFFF000u) >> 12)) << 20;
    uint64_t RA = (uint64_t)(a) << 8;
    uint64_t RD = (uint64_t)(d) << 0;
    return 0x3868000000070000 | flags | NEG_B | B | RA | RD;
}
106 |
// FMUL d, a, b
// d = a*b (register-register form)
uint64_t FMUL(uint8_t d, uint8_t a, uint8_t b, uint64_t flags) {
    uint64_t word = 0x5c68000000070000;
    word |= flags;
    word |= (uint64_t)(b) << 20; // RB
    word |= (uint64_t)(a) << 8;  // RA
    word |= (uint64_t)(d) << 0;  // RD
    return word;
}
115 |
// FFMA d, a, b, c
// d = a*b + c (fused multiply-add, register form)
uint64_t FFMA(uint8_t d, uint8_t a, uint8_t b, uint8_t c, uint64_t flags) {
    uint64_t word = 0x5980000000070000;
    word |= flags;
    word |= (uint64_t)(c) << 39; // RC
    word |= (uint64_t)(b) << 20; // RB
    word |= (uint64_t)(a) << 8;  // RA
    word |= (uint64_t)(d) << 0;  // RD
    return word;
}
125 |
// FFMA d, a, b immediate, c
// d = a*b + c
// 20-bit immediate in the b slot (low 12 mantissa bits truncated, sign separate).
uint64_t FFMA20I(uint8_t d, uint8_t a, float b, uint8_t c, uint64_t flags) {
    // memcpy the float's bits; *(uint64_t*)&b was an 8-byte read of a
    // 4-byte object (undefined behavior).
    uint32_t b_bits;
    memcpy(&b_bits, &b, sizeof(b_bits));
    uint64_t NEG_B = (b_bits & 0x80000000u) ? 0x0100000000000000 : 0x0;
    uint64_t B = ((uint64_t)((b_bits & 0x7FFFF000u) >> 12)) << 20;
    uint64_t RC = (uint64_t)(c) << 39;
    uint64_t RA = (uint64_t)(a) << 8;
    uint64_t RD = (uint64_t)(d) << 0;
    return 0x3280000000070000 | flags | NEG_B | RC | B | RA | RD;
}
138 |
// FMNMX d, a, b, !PT
// d = max(a,b)
uint64_t FMAX(uint8_t d, uint8_t a, uint8_t b, uint64_t flags) {
    uint64_t word = 0x5c60078000070000; // !PT predicate selects max
    word |= flags;
    word |= (uint64_t)(b) << 20; // RB
    word |= (uint64_t)(a) << 8;  // RA
    word |= (uint64_t)(d) << 0;  // RD
    return word;
}
147 |
// FMNMX d, a, b, PT
// d = min(a,b)
uint64_t FMIN(uint8_t d, uint8_t a, uint8_t b, uint64_t flags) {
    uint64_t word = 0x5c60038000070000; // PT predicate selects min
    word |= flags;
    word |= (uint64_t)(b) << 20; // RB
    word |= (uint64_t)(a) << 8;  // RA
    word |= (uint64_t)(d) << 0;  // RD
    return word;
}
156 |
// FMNMX d, a, b immediate, !PT
// d = max(a,b)   (the original comment incorrectly said min; !PT selects max,
// consistent with FMAX above and the notes at the bottom of this file)
uint64_t FMAX20I(uint8_t d, uint8_t a, float b, uint64_t flags) {
    // memcpy the float's bits; *(uint64_t*)&b was an 8-byte read of a
    // 4-byte object (undefined behavior).
    uint32_t b_bits;
    memcpy(&b_bits, &b, sizeof(b_bits));
    uint64_t NEG_B = (b_bits & 0x80000000u) ? 0x0100000000000000 : 0x0;
    uint64_t B = ((uint64_t)((b_bits & 0x7FFFF000u) >> 12)) << 20;
    uint64_t RA = (uint64_t)(a) << 8;
    uint64_t RD = (uint64_t)(d) << 0;
    return 0x3860078000070000 | NEG_B | flags | B | RA | RD;
}
168 |
// FMNMX d, a, b immediate, PT
// d = min(a,b)
uint64_t FMIN20I(uint8_t d, uint8_t a, float b, uint64_t flags) {
    // memcpy the float's bits; *(uint64_t*)&b was an 8-byte read of a
    // 4-byte object (undefined behavior).
    uint32_t b_bits;
    memcpy(&b_bits, &b, sizeof(b_bits));
    uint64_t NEG_B = (b_bits & 0x80000000u) ? 0x0100000000000000 : 0x0;
    uint64_t B = ((uint64_t)((b_bits & 0x7FFFF000u) >> 12)) << 20;
    uint64_t RA = (uint64_t)(a) << 8;
    uint64_t RD = (uint64_t)(d) << 0;
    return 0x3860038000070000 | NEG_B | flags | B | RA | RD;
}
180 |
// MUFU.SQRT d, a
// d = sqrt(a) (multi-function unit; variable latency)
uint64_t MUFU_SQRT(uint8_t d, uint8_t a) {
    return 0x5080000000870000 | ((uint64_t)(a) << 8) | (uint64_t)(d);
}
188 |
// NOP: must be issued along with --:-:-:Y:0 control codes (yield set, no stall).
uint64_t NOP() { return 0x50b0000000070f00; }
// RET: must be issued along with --:-:-:-:f control codes (maximum stall).
uint64_t RET() { return 0xe32000000007000f; }
193 |
struct control_flags_t
// Scheduling control fields for one of the three instructions that share a
// 64-bit control word (packed by CTRL() as watdb:readb:wrtdb:yield:stall [reuse]).
{
    uint8_t reuse; // register reuse cache flags, one bit per operand slot (4 bits)
    uint8_t yield; // 0 = may yield to another warp, 1 = don't (see yield() below)
    uint8_t stall; // cycles to stall after issue (4 bits)
    uint8_t wrtdb; // write dependency barrier index, 7 = none
    uint8_t readb; // read dependency barrier index, 7 = none
    uint8_t watdb; // bitmask of barriers to wait on (6 bits)
};

// Control state for the three instructions of the current control word,
// mutated by the setters below and packed by CTRL().
static control_flags_t ctrl[3];
205 |
// watdb:readb:wrtdb:yield:stall [reuse]
// The setters below edit the control fields of instruction `op` (0..2).
// Read and write barriers are numbered 1...6 by callers and stored zero-based.
void wait_on_barrier(uint8_t op, uint8_t barrier_number) {
    // watdb is a bitmask, so an instruction can wait on several barriers.
    ctrl[op].watdb |= (1 << (barrier_number-1));
}
void set_write_barrier(uint8_t op, uint8_t barrier_number) {
    ctrl[op].wrtdb = barrier_number-1;
}
void set_read_barrier(uint8_t op, uint8_t barrier_number) {
    ctrl[op].readb = barrier_number-1;
}
void yield(uint8_t op) { // enables yield on instruction number op
    ctrl[op].yield = 0; // zero means enable
}
void stall(uint8_t op, uint8_t count) {
    ctrl[op].stall = count;
}
void reuse(uint8_t op, bool ra, bool rb, bool rc, bool rd) {
    // One reuse-cache flag per operand slot; LSB corresponds to the a slot.
    ctrl[op].reuse = 0;
    if (ra) ctrl[op].reuse |= 0x1;
    if (rb) ctrl[op].reuse |= 0x2;
    if (rc) ctrl[op].reuse |= 0x4;
    if (rd) ctrl[op].reuse |= 0x8;
}
230 | void reset_ctrl() {
231 | for (int op = 0; op < 3; op++)
232 | {
233 | ctrl[op].watdb = 0x00;
234 | ctrl[op].readb = 7;
235 | ctrl[op].wrtdb = 7;
236 | ctrl[op].yield = 1;
237 | ctrl[op].stall = 0;
238 | }
239 | }
240 | uint64_t CTRL() {
241 | uint64_t ret = 0;
242 | for (int op = 0; op < 3; op++) {
243 | uint64_t stall = (((uint64_t)ctrl[op].stall) & 0x0f) << 0;
244 | uint64_t yield = (((uint64_t)ctrl[op].yield) & 0x01) << 4;
245 | uint64_t wrtdb = (((uint64_t)ctrl[op].wrtdb) & 0x07) << 5;
246 | uint64_t readb = (((uint64_t)ctrl[op].readb) & 0x07) << 8;
247 | uint64_t watdb = (((uint64_t)ctrl[op].watdb) & 0x3f) << 11;
248 | uint64_t reuse = (((uint64_t)ctrl[op].reuse) & 0x0f) << 17;
249 | uint64_t ctrl = reuse|watdb|readb|wrtdb|yield|stall;
250 | ret |= ctrl << (op*21);
251 | }
252 | return ret;
253 | }
254 |
// Decodes and prints one 17-bit control segment in the conventional
// watdb:readb:wrtdb:yield:stall notation (e.g. "--:-:1:Y:6").
// Barrier indices print 1-based; '-' means "no dependency".
void print_ctrl_segment(uint64_t x) {
    uint8_t stall = (uint8_t)((x & 0x0000f) >> 0);
    uint8_t yield = (uint8_t)((x & 0x00010) >> 4);
    uint8_t wrtdb = (uint8_t)((x & 0x000e0) >> 5); // 7 = no dependency
    uint8_t readb = (uint8_t)((x & 0x00700) >> 8); // 7 = no dependency
    uint8_t watdb = (uint8_t)((x & 0x1f800) >> 11);
    if (watdb) printf("%02x:", watdb); else printf("--:");
    if (readb==7) printf("-:"); else printf("%d:", readb+1);
    if (wrtdb==7) printf("-:"); else printf("%d:", wrtdb+1);
    if (yield) printf("-:"); else printf("Y:");
    printf("%x", stall);
}
267 |
268 | void print_ctrl(uint64_t x) {
269 | uint64_t ctrl1 = (x & 0x000000000001ffff) >> 0;
270 | uint64_t ctrl2 = (x & 0x0000003fffe00000) >> 21;
271 | uint64_t ctrl3 = (x & 0x07fffc0000000000) >> 42;
272 | uint64_t reuse1 = (x & 0x00000000001e0000) >> 17;
273 | uint64_t reuse2 = (x & 0x000003c000000000) >> 38;
274 | uint64_t reuse3 = (x & 0x7800000000000000) >> 59;
275 | print_ctrl_segment(ctrl1); printf(" | ");
276 | print_ctrl_segment(ctrl2); printf(" | ");
277 | print_ctrl_segment(ctrl3);
278 | }
279 |
280 | }
281 |
282 | /*
283 | Notes
284 |
285 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
286 | IMMEDIATE VALUES
287 |
288 | FADD20I, FMUL20I and FFMA20I are immediate versions of their respective instructions,
289 | except the rightmost 12 bits of the single-precision mantissa are masked to zero. If
290 | you need full 23-bit mantissa precision you can use FADD32I and FMUL32I, which encode
291 | the entire float. FFMA does not have a 32-bit immediate version, but it can load from
292 | constant memory.
293 |
294 | *20I appear to be treated the same (flag-wise) as their non-immediate counterparts.
295 |
296 | FMNMX d, a, b, !PT -> MAX(a,b)
297 | FMNMX d, a, b, PT -> MIN(a,b)
298 |
299 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
300 | REGISTER BANKS
301 |
302 | Maxwell has four register banks per thread. The assignment of registers to banks is easy:
303 | Bank = Register number mod 4 (e.g. R0 and R4 are bank0, R3 and R7 are bank3)
304 | On Maxwell and Pascal, instructions can only access one value from each memory bank?
305 |
306 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
307 | REGISTER REUSE
308 |
309 | Maxwell and Pascal have 4 register reuse caches and 4 source operand slots. Each of the
310 | 4 reuse flag bits correspond to one of the 8-byte slots. The LSB in reuse flags controls
311 | the cache for the first source operand slot (a?), while the MSB is for the fourth.
312 | e.g. instruction dst, op0 ("first"), op1, op2, op3 ("last")
313 | e.g. FFMA.FTZ R3, R4, R4, R0.reuse -> has reuse flag 0100
314 | e.g. FFMA.FTZ R3, R4.reuse, R4, R0 -> has reuse flag 0001
315 | */
316 |
317 |
--------------------------------------------------------------------------------
/src/sass_6_x/cubin.h:
--------------------------------------------------------------------------------
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
5 |
struct elf64_hdr_t
// ELF64 file header (Elf64_Ehdr with e_ident split into named fields).
{
    uint8_t magic[4];   // 0x7f 'E' 'L' 'F'
    uint8_t fileClass;  // 2 = 64-bit (asserted by read_cubin)
    uint8_t encoding;   // byte order of the file
    uint8_t fileVersion;
    uint8_t padding[9]; // remainder of e_ident
    uint16_t type;
    uint16_t machine;
    uint32_t version;
    uint64_t entry;
    uint64_t phOffset;  // file offset of the program header table
    uint64_t shOffset;  // file offset of the section header table
    uint32_t flags;     // processor-specific; low byte holds the SM version (see read_cubin)
    uint16_t ehSize;
    uint16_t phEntSize; // size of one program header entry
    uint16_t phNum;     // number of program header entries
    uint16_t shEntSize; // size of one section header entry
    uint16_t shNum;     // number of section header entries
    uint16_t shStrIndx; // section index of the section-name string table
};

struct elf64_prg_hdr_t
// ELF64 program (segment) header (Elf64_Phdr).
{
    uint32_t type;
    uint32_t flags;
    uint64_t offset;
    uint64_t vaddr;
    uint64_t paddr;
    uint64_t fileSize;
    uint64_t memSize;
    uint64_t align;
};

struct elf64_sec_hdr_t
// ELF64 section header (Elf64_Shdr). For kernel code sections the top byte
// of `info` holds the register count (see cubin_function_t::register_count).
{
    uint32_t name;   // offset into .shstrtab
    uint32_t type;   // 2 = symbol table, 3 = string table (used by read_cubin)
    uint64_t flags;
    uint64_t addr;
    uint64_t offset; // file offset of the section's data
    uint64_t size;   // section size in bytes
    uint32_t link;
    uint32_t info;
    uint64_t align;
    uint64_t entSize; // entry size for table-like sections (e.g. symtab)
};

struct elf64_sym_ent_t
// ELF64 symbol table entry (Elf64_Sym). `info & 0x0f` is the symbol type
// (0x02 = function); `value` and `size` give the symbol's offset and byte size.
{
    uint32_t name; // offset into .strtab
    uint8_t info;
    uint8_t other;
    uint16_t shIndx; // index of the section containing the symbol
    uint64_t value;
    uint64_t size;
};
63 |
struct cubin_function_t
// A FUNC symbol in the cubin together with the section holding its code.
// All pointers alias the cubin_t::binary buffer (no ownership): `b` is the
// base of the file image, `h` the section header, `e` the symbol entry.
{
    char *name;
    char *b;
    elf64_sec_hdr_t *h;
    elf64_sym_ent_t *e;

    #if 0
    uint64_t *instructions() { return (uint64_t*)(b + h->offset); }
    int num_instructions() { return (int)(h->size / sizeof(uint64_t)); }
    void set_num_instructions(int n) { assert(n >= 0); h->size = n*sizeof(uint64_t); }
    #else
    // e->value is non-zero if the function is inlined, in which case it describes the
    // byte offset of the first instruction in the containing function's instructions.
    uint64_t *instructions() { return (uint64_t*)(b + h->offset + e->value); }
    int num_instructions() { return (int)(e->size/sizeof(uint64_t)); }
    void set_num_instructions(int n)
    {
        assert(n >= 0);
        assert(e->size == h->size && "The function appears to be an inline function. Changing the size of these is beyond the scope of this program.");
        e->size = ((uint64_t)n)*sizeof(uint64_t);
        h->size = ((uint64_t)n)*sizeof(uint64_t);
    }
    #endif

    // The kernel's register count lives in the top byte of the section's info field.
    uint8_t register_count() { return (h->info & 0xff000000)>>24; }
    // NOTE(review): n<<24 promotes n to signed int, so n >= 128 shifts into the
    // sign bit (undefined behavior); ((uint32_t)n)<<24 would be safe. Unchanged here.
    void set_register_count(uint8_t n) { h->info = (h->info & 0x00ffffff) | (n<<24); }
};
92 |
// Fixed parser capacities (asserted against the ELF header counts in read_cubin).
enum { cubin_max_prg_hdrs = 1024 };
enum { cubin_max_sec_hdrs = 1024 };
enum { cubin_max_functions = 1024 };
struct cubin_t
// An in-memory cubin (ELF64) image plus indices into it. `binary` is the
// heap buffer allocated by read_cubin; every header/function pointer aliases it.
{
    int sizeof_binary;
    char *binary;
    elf64_prg_hdr_t *prg_hdrs[cubin_max_prg_hdrs];
    int num_prg_hdrs;

    elf64_sec_hdr_t *sec_hdrs[cubin_max_sec_hdrs];
    int num_sec_hdrs;

    cubin_function_t functions[cubin_max_functions];
    int num_functions;

    // Linear search for a function symbol by (mangled) name; NULL if absent.
    cubin_function_t *get_function(const char *name)
    {
        for (int i = 0; i < num_functions; i++)
            if (strcmp(functions[i].name, name) == 0)
                return functions + i;
        return NULL;
    }
};
117 |
118 | cubin_t read_cubin(const char *filename)
119 | {
120 | {
121 | uint16_t x = 0xaabb;
122 | uint8_t *p = (uint8_t*)&x;
123 | assert(p[0] == 0xbb && "machine is not little (?) endian");
124 | }
125 |
126 | cubin_t cubin = {0};
127 | {
128 | FILE *f = fopen(filename, "rb");
129 | assert(f);
130 | fseek(f, 0, SEEK_END);
131 | long size = ftell(f);
132 | rewind(f);
133 | char *data = new char[size + 1];
134 | int ok = fread(data, 1, size, f);
135 | assert(ok);
136 | data[size] = 0;
137 | fclose(f);
138 |
139 | cubin.binary = data;
140 | cubin.sizeof_binary = size;
141 | }
142 | assert(cubin.binary);
143 | assert(cubin.sizeof_binary);
144 |
145 | elf64_hdr_t elf_hdr = *(elf64_hdr_t*)cubin.binary;
146 | assert(elf_hdr.fileClass == 2 && "assuming 64-bit ELF");
147 | assert((elf_hdr.flags & 0xff) == 60 && "assuming sm_60 architecture");
148 | assert(elf_hdr.flags & 0x400 && "assuming 64-bit addresses");
149 | assert(elf_hdr.phNum <= cubin_max_prg_hdrs);
150 | assert(elf_hdr.shNum <= cubin_max_sec_hdrs);
151 |
152 | // read program headers
153 | {
154 | char *b = cubin.binary + elf_hdr.phOffset;
155 | for (int i = 0; i < elf_hdr.phNum; i++)
156 | {
157 | cubin.prg_hdrs[cubin.num_prg_hdrs++] = (elf64_prg_hdr_t*)b;
158 | b += elf_hdr.phEntSize;
159 | }
160 | }
161 |
162 | // read section headers
163 | {
164 | char *b = cubin.binary + elf_hdr.shOffset;
165 | for (int i = 0; i < elf_hdr.shNum; i++)
166 | {
167 | cubin.sec_hdrs[cubin.num_sec_hdrs++] = (elf64_sec_hdr_t*)b;
168 | b += elf_hdr.shEntSize;
169 | }
170 | }
171 |
172 |
173 | // find section headers called strtab and shstrtab
174 | char *strtab = NULL;
175 | char *shstrtab = NULL;
176 | for (int i = 0; i < cubin.num_sec_hdrs; i++)
177 | {
178 | elf64_sec_hdr_t *sh = (elf64_sec_hdr_t*)cubin.sec_hdrs[i];
179 | if (sh->type == 3)
180 | {
181 | char *data = cubin.binary + sh->offset;
182 | char *name = data + sh->name;
183 | if (strcmp(name, ".strtab") == 0) strtab = data;
184 | else if (strcmp(name, ".shstrtab") == 0) shstrtab = data;
185 |
186 | printf("found section \"%s\"\ndata (%d bytes): ", name, sh->size);
187 | for (int j = 0; j < sh->size; j++)
188 | printf("%c", data[j] ? data[j] : ' ');
189 | printf("\n\n");
190 | }
191 | #if 0
192 | else
193 | {
194 | char *name = shstrtab + sh->name;
195 | uint8_t *data = (uint8_t*)(cubin.binary + sh->offset);
196 | printf("found section \"%s\" (type=%x)\ndata(%d bytes):", name, sh->type, sh->size);
197 | for (int j = 0; j < sh->size; j++)
198 | printf("%02x ", data[j]);
199 | printf("\n\n");
200 | }
201 | #endif
202 | }
203 | assert(strtab);
204 | assert(shstrtab);
205 |
206 | for (int i = 0; i < cubin.num_sec_hdrs; i++)
207 | {
208 | elf64_sec_hdr_t *sh = cubin.sec_hdrs[i];
209 | if (sh->type == 2) // look for symbol table
210 | {
211 | printf("found symbol table section with these symbols:\n");
212 | char *data = cubin.binary + sh->offset;
213 | uint64_t offset = 0;
214 | while (offset < sh->size) // go through each symbol entry
215 | {
216 | elf64_sym_ent_t *ent = (elf64_sym_ent_t*)(data + offset);
217 | offset += sh->entSize;
218 | char *name = strtab + ent->name;
219 |
220 | if ((ent->info & 0x0f) == 0x02) // look for symbols tagged FUNC
221 | {
222 | printf("(function) \"%s\"\n", name);
223 | assert(cubin.num_functions < cubin_max_functions);
224 | cubin_function_t func = {0};
225 | func.name = name;
226 | func.h = cubin.sec_hdrs[ent->shIndx];
227 | func.b = cubin.binary;
228 | func.e = ent;
229 | cubin.functions[cubin.num_functions++] = func;
230 |
231 | // elf64_sec_hdr_t *ent_sh = cubin.sec_hdrs[ent->shIndx];
232 | // printf("section header \"%s\"\n", strtab + ent_sh->name);
233 | }
234 | else
235 | {
236 | printf("(other) \"%s\"\n", name);
237 | }
238 |
239 | #if 0
240 | printf("\tinfo:0x%x\n", ent->info);
241 | printf("\tother:0x%x\n", ent->other);
242 | printf("\tvalue:0x%llx\n", ent->value);
243 | printf("\tsize:0x%llx (%llu)\n", ent->size, ent->size);
244 | #endif
245 | }
246 | }
247 | }
248 |
249 | printf("\nfound %d functions\n", cubin.num_functions);
250 | for (int i = 0; i < cubin.num_functions; i++)
251 | {
252 | printf("\"%s\"\n", cubin.functions[i].name);
253 | printf("\tRegister count: %d\n", cubin.functions[i].register_count());
254 | printf("\tInstructions:\n");
255 | uint64_t *in = cubin.functions[i].instructions();
256 | int num_instructions = cubin.functions[i].num_instructions();
257 | for (int j = 0; j < 10 && j < num_instructions; j++)
258 | printf("\t0x%016llx\n", in[j]);
259 | if (num_instructions > 10)
260 | printf("\t... (%d more instructions)\n", num_instructions - 10);
261 | }
262 | return cubin;
263 | }
264 |
265 | void save_cubin(cubin_t *cubin, const char *filename)
266 | {
267 | FILE *f = fopen(filename, "wb+");
268 | assert(f);
269 | fwrite(cubin->binary, 1, cubin->sizeof_binary, f);
270 | fclose(f);
271 | }
272 |
--------------------------------------------------------------------------------
/src/sass_6_x/instruction.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | namespace backend_sass {
4 |
enum latency_constants_
{
    // All the 32-bit floating point instructions (except sqrt) take exactly
    // 6 cycles before the result is written to and valid. Subsequent instructions
    // that read from this result must therefore be executed atleast six cycles
    // after the first one began. The scheduler tries to fill the gap between one
    // instruction and one that depends on its results by looking for others that
    // do not depend on its results. We conveniently structure our input code into
    // 'blocks' that are entirely independent from other blocks, but the instructions
    // within a block cannot be reordered. If the scheduler can't find enough
    // instructions to fill the pipeline, it will have to insert 'stalls', which
    // do nothing for a given number of clock cycles.
    LATENCY_X32T = 6,

    // sqrt is a variable latency instruction and needs to set a write barrier
    // which dependent instructions must wait on. The later that instruction
    // actually does the wait, the more likely it is that the sqrt is finished,
    // and the barrier does not incur a stall. We work under the assumption that
    // sqrt finishes after 'LATENCY_SQRT' cycles.
    LATENCY_SQRT = 8,

    // Setting the write barrier takes non-zero clock cycles.
    LATENCY_WRTDB = 1,
};

// The instruction subset this backend emits. *20I variants carry a truncated
// 20-bit float immediate in the b slot; _ABS_A/_NEG_B variants bake an operand
// modifier into the opcode.
enum instruction_type_t
{
    INSTRUCTION_FFMA,
    INSTRUCTION_FMUL,
    INSTRUCTION_FADD,
    INSTRUCTION_FFMA20I,
    INSTRUCTION_FMUL20I,
    INSTRUCTION_FADD20I,
    INSTRUCTION_FADD20I_ABS_A,
    INSTRUCTION_FMIN,
    INSTRUCTION_FMAX,
    INSTRUCTION_FMAX_NEG_B,
    INSTRUCTION_SQRT
};

struct instruction_t
// One abstract instruction: named (virtual) operands before scheduling,
// physical registers and control codes after schedule_blocks has run.
{
    instruction_type_t type;
    named_register_t a,b,c; // source registers ("operands")
    named_register_t d; // destination register
    float imm_b; // immediate value in b-slot

    // filled in by scheduler
    uint8_t ra,rb,rc,rd; // physical register addresses (0-255)
    uint8_t reuse; // register reuse flags
    uint8_t yield; // can relinquish control to other warp or not
    uint8_t stall; // number of cycles to wait before continuing
    uint8_t wrtdb; // write dependencies
    uint8_t readb; // read dependencies
    uint8_t watdb; // wait dependencies
};

enum { MAX_INSTRUCTIONS_PER_BLOCK = 64 };
struct instruction_block_t
// An instruction block is a list of instructions that implements a single basic
// AST opcode, either a primitive or an operator. During code generation (parsing
// the AST), we create a list of instruction blocks, evaluating the AST bottom-up.
// During this, we assign to each block up to three register addresses.
// A destination register, where the output of the block is to be stored, and
// a left- and right-child register (for boolean operators).
{
    instruction_t instructions[MAX_INSTRUCTIONS_PER_BLOCK];
    int num_instructions;
    int d,d_left,d_right; // destination, left-child and right-child register slots
};

enum { MAX_INSTRUCTION_BLOCKS = 128 };
struct instruction_blocks_t
// Non-owning view over an array of instruction blocks.
{
    instruction_block_t *blocks;
    int num_blocks;
};
82 |
// Prints one scheduled instruction as a SASS-like mnemonic followed by its
// control codes in watdb:readb:wrtdb:yield:stall form (e.g. "--:-:1:Y:6").
// Barrier indices print 1-based; '-' means "no dependency".
void print_instruction(instruction_t in)
{
    int n = 0; // printed width, used to align the control-code column
    if (in.type==INSTRUCTION_FFMA) n+=printf("FFMA r%d, r%d , r%d, r%d", in.rd, in.ra, in.rb, in.rc);
    else if (in.type==INSTRUCTION_FMUL) n+=printf("FMUL r%d, r%d , r%d", in.rd, in.ra, in.rb);
    else if (in.type==INSTRUCTION_FADD) n+=printf("FADD r%d, r%d , r%d", in.rd, in.ra, in.rb);
    else if (in.type==INSTRUCTION_FFMA20I) n+=printf("FFMA r%d, r%d , %5.2ff, r%d", in.rd, in.ra, in.imm_b, in.rc);
    else if (in.type==INSTRUCTION_FMUL20I) n+=printf("FMUL r%d, r%d , %5.2ff", in.rd, in.ra, in.imm_b);
    else if (in.type==INSTRUCTION_FADD20I) n+=printf("FADD r%d, r%d , %5.2ff", in.rd, in.ra, in.imm_b);
    else if (in.type==INSTRUCTION_FADD20I_ABS_A) n+=printf("FADD r%d, |r%d|, %5.2ff", in.rd, in.ra, in.imm_b);
    else if (in.type==INSTRUCTION_FMIN) n+=printf("FMIN r%d, r%d , r%d", in.rd, in.ra, in.rb);
    else if (in.type==INSTRUCTION_FMAX) n+=printf("FMAX r%d, r%d , r%d", in.rd, in.ra, in.rb);
    else if (in.type==INSTRUCTION_FMAX_NEG_B) n+=printf("FMAX r%d, -r%d , r%d", in.rd, in.ra, in.rb);
    else if (in.type==INSTRUCTION_SQRT) n+=printf("SQRT r%d, r%d", in.rd, in.ra);
    else assert(false);

    // pad the mnemonic out to column 30
    for (int i = n; i < 30; i++)
        printf(" ");

    if (in.watdb) printf("%02x:", in.watdb); else printf("--:");
    if (in.readb==7) printf("-:"); else printf("%d:", in.readb+1);
    if (in.wrtdb==7) printf("-:"); else printf("%d:", in.wrtdb+1);
    if (in.yield) printf("-:"); else printf("Y:");
    printf("%x", in.stall);
    if (in.reuse)
        printf(" reuse: %s%s%s",
            (in.reuse & 1) ? "a" : " ",
            (in.reuse & 2) ? "b" : " ",
            (in.reuse & 4) ? "c" : " ");
    printf("\n");
}
114 |
115 | }
116 |
--------------------------------------------------------------------------------
/src/sass_6_x/registers.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | namespace backend_sass {
4 |
enum named_register_t
// Virtual register names used by code generation; the scheduler maps them
// to physical register addresses (see register_map in scheduler.h).
{
    // This is used to indicate immediate values
    // Note: this enum must be 0 because we use memset to clear instructions
    NO_REGISTER=0,

    // Input position coordinates
    REGISTER_X0,
    REGISTER_Y0,
    REGISTER_Z0,

    // Temporary calculations
    REGISTER_X,
    REGISTER_Y,
    REGISTER_Z,
    REGISTER_W,

    // Result registers (e.g. f(p))
    REGISTER_D, // result is to be stored here
    REGISTER_D_LEFT, // result from left child in tree is stored here
    REGISTER_D_RIGHT, // result from right child in tree is stored here

    // constant zero
    REGISTER_RZ,
    NUM_NAMED_REGISTERS
};
31 |
32 | }
33 |
--------------------------------------------------------------------------------
/src/sass_6_x/scheduler.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | namespace backend_sass {
4 |
instruction_t *
schedule_blocks(instruction_blocks_t blocks, int *return_num_instructions)
// This function performs physical register allocation and instruction scheduling.
// Register allocation maps the virtual register names used by each instruction to
// physical register addresses (0 to 255). Instruction scheduling makes sure that
// enough clock cycles passes between instructions so that the results are ready.
//
// Returns a pointer to a function-local static buffer: the result is only valid
// until the next call, and the function is not reentrant or thread-safe.
{
    enum { max_instructions = 1024 };
    static instruction_t out[max_instructions];
    int num_out = 0;

    enum { max_registers = 256 };
    enum { num_wait_barriers = 6 };
    enum { max_temp_registers = 24 };

    // Tracks which physical register (if any) each of the six scoreboard
    // barriers currently guards.
    struct wait_barrier_t
    {
        uint8_t barrier_on_register[max_registers]; // barrier index per register; 7 = none
        bool is_barrier_active[num_wait_barriers];
        void init()
        {
            for (int i = 0; i < num_wait_barriers; i++)
                is_barrier_active[i] = false;
            for (int i = 0; i < max_registers; i++)
                barrier_on_register[i] = 7;
        }
        bool is_set(uint8_t reg) { return barrier_on_register[reg] != 7; }
        uint8_t set(uint8_t reg) // return wrtdb flag
        {
            // Claim the first free barrier for this register.
            for (int i = 0; i < num_wait_barriers; i++)
            {
                if (!is_barrier_active[i])
                {
                    uint8_t barrier = (uint8_t)(i);
                    barrier_on_register[reg] = barrier;
                    is_barrier_active[i] = true;
                    return barrier;
                }
            }
            assert(false && "Ran out of wait barriers");
            return 7;
        }
        uint8_t wait(uint8_t reg) // return watdb flag (to be OR'd with current flag)
        {
            uint8_t barrier = barrier_on_register[reg];
            assert(barrier != 7 && "Tried to wait on a register that had no wait barrier set.");
            uint8_t watdb = 1 << barrier;
            is_barrier_active[barrier] = false; // waiting releases the barrier
            barrier_on_register[reg] = 7;
            return watdb;
        }
    };

    static wait_barrier_t wait_barrier;
    wait_barrier.init();

    for (int i = 0; i < blocks.num_blocks; i++)
    {
        int d = blocks.blocks[i].d;
        assert(d < max_temp_registers);
        int d_left = blocks.blocks[i].d_left;
        int d_right = blocks.blocks[i].d_right;

        // Map virtual register names to physical addresses. Registers
        // 0x00-0x06 are fixed; the block's result registers are placed at
        // 0x07 + slot. 0xff stands for RZ (constant zero).
        static uint8_t register_map[NUM_NAMED_REGISTERS] = {0};
        register_map[NO_REGISTER] = 0xff;
        register_map[REGISTER_X0] = 0x00;
        register_map[REGISTER_Y0] = 0x01;
        register_map[REGISTER_Z0] = 0x02;
        register_map[REGISTER_X] = 0x03;
        register_map[REGISTER_Y] = 0x04;
        register_map[REGISTER_Z] = 0x05;
        register_map[REGISTER_W] = 0x06;
        register_map[REGISTER_D] = 0x07 + d;
        register_map[REGISTER_D_LEFT] = 0x07 + d_left;
        register_map[REGISTER_D_RIGHT] = 0x07 + d_right;
        register_map[REGISTER_RZ] = 0xff;

        for (int j = 0; j < blocks.blocks[i].num_instructions; j++)
        {
            instruction_t *in = &blocks.blocks[i].instructions[j];
            in->ra = register_map[in->a];
            in->rb = register_map[in->b];
            in->rc = register_map[in->c];
            in->rd = register_map[in->d];
            in->reuse = 0;
            in->watdb = 0;
            in->readb = 7;
            in->wrtdb = 7;
            in->yield = 0;
            // If a pending variable-latency write targets one of our source
            // registers, wait on (and release) its barrier first.
            if (in->a != NO_REGISTER && wait_barrier.is_set(in->ra)) { in->watdb |= wait_barrier.wait(in->ra); }
            if (in->b != NO_REGISTER && wait_barrier.is_set(in->rb)) { in->watdb |= wait_barrier.wait(in->rb); }
            if (in->c != NO_REGISTER && wait_barrier.is_set(in->rc)) { in->watdb |= wait_barrier.wait(in->rc); }

            // If the instruction doesn't have a stall count set already
            // we set it to the latency of the instruction.
            if (in->stall == 0)
            {
                if (in->type == INSTRUCTION_SQRT) in->stall = 1+LATENCY_WRTDB;
                else in->stall = LATENCY_X32T;
            }

            // sqrt has variable latency, so its readers synchronize on a barrier.
            if (in->type == INSTRUCTION_SQRT) in->wrtdb = wait_barrier.set(in->rd);

            // simple reuse tactic: flag an operand slot for the reuse cache when
            // the previous instruction read the same register and didn't write it
#if 1
            if (j > 0)
            {
                instruction_t *last = &blocks.blocks[i].instructions[j-1];
                if (last->a != NO_REGISTER && last->ra == in->ra && last->rd != in->ra) in->reuse |= 1 << 0;
                if (last->b != NO_REGISTER && last->rb == in->rb && last->rd != in->rb) in->reuse |= 1 << 1;
                if (last->c != NO_REGISTER && last->rc == in->rc && last->rd != in->rc) in->reuse |= 1 << 2;
            }
#endif

            out[num_out++] = *in;
            assert(num_out <= max_instructions);
        }
    }

    *return_num_instructions = num_out;
    return out;
}
127 |
128 | }
129 |
--------------------------------------------------------------------------------
/src/sass_6_x/simulator.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | namespace backend_sass {
4 |
5 | struct sass_simulator_t
6 | {
7 | bool debug;
8 | int t;
9 | float reg[256];
10 |
11 | // writes in progress
12 | struct job_t
13 | {
14 | uint8_t dst;
15 | float val;
16 | int t_write;
17 | };
18 | enum { max_write_jobs = 1024 };
19 | job_t writes[max_write_jobs];
20 | int num_writes_waiting;
21 |
22 | // barriers
23 | enum { num_write_barriers = 6 };
24 | int register_on_barrier[num_write_barriers];
25 |
    // Resets the simulator. `_debug` enables diagnostic prints
    // (read-before-write conflicts, barrier waits).
    void init(bool _debug)
    {
        reg[REGISTER_RZ] = 0.0f;
        num_writes_waiting = 0;
        t = 0;
        debug = _debug;
        for (int i = 0; i < num_write_barriers; i++)
            register_on_barrier[i] = -1; // -1 = barrier unused
    }
    // Advances simulated time by `cycles` and retires any in-flight writes
    // that complete by the new time (clearing their write barriers).
    void _step(int cycles)
    {
        t += cycles;
        for (int i = 0; i < num_writes_waiting; i++)
        {
            if (t >= writes[i].t_write)
            {
                reg[writes[i].dst] = writes[i].val;

                // if a write barrier was set on the register we can take it down
                for (int j = 0; j < 6; j++)
                {
                    if (register_on_barrier[j] == writes[i].dst)
                        register_on_barrier[j] = -1;
                }

                // swap-remove; re-examine the element swapped into slot i
                writes[i] = writes[--num_writes_waiting];
                i--;
            }
        }
    }
    // Guards register `reg` with scoreboard barrier `barrier`.
    void _set_write_barrier(uint8_t reg, uint8_t barrier)
    {
        assert(barrier >= 0 && barrier <= num_write_barriers-1);
        assert(register_on_barrier[barrier] == -1 && "overwrote an existing write barrier.");
        register_on_barrier[barrier] = reg;
    }
    // Advances time until the write guarded by `barrier` has landed.
    // A cleared barrier (-1) is a no-op.
    void _wait_on_barrier(uint8_t barrier)
    {
        if (register_on_barrier[barrier] == -1)
            return;
        assert(barrier >= 0 && barrier <= num_write_barriers-1);
        bool resolved = false;
        for (int i = 0; i < num_writes_waiting; i++)
        {
            if (writes[i].dst == (uint8_t)register_on_barrier[barrier])
            {
                int t_to_wait = writes[i].t_write - t;
                if (t_to_wait > 0)
                {
                    if (debug) printf("waited %d cycles on barrier\n", t_to_wait);
                    _step(t_to_wait);
                }
                resolved = true;
                register_on_barrier[barrier] = -1;
            }
        }
        assert(resolved && "waited on a barrier which is not resolved by any on-going writes.");
    }
    // Reads a register; in debug mode reports reads that race a pending write.
    float _read_reg(uint8_t src)
    {
        for (int i = 0; i < num_writes_waiting; i++)
            if (writes[i].dst == src && debug)
                printf("read-before-write conflict on r%d\n", src);
        return reg[src];
    }
    // Queues a register write that becomes visible `latency` cycles from now.
    void _write_reg(uint8_t dst, float val, int latency)
    {
        assert(num_writes_waiting+1 <= max_write_jobs);
        writes[num_writes_waiting].dst = dst;
        writes[num_writes_waiting].val = val;
        writes[num_writes_waiting].t_write = t + latency;
        num_writes_waiting++;
    }
// Execute one scheduled SASS instruction against the simulated register
// file: honor its wait-barrier mask, arm its write barrier, compute the
// result with the opcode's write-back latency, then advance the clock by
// the scheduler-assigned stall count.
void execute(instruction_t in)
{
    using namespace backend_sass;
    // 20-bit-immediate opcodes take operand b from the instruction word
    // rather than from a register.
    bool is_immediate =
        in.type == INSTRUCTION_FFMA20I ||
        in.type == INSTRUCTION_FMUL20I ||
        in.type == INSTRUCTION_FADD20I ||
        in.type == INSTRUCTION_FADD20I_ABS_A;

    // watdb is a 6-bit mask: wait on each flagged dependency barrier
    if (in.watdb)
    {
        if (in.watdb & 1) _wait_on_barrier(0);
        if (in.watdb & 2) _wait_on_barrier(1);
        if (in.watdb & 4) _wait_on_barrier(2);
        if (in.watdb & 8) _wait_on_barrier(3);
        if (in.watdb & 16) _wait_on_barrier(4);
        if (in.watdb & 32) _wait_on_barrier(5);
    }

    // wrtdb == 7 appears to encode "no write barrier" -- TODO confirm
    // against the encoder in sass_6_x/bytecode.h
    if (in.wrtdb != 7) _set_write_barrier(in.rd, in.wrtdb);

    float a = _read_reg(in.ra);
    float b = is_immediate ? in.imm_b : _read_reg(in.rb);
    float c = _read_reg(in.rc);

    // Compute the result and select the write-back latency per opcode.
    float d;
    int lat;
    if (in.type==INSTRUCTION_FFMA) { lat = LATENCY_X32T; d = a*b + c; }
    else if (in.type==INSTRUCTION_FMUL) { lat = LATENCY_X32T; d = a*b; }
    else if (in.type==INSTRUCTION_FADD) { lat = LATENCY_X32T; d = a + b; }
    else if (in.type==INSTRUCTION_FFMA20I) { lat = LATENCY_X32T; d = a*b + c; }
    else if (in.type==INSTRUCTION_FMUL20I) { lat = LATENCY_X32T; d = a*b; }
    else if (in.type==INSTRUCTION_FADD20I) { lat = LATENCY_X32T; d = a + b; }
    else if (in.type==INSTRUCTION_FADD20I_ABS_A) { lat = LATENCY_X32T; d = fabsf(a) + b; }
    else if (in.type==INSTRUCTION_FMIN) { lat = LATENCY_X32T; d = (a < b) ? a : b; }
    else if (in.type==INSTRUCTION_FMAX) { lat = LATENCY_X32T; d = (a > b) ? a : b; }
    else if (in.type==INSTRUCTION_FMAX_NEG_B) { lat = LATENCY_X32T; d = (a > -b) ? a : -b; }
    else if (in.type==INSTRUCTION_SQRT) { lat = LATENCY_SQRT; d = sqrtf(a); }
    else assert(false && "unhandled instruction");

    _write_reg(in.rd, d, lat);
    // consume the stall cycles encoded by the scheduler
    _step(in.stall);

    if (debug) print_instruction(in);
}
144 | };
145 |
146 | }
147 |
--------------------------------------------------------------------------------
/test/backend_glsl.cpp:
--------------------------------------------------------------------------------
1 | #include "../src/backend_glsl.h"
2 | #include "../src/frep_builder.h"
3 |
4 | int main() {
5 | frep_t *f = fBoxCheap(1.0f, 0.5f, 0.25f);
6 | f = fOpUnion(f, fBox(2.0f, 1.0f, 1.0f));
7 | char *s = frep_compile_to_glsl(f);
8 | printf("%s\n", s);
9 | }
10 |
--------------------------------------------------------------------------------
/test/backend_ptx.cpp:
--------------------------------------------------------------------------------
1 | // Example compilation instructions for Linux, g++:
2 | // (Replace include directory with your installation and version of CUDA)
3 | // $ g++ -std=c++11 backend_ptx.cpp -I/usr/local/cuda-10.1/include -lcuda
4 |
5 | #include
6 | #include
7 | #include
8 | #include "util/cuda_error.h"
9 | #include "util/init_cuda.h"
10 |
11 | #define PTX_FP20_IMMEDIATE
12 | #include "../src/frep.h"
13 | #include "../src/frep_eval.h"
14 | #include "../src/frep_builder.h"
15 | #include "../src/backend_ptx.h"
16 |
17 | // This generates a PTX program equivalent to:
18 | // float tree(float x, float y, float z) {
19 | // // generated PTX instructions
20 | // }
21 | // void main(vec4 *input, float *output) {
22 | // int tid = threadIdx.x + blockDim.x*blockIdx.x;
23 | // vec4 p = input[tid];
24 | // output[tid] = tree(p.x, p.y, p.z);
25 | // }
26 | // Note: out_length _DOES NOT_ include the null-terminator.
27 | char *generate_ptx_program(frep_t *f, size_t *out_length)
28 | {
29 | const char *ptx_template = R"str(
30 | .version 6.0
31 | .target sm_60
32 | .address_size 64
33 | .func (.reg.f32 f%d) tree(.reg.f32 x0, .reg.f32 y0, .reg.f32 z0) {
34 | .reg.f32 f<%d>;
35 | %s
36 | ret.uni;
37 | }
38 | .visible.entry main(.param.u64 param0, .param.u64 param1) {
39 | .reg.f32 x0;
40 | .reg.f32 y0;
41 | .reg.f32 z0;
42 | .reg.f32 w0;
43 | .reg.b32 r<5>;
44 | .reg.b64 rd<9>;
45 | .reg.f32 d;
46 | ld.param.u64 rd1, [param0];
47 | ld.param.u64 rd2, [param1];
48 | cvta.to.global.u64 rd3, rd2;
49 | cvta.to.global.u64 rd4, rd1;
50 | mov.u32 r1, %%tid.x; // threadIdx.x
51 | mov.u32 r2, %%ctaid.x; // blockIdx.x
52 | mov.u32 r3, %%ntid.x; // blockDim.x
53 | mad.lo.s32 r4, r3, r2, r1; // blockDim.x*blockIdx.x + threadIdx.x
54 | mul.wide.s32 rd5, r4, 16; // sizeof(vec4)*(blockDim.x*blockIdx.x + threadIdx.x)
55 | add.s64 rd6, rd4, rd5; // param0 + sizeof(vec4)*(blockDim.x*blockIdx.x + threadIdx.x)
56 | ld.global.v4.f32 {x0, y0, z0, w0}, [rd6];
57 | mul.wide.s32 rd7, r4, 4; // sizeof(float)*(blockDim.x*blockIdx.x + threadIdx.x)
58 | add.s64 rd8, rd3, rd7; // param1 + sizeof(float)*(blockDim.x*blockIdx.x + threadIdx.x)
59 | call.uni (d), tree, (x0,y0,z0);
60 | st.global.f32 [rd8], d;
61 | ret;
62 | }
63 | )str";
64 |
65 | static char buffer[10*1024*1024];
66 | char *stream = buffer;
67 | int result_register;
68 | char *ptx = frep_compile_to_ptx(f, &result_register);
69 | stream += sprintf(stream, ptx_template, result_register, result_register, ptx);
70 | *out_length = (stream - buffer);
71 | return buffer;
72 | }
73 |
// JIT-compile and link a PTX source string into a loaded CUmodule using
// the CUDA driver's linker. 'jit_optimization_level' must be in [0,4]
// (the ptxas -O level). Prints the driver's info log and link wall time.
// NOTE(review): on a failed cuLinkAddData this prints the error log but
// still proceeds to cuLinkComplete -- acceptable for a test harness.
CUmodule load_ptx_program(
    const char *ptx_source, size_t ptx_source_length,
    int jit_optimization_level)
{
    CUmodule module;
    void *cubin; size_t cubin_size;
    CUlinkState link_state;
    enum { num_options = 8 };
    CUjit_option options[num_options];
    void *option_values[num_options];
    float walltime; // filled in by the driver (milliseconds)
    char error_log[8192], info_log[8192];

    assert(jit_optimization_level >= 0 && jit_optimization_level <= 4);

    // see CUDA Driver API manual for these options (look up cuLinkCreate);
    // scalar option values are smuggled through the void* slot by casting
    options[0] = CU_JIT_WALL_TIME; option_values[0] = (void *) &walltime;
    options[1] = CU_JIT_INFO_LOG_BUFFER; option_values[1] = (void *) info_log;
    options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; option_values[2] = (void *) (long)sizeof(info_log);
    options[3] = CU_JIT_ERROR_LOG_BUFFER; option_values[3] = (void *) error_log;
    options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; option_values[4] = (void *) (long)sizeof(error_log);
    options[5] = CU_JIT_LOG_VERBOSE; option_values[5] = (void *) 1;
    options[6] = CU_JIT_TARGET; option_values[6] = (void *) CU_TARGET_COMPUTE_60;
    options[7] = CU_JIT_OPTIMIZATION_LEVEL; option_values[7] = (void *) (long)jit_optimization_level;
    cudaCheckError(cuLinkCreate(num_options, options, option_values, &link_state));

    // +1: the driver expects the length to include the null-terminator
    int err = cuLinkAddData(link_state, CU_JIT_INPUT_PTX, (void *)ptx_source, ptx_source_length+1, 0, 0, 0, 0);
    if (err != CUDA_SUCCESS)
        fprintf(stderr, "PTX Linker Error:\n%s\n", error_log);
    cudaCheckError(cuLinkComplete(link_state, &cubin, &cubin_size));
    printf("Linking done in %fms. Linker Output:\n%s\n", walltime, info_log);

    // the module must be loaded before the link state (which owns the
    // cubin memory) is destroyed
    cudaCheckError(cuModuleLoadData(&module, cubin)); assert(module);
    cudaCheckError(cuLinkDestroy(link_state));
    return module;
}
110 |
111 | void run_ptx_program(
112 | void *input, size_t sizeof_input,
113 | void *output, size_t sizeof_output,
114 | const char *ptx_source, size_t ptx_source_length, const char *entry_name,
115 | int num_blocks, int threads_per_block, int shared_memory_bytes=1024,
116 | int jit_optimization_level=1 /*allowed values = 0,1,2,3,4*/)
117 | {
118 | CUdeviceptr dev_input;
119 | CUdeviceptr dev_output;
120 | cudaCheckError(cuMemAlloc(&dev_input, sizeof_input)); assert(dev_input);
121 | cudaCheckError(cuMemAlloc(&dev_output, sizeof_output)); assert(dev_output);
122 | cudaCheckError(cuMemcpyHtoD(dev_input, input, sizeof_input));
123 | CUmodule module = load_ptx_program(ptx_source, ptx_source_length, jit_optimization_level);
124 | CUfunction kernel = 0;
125 | cudaCheckError(cuModuleGetFunction(&kernel, module, entry_name));
126 | uint64_t param0 = (uint64_t)(dev_input);
127 | uint64_t param1 = (uint64_t)(dev_output);
128 | void *kernel_params[] = { (void*)¶m0, (void*)¶m1 };
129 | cuLaunchKernel(kernel, num_blocks,1,1, threads_per_block,1,1, shared_memory_bytes, NULL, kernel_params, NULL);
130 | cudaCheckError(cuCtxSynchronize());
131 | cudaCheckError(cuMemcpyDtoH(output, dev_output, sizeof_output));
132 | cudaCheckError(cuMemFree(dev_output));
133 | cudaCheckError(cuMemFree(dev_input));
134 | cudaCheckError(cuModuleUnload(module));
135 | }
136 |
137 | void run_test(int test_number, frep_t *f)
138 | {
139 | printf("///////////////////////////////////////////////////\n");
140 | printf(" running test number %d\n", test_number);
141 |
142 | const int num_points_x = 4;
143 | const int num_points_y = 4;
144 | const int num_points_z = 4;
145 | const int num_threads = 32;
146 | const int num_points = num_points_x*num_points_y*num_points_z;
147 | const int num_blocks = num_points/num_threads;
148 | const int sizeof_input = num_points*4*sizeof(float);
149 | const int sizeof_output = num_points*1*sizeof(float);
150 |
151 | float *output = (float*)malloc(sizeof_output); assert(output);
152 | float *cpu_output = (float*)malloc(sizeof_output); assert(cpu_output);
153 | float *input = (float*)malloc(num_points*4*sizeof(float));
154 |
155 | // generate input array data (points sampled in regular grid)
156 | {
157 | float *p = input;
158 | for (int zi = 0; zi < num_points_z; zi++)
159 | for (int yi = 0; yi < num_points_y; yi++)
160 | for (int xi = 0; xi < num_points_x; xi++)
161 | {
162 | p[0] = (-1.0f + 2.0f*xi/num_points_x);
163 | p[1] = (-1.0f + 2.0f*yi/num_points_y);
164 | p[2] = (-1.0f + 2.0f*zi/num_points_z);
165 | p[3] = 0.0f;
166 | p += 4;
167 | }
168 | }
169 |
170 | // compute expected output using CPU-based evaluator
171 | {
172 | for (int i = 0; i < num_points; i++)
173 | {
174 | float x = input[4*i + 0];
175 | float y = input[4*i + 1];
176 | float z = input[4*i + 2];
177 | cpu_output[i] = frep_eval(f, x, y, z);
178 | }
179 | }
180 |
181 | // compute output using GPU
182 | {
183 | size_t ptx_length;
184 | char *ptx_source = generate_ptx_program(f, &ptx_length);
185 | run_ptx_program(
186 | input, sizeof_input,
187 | output, sizeof_output,
188 | ptx_source, ptx_length,
189 | "main",
190 | num_blocks, num_threads);
191 | }
192 |
193 | // verify that GPU output matches CPU output
194 | for (int i = 0; i < num_points; i++)
195 | {
196 | float d_cpu = cpu_output[i];
197 | float d_ptx = output[i];
198 | if (fabsf(d_cpu - d_ptx) > 0.01f)
199 | {
200 | float x = input[4*i + 0];
201 | float y = input[4*i + 1];
202 | float z = input[4*i + 2];
203 | printf("\nEvaluation mismatch!\n");
204 | printf("cpu: f(%.2f,%.2f,%.2f) = %f\n", x, y, z, d_cpu);
205 | printf("ptx: f(%.2f,%.2f,%.2f) = %f\n", x, y, z, d_ptx);
206 | exit(1);
207 | }
208 | }
209 |
210 | free(output);
211 | free(cpu_output);
212 | free(input);
213 | }
214 |
215 | int main(int argc, char **argv)
216 | {
217 | init_cuda();
218 |
219 | frep_t *f = fBoxCheap(1.0f, 0.5f, 0.25f);
220 | run_test(1, f);
221 |
222 | return 0;
223 | }
224 |
--------------------------------------------------------------------------------
/test/backend_sass_6_x.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "util/cuda_error.h"
5 | #include "util/init_cuda.h"
6 | #include "../src/frep.h"
7 | #include "../src/frep_eval.h"
8 | #include "../src/frep_builder.h"
9 | #include "../src/backend_sass.h"
10 |
11 | CUmodule link_sass(CUmodule *module,
12 | void *cubin1, size_t sizeof_cubin1,
13 | void *cubin2, size_t sizeof_cubin2);
14 |
// End-to-end test of the SASS backend: compile a host-side entry kernel
// with nvcc, compile a CSG tree directly to a relocatable cubin, link the
// two, and launch the result on the GPU.
int main(int argc, char **argv)
{
    // disable the driver's JIT cache so every run exercises the real linker
    setenv("CUDA_CACHE_DISABLE", "1", 1);
    init_cuda();

    // build the relocatable entry-point cubin from main.cu
    // NOTE(review): hardcoded CUDA 10.1 install path -- adjust per machine
    system("/usr/local/cuda-10.1/bin/nvcc "
           "--gpu-architecture=sm_60 "
           "--cubin "
           "--relocatable-device-code=true "
           "main.cu "
           "--output-file main.cubin");

    size_t sizeof_cubin_main;
    void *cubin_main = read_file("main.cubin", &sizeof_cubin_main);

    frep_t *tree = fBoxCheap(1.0f, 0.5f, 0.25f);

    // compile the CSG tree straight to a relocatable cubin (no ptxas pass)
    size_t sizeof_cubin_tree;
    void *cubin_tree = frep_compile_to_sass(tree, &sizeof_cubin_tree);

    CUmodule module = 0;
    link_sass(&module, cubin_main, sizeof_cubin_main, cubin_tree, sizeof_cubin_tree);

    CUfunction kernel;
    cudaCheckError(cuModuleGetFunction(&kernel, module, "main")); assert(kernel);

    //
    // finally we run the thing to make sure that it actually works.
    //
    int N = 32;
    size_t sizeof_input = 4*N*sizeof(float); // one vec4 per point
    size_t sizeof_output = N*sizeof(float);  // one result per point
    float *input = (float*)malloc(sizeof_input);
    float *output = (float*)malloc(sizeof_output);

    // every thread evaluates the same point (1,0,0)
    // NOTE(review): malloc results are unchecked here -- fine for a test
    for (int i = 0; i < N; i++)
    {
        input[4*i + 0] = 1.0f;
        input[4*i + 1] = 0.0f;
        input[4*i + 2] = 0.0f;
        input[4*i + 3] = 0.0f;
    }

    int num_blocks = 8;
    int num_threads = 4;
    int shared_memory_bytes = 1024;
    CUdeviceptr dev_input;
    CUdeviceptr dev_output;
    cudaCheckError(cuMemAlloc(&dev_input, sizeof_input)); assert(dev_input);
    cudaCheckError(cuMemAlloc(&dev_output, sizeof_output)); assert(dev_output);
    cudaCheckError(cuMemcpyHtoD(dev_input, input, sizeof_input));
    // kernel arguments are passed as an array of pointers to the values
    uint64_t param0 = (uint64_t)(dev_input);
    uint64_t param1 = (uint64_t)(dev_output);
    void *kernel_params[] = { (void*)&param0, (void*)&param1 };
    // NOTE(review): launch result is not checked, unlike the other calls
    cuLaunchKernel(kernel, num_blocks,1,1, num_threads,1,1, shared_memory_bytes, NULL, kernel_params, NULL);
    cudaCheckError(cuCtxSynchronize());
    cudaCheckError(cuMemcpyDtoH(output, dev_output, sizeof_output));
    cudaCheckError(cuMemFree(dev_output));
    cudaCheckError(cuMemFree(dev_input));

    cudaCheckError(cuModuleUnload(module));

    printf("output:\n");
    for (int i = 0; i < N; i++)
        printf("%f ", output[i]);

    return 0;
}
83 |
84 | void link_sass(CUmodule *module,
85 | void *cubin1, size_t sizeof_cubin1,
86 | void *cubin2, size_t sizeof_cubin2)
87 | {
88 | enum { num_options = 6 };
89 | CUjit_option options[num_options];
90 | void *option_values[num_options];
91 | char error_log[8192];
92 | char info_log[8192];
93 | options[0] = CU_JIT_INFO_LOG_BUFFER; option_values[0] = (void *) info_log;
94 | options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; option_values[1] = (void *) (long)sizeof(info_log);
95 | options[2] = CU_JIT_ERROR_LOG_BUFFER; option_values[2] = (void *) error_log;
96 | options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; option_values[3] = (void *) (long)sizeof(error_log);
97 | options[4] = CU_JIT_LOG_VERBOSE; option_values[4] = (void *) 1;
98 | options[5] = CU_JIT_TARGET; option_values[5] = (void *) CU_TARGET_COMPUTE_60;
99 | CUlinkState link_state;
100 | cudaCheckError(cuLinkCreate(num_options, options, option_values, &link_state));
101 |
102 | if (CUDA_SUCCESS != cuLinkAddData(link_state, CU_JIT_INPUT_CUBIN,
103 | (void *)cubin_main, sizeof_cubin_main, 0,0,0,0))
104 | fprintf(stderr, "nvlink error:\n%s\n", error_log);
105 |
106 | if (CUDA_SUCCESS != cuLinkAddData(link_state, CU_JIT_INPUT_CUBIN,
107 | (void *)cubin_tree, sizeof_cubin_tree, 0,0,0,0))
108 | fprintf(stderr, "nvlink error:\n%s\n", error_log);
109 |
110 | void *cubin;
111 | size_t cubin_size;
112 | cudaCheckError(cuLinkComplete(link_state, &cubin, &cubin_size));
113 | cudaCheckError(cuModuleLoadData(module, cubin)); assert(module);
114 | cudaCheckError(cuLinkDestroy(link_state));
115 | }
116 |
--------------------------------------------------------------------------------
/test/backend_sass_6_x_mock.cpp:
--------------------------------------------------------------------------------
1 | #define COMPUTE_CAPABILITY_6_X
2 | #include
3 | #include
4 | #include
5 | #include "../src/frep.h"
6 | #include "../src/frep_builder.h"
7 | #include "../src/frep_eval.h"
8 | #include "../src/backend_sass.h"
9 | #include "../src/sass_6_x/simulator.h"
10 |
11 | using namespace backend_sass;
12 |
13 | float frep_eval_sass(
14 | float x0, float y0, float z0,
15 | instruction_t *instructions, int num_instructions,
16 | bool debug=false)
17 | {
18 | static sass_simulator_t sim = {0};
19 | sim.init(debug);
20 | sim.reg[0x00] = x0;
21 | sim.reg[0x01] = y0;
22 | sim.reg[0x02] = z0;
23 | for (int i = 0; i < num_instructions; i++)
24 | sim.execute(instructions[i]);
25 | return sim.reg[0x07];
26 | }
27 |
28 | void run_test(int test_number, frep_t *tree)
29 | {
30 | instruction_blocks_t blocks = generate_blocks(tree);
31 |
32 | int num_instructions;
33 | instruction_t *instructions = schedule_blocks(blocks, &num_instructions);
34 |
35 | printf("///////////////////////////////////////////////////\n");
36 | printf(" test number %d\n", test_number);
37 |
38 | frep_eval_sass(0.0f,0.0f,0.0f, instructions, num_instructions, true);
39 |
40 | for (int i = -4; i <= 4; i++)
41 | for (int j = -4; j <= 4; j++)
42 | for (int k = -4; k <= 4; k++)
43 | {
44 | float x0 = i/4.0f;
45 | float y0 = j/4.0f;
46 | float z0 = k/4.0f;
47 | float f_sass = frep_eval_sass(x0,y0,z0, instructions, num_instructions);
48 | float f_true = frep_eval(tree, x0,y0,z0);
49 | if (fabsf(f_sass - f_true) > 0.00001f)
50 | {
51 | printf("\nEvaluation mismatch!\n");
52 | printf("true: f(%.2f,%.2f,%.2f) = %f\n", x0,y0,z0,f_true);
53 | printf("sass: f(%.2f,%.2f,%.2f) = %f\n", x0,y0,z0,f_sass);
54 | exit(1);
55 | }
56 | }
57 | printf("ok!\n");
58 | }
59 |
60 | int main()
61 | {
62 | frep_t *tree;
63 |
64 | tree = fBoxCheap(0.9f,0.6f,0.3f);
65 | run_test(0, tree);
66 |
67 | tree = fSphere(0.3f);
68 | run_test(1, tree);
69 |
70 | tree = fCylinder(0.6f,0.3f);
71 | run_test(2, tree);
72 |
73 | tree = fPlane(1.0f, 0.3f);
74 | pOpRotate(tree, 0.3f,0.5f,0.4f);
75 | pOpTranslate(tree, 0.2f,0.5f,0.4f);
76 | run_test(3, tree);
77 |
78 | frep_t *d1 = fBoxCheap(1.0f,0.5f,0.25f);
79 | pOpRotate(d1, 0.1f,0.4f,0.3f);
80 | pOpTranslate(d1, 0.5f,0.25f,0.25f);
81 | frep_t *d2 = fSphere(0.8f);
82 | pOpTranslate(d2, 1.0f,0,0);
83 | frep_t *d3 = fCylinder(0.4f, 0.2f);
84 | pOpTranslate(d3, 1.0f, 1.0f, 0.3f);
85 | tree = fOpUnion(fOpUnion(d1, d2), d3);
86 | run_test(4, tree);
87 | }
88 |
--------------------------------------------------------------------------------
/test/linker.cpp:
--------------------------------------------------------------------------------
1 | // This file tests the use of separate compilation to link together
2 | // pre-existing (relocatable) Cubin files. This is useful because we
3 | // can use the CUDA Driver API to generate an executable Cubin from
4 | // the output of our SASS backend and a user-provided Cubin containing
5 | // the entrypoint.
6 | //
7 | // To compile this file on Linux using g++:
8 | // $ g++ -std=c++11 linker.cpp -I/usr/local/cuda-10.1/include -lcuda
9 | //
10 | #include
11 | #include
12 | #include
13 | #include "util/cuda_error.h"
14 | #include "util/init_cuda.h"
15 | #include "util/read_file.h"
16 | #define ENABLE_TIMING
17 | #include "util/profiler.h"
18 |
19 | int main() {
20 | init_cuda();
21 |
22 | //
23 | // Generate relocatable SASS binaries by invoking the PTX assembler
24 | // on our two test files. Neither of these can be executed on their
25 | // own, so we will link them together into an actual executable using
26 | // the CUDA linker in the Driver API.
27 | //
28 | system("/usr/local/cuda-10.1/bin/ptxas --opt-level 1 --compile-only --gpu-name sm_60 test1.ptx --output-file test1.cubin");
29 | system("/usr/local/cuda-10.1/bin/ptxas --opt-level 1 --compile-only --gpu-name sm_60 test2.ptx --output-file test2.cubin");
30 |
31 | int sizeof_cubin1 = 0;
32 | void *cubin1 = (void*)read_file("test1.cubin", &sizeof_cubin1);
33 | assert(cubin1);
34 |
35 | int sizeof_cubin2 = 0;
36 | void *cubin2 = (void*)read_file("test2.cubin", &sizeof_cubin2);
37 | assert(cubin2);
38 |
39 | CUfunction kernel;
40 | CUmodule module;
41 | const char *entry_name = "main";
42 |
43 | // We do this 100 times and measure the time it takes the driver to
44 | // link together the Cubin file, and report the average in ms.
45 | for (int i = 0; i < 100; i++)
46 | {
47 | TIMING("linker");
48 |
49 | //
50 | // initialize the linker. note: CU_JIT_TARGET must match compute mode
51 | // specified in test1.ptx and test2.ptx, and the --gpu-name argument
52 | // passed to ptxas above.
53 | //
54 | enum { num_options = 6 };
55 | CUjit_option options[num_options];
56 | void *option_values[num_options];
57 | char error_log[8192];
58 | char info_log[8192];
59 | options[0] = CU_JIT_INFO_LOG_BUFFER; option_values[0] = (void *) info_log;
60 | options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; option_values[1] = (void *) (long)sizeof(info_log);
61 | options[2] = CU_JIT_ERROR_LOG_BUFFER; option_values[2] = (void *) error_log;
62 | options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; option_values[3] = (void *) (long)sizeof(error_log);
63 | options[4] = CU_JIT_LOG_VERBOSE; option_values[4] = (void *) 1;
64 | options[5] = CU_JIT_TARGET; option_values[5] = (void *) CU_TARGET_COMPUTE_60;
65 | CUlinkState link_state;
66 | cudaCheckError(cuLinkCreate(num_options, options, option_values, &link_state));
67 |
68 | if (CUDA_SUCCESS != cuLinkAddData(link_state, CU_JIT_INPUT_CUBIN,
69 | (void *)cubin1, sizeof_cubin1, 0,0,0,0))
70 | fprintf(stderr, "nvlink error:\n%s\n", error_log);
71 |
72 | if (CUDA_SUCCESS != cuLinkAddData(link_state, CU_JIT_INPUT_CUBIN,
73 | (void *)cubin2, sizeof_cubin2, 0,0,0,0))
74 | fprintf(stderr, "nvlink error:\n%s\n", error_log);
75 |
76 | void *cubin;
77 | size_t cubin_size;
78 | cudaCheckError(cuLinkComplete(link_state, &cubin, &cubin_size));
79 |
80 | cudaCheckError(cuModuleLoadData(&module, cubin)); assert(module);
81 | cudaCheckError(cuLinkDestroy(link_state));
82 | cudaCheckError(cuModuleGetFunction(&kernel, module, entry_name)); assert(kernel);
83 |
84 | TIMING("linker");
85 | }
86 | assert(kernel);
87 |
88 | // Print the average linking time in milliseconds
89 | TIMING_SUMMARY();
90 |
91 | //
92 | // finally we run the thing to make sure that it actually works.
93 | //
94 | int N = 32;
95 | size_t sizeof_input = 4*N*sizeof(float);
96 | size_t sizeof_output = N*sizeof(float);
97 | float *input = (float*)malloc(sizeof_input);
98 | float *output = (float*)malloc(sizeof_output);
99 |
100 | for (int i = 0; i < 32; i++)
101 | {
102 | input[4*i + 0] = 1.1f;
103 | input[4*i + 1] = 0.0f;
104 | input[4*i + 2] = 0.0f;
105 | input[4*i + 3] = 0.0f;
106 | }
107 |
108 | int num_blocks = 8;
109 | int num_threads = 4;
110 | int shared_memory_bytes = 1024;
111 | CUdeviceptr dev_input;
112 | CUdeviceptr dev_output;
113 | cudaCheckError(cuMemAlloc(&dev_input, sizeof_input)); assert(dev_input);
114 | cudaCheckError(cuMemAlloc(&dev_output, sizeof_output)); assert(dev_output);
115 | cudaCheckError(cuMemcpyHtoD(dev_input, input, sizeof_input));
116 | uint64_t param0 = (uint64_t)(dev_input);
117 | uint64_t param1 = (uint64_t)(dev_output);
118 | void *kernel_params[] = { (void*)¶m0, (void*)¶m1 };
119 | cuLaunchKernel(kernel, num_blocks,1,1, num_threads,1,1, shared_memory_bytes, NULL, kernel_params, NULL);
120 | cudaCheckError(cuCtxSynchronize());
121 | cudaCheckError(cuMemcpyDtoH(output, dev_output, sizeof_output));
122 | cudaCheckError(cuMemFree(dev_output));
123 | cudaCheckError(cuMemFree(dev_input));
124 | cudaCheckError(cuModuleUnload(module));
125 |
126 | printf("output:\n");
127 | for (int i = 0; i < N; i++)
128 | printf("%f ", output[i]);
129 | }
130 |
--------------------------------------------------------------------------------
/test/test1.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lightbits/fast-csg/53b14f651e9544580ba31ba0f157221a77ba44fe/test/test1.cubin
--------------------------------------------------------------------------------
/test/test1.ptx:
--------------------------------------------------------------------------------
1 | .version 6.0
2 | .target sm_60
3 | .address_size 64
4 |
5 | .visible .func (.reg.f32 f1) tree(.reg.f32 x0, .reg.f32 y0, .reg.f32 z0) {
6 | .reg.f32 x;
7 | .reg.f32 y;
8 | .reg.f32 z;
9 | abs.f32.ftz x, x0;
10 | abs.f32.ftz y, y0;
11 | abs.f32.ftz z, z0;
12 | sub.f32.ftz x,x,1.0;
13 | sub.f32.ftz y,y,0.5;
14 | sub.f32.ftz z,z,0.25;
15 | max.f32.ftz f1,x,y;
16 | max.f32.ftz f1,f1,z;
17 | ret.uni;
18 | }
19 |
--------------------------------------------------------------------------------
/test/test2.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lightbits/fast-csg/53b14f651e9544580ba31ba0f157221a77ba44fe/test/test2.cubin
--------------------------------------------------------------------------------
/test/test2.ptx:
--------------------------------------------------------------------------------
1 | .version 6.0
2 | .target sm_60
3 | .address_size 64
4 |
5 | .extern .func (.reg.f32 f1) tree(.reg.f32 x0, .reg.f32 y0, .reg.f32 z0)
6 |
7 | .visible.entry main(.param.u64 param0, .param.u64 param1) {
8 | .reg.f32 x0;
9 | .reg.f32 y0;
10 | .reg.f32 z0;
11 | .reg.f32 w0;
12 | .reg.b32 r<5>;
13 | .reg.b64 rd<9>;
14 | .reg.f32 d;
15 | ld.param.u64 rd1, [param0];
16 | ld.param.u64 rd2, [param1];
17 | cvta.to.global.u64 rd3, rd2;
18 | cvta.to.global.u64 rd4, rd1;
19 | mov.u32 r1, %tid.x; // threadIdx.x
20 | mov.u32 r2, %ctaid.x; // blockIdx.x
21 | mov.u32 r3, %ntid.x; // blockDim.x
22 | mad.lo.s32 r4, r3, r2, r1; // blockDim.x*blockIdx.x + threadIdx.x
23 | mul.wide.s32 rd5, r4, 16; // sizeof(vec4)*(blockDim.x*blockIdx.x + threadIdx.x)
24 | add.s64 rd6, rd4, rd5; // param0 + sizeof(vec4)*(blockDim.x*blockIdx.x + threadIdx.x)
25 | ld.global.v4.f32 {x0, y0, z0, w0}, [rd6];
26 | mul.wide.s32 rd7, r4, 4; // sizeof(float)*(blockDim.x*blockIdx.x + threadIdx.x)
27 | add.s64 rd8, rd3, rd7; // param1 + sizeof(float)*(blockDim.x*blockIdx.x + threadIdx.x)
28 | call.uni (d), tree, (x0,y0,z0);
29 | st.global.f32 [rd8], d;
30 | ret;
31 | }
32 |
--------------------------------------------------------------------------------
/test/util/cuda_error.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include
3 |
4 | static const char *cudaErrorToString(CUresult error)
5 | {
6 | switch (error)
7 | {
8 | case CUDA_SUCCESS:
9 | return "CUDA_SUCCESS";
10 |
11 | case CUDA_ERROR_INVALID_VALUE:
12 | return "CUDA_ERROR_INVALID_VALUE";
13 |
14 | case CUDA_ERROR_OUT_OF_MEMORY:
15 | return "CUDA_ERROR_OUT_OF_MEMORY";
16 |
17 | case CUDA_ERROR_NOT_INITIALIZED:
18 | return "CUDA_ERROR_NOT_INITIALIZED";
19 |
20 | case CUDA_ERROR_DEINITIALIZED:
21 | return "CUDA_ERROR_DEINITIALIZED";
22 |
23 | case CUDA_ERROR_PROFILER_DISABLED:
24 | return "CUDA_ERROR_PROFILER_DISABLED";
25 |
26 | case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
27 | return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
28 |
29 | case CUDA_ERROR_PROFILER_ALREADY_STARTED:
30 | return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
31 |
32 | case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
33 | return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
34 |
35 | case CUDA_ERROR_NO_DEVICE:
36 | return "CUDA_ERROR_NO_DEVICE";
37 |
38 | case CUDA_ERROR_INVALID_DEVICE:
39 | return "CUDA_ERROR_INVALID_DEVICE";
40 |
41 | case CUDA_ERROR_INVALID_IMAGE:
42 | return "CUDA_ERROR_INVALID_IMAGE";
43 |
44 | case CUDA_ERROR_INVALID_CONTEXT:
45 | return "CUDA_ERROR_INVALID_CONTEXT";
46 |
47 | case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
48 | return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
49 |
50 | case CUDA_ERROR_MAP_FAILED:
51 | return "CUDA_ERROR_MAP_FAILED";
52 |
53 | case CUDA_ERROR_UNMAP_FAILED:
54 | return "CUDA_ERROR_UNMAP_FAILED";
55 |
56 | case CUDA_ERROR_ARRAY_IS_MAPPED:
57 | return "CUDA_ERROR_ARRAY_IS_MAPPED";
58 |
59 | case CUDA_ERROR_ALREADY_MAPPED:
60 | return "CUDA_ERROR_ALREADY_MAPPED";
61 |
62 | case CUDA_ERROR_NO_BINARY_FOR_GPU:
63 | return "CUDA_ERROR_NO_BINARY_FOR_GPU";
64 |
65 | case CUDA_ERROR_ALREADY_ACQUIRED:
66 | return "CUDA_ERROR_ALREADY_ACQUIRED";
67 |
68 | case CUDA_ERROR_NOT_MAPPED:
69 | return "CUDA_ERROR_NOT_MAPPED";
70 |
71 | case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
72 | return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
73 |
74 | case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
75 | return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
76 |
77 | case CUDA_ERROR_ECC_UNCORRECTABLE:
78 | return "CUDA_ERROR_ECC_UNCORRECTABLE";
79 |
80 | case CUDA_ERROR_UNSUPPORTED_LIMIT:
81 | return "CUDA_ERROR_UNSUPPORTED_LIMIT";
82 |
83 | case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
84 | return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
85 |
86 | case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED:
87 | return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED";
88 |
89 | case CUDA_ERROR_INVALID_PTX:
90 | return "CUDA_ERROR_INVALID_PTX";
91 |
92 | case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT:
93 | return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT";
94 |
95 | case CUDA_ERROR_NVLINK_UNCORRECTABLE:
96 | return "CUDA_ERROR_NVLINK_UNCORRECTABLE";
97 |
98 | case CUDA_ERROR_JIT_COMPILER_NOT_FOUND:
99 | return "CUDA_ERROR_JIT_COMPILER_NOT_FOUND";
100 |
101 | case CUDA_ERROR_INVALID_SOURCE:
102 | return "CUDA_ERROR_INVALID_SOURCE";
103 |
104 | case CUDA_ERROR_FILE_NOT_FOUND:
105 | return "CUDA_ERROR_FILE_NOT_FOUND";
106 |
107 | case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
108 | return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
109 |
110 | case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
111 | return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
112 |
113 | case CUDA_ERROR_OPERATING_SYSTEM:
114 | return "CUDA_ERROR_OPERATING_SYSTEM";
115 |
116 | case CUDA_ERROR_INVALID_HANDLE:
117 | return "CUDA_ERROR_INVALID_HANDLE";
118 |
119 | case CUDA_ERROR_NOT_FOUND:
120 | return "CUDA_ERROR_NOT_FOUND";
121 |
122 | case CUDA_ERROR_NOT_READY:
123 | return "CUDA_ERROR_NOT_READY";
124 |
125 | case CUDA_ERROR_ILLEGAL_ADDRESS:
126 | return "CUDA_ERROR_ILLEGAL_ADDRESS";
127 |
128 | case CUDA_ERROR_LAUNCH_FAILED:
129 | return "CUDA_ERROR_LAUNCH_FAILED";
130 |
131 | case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
132 | return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
133 |
134 | case CUDA_ERROR_LAUNCH_TIMEOUT:
135 | return "CUDA_ERROR_LAUNCH_TIMEOUT";
136 |
137 | case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
138 | return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
139 |
140 | case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
141 | return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
142 |
143 | case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
144 | return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
145 |
146 | case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
147 | return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
148 |
149 | case CUDA_ERROR_CONTEXT_IS_DESTROYED:
150 | return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
151 |
152 | case CUDA_ERROR_ASSERT:
153 | return "CUDA_ERROR_ASSERT";
154 |
155 | case CUDA_ERROR_TOO_MANY_PEERS:
156 | return "CUDA_ERROR_TOO_MANY_PEERS";
157 |
158 | case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
159 | return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
160 |
161 | case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
162 | return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
163 |
164 | case CUDA_ERROR_HARDWARE_STACK_ERROR:
165 | return "CUDA_ERROR_HARDWARE_STACK_ERROR";
166 |
167 | case CUDA_ERROR_ILLEGAL_INSTRUCTION:
168 | return "CUDA_ERROR_ILLEGAL_INSTRUCTION";
169 |
170 | case CUDA_ERROR_MISALIGNED_ADDRESS:
171 | return "CUDA_ERROR_MISALIGNED_ADDRESS";
172 |
173 | case CUDA_ERROR_INVALID_ADDRESS_SPACE:
174 | return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
175 |
176 | case CUDA_ERROR_INVALID_PC:
177 | return "CUDA_ERROR_INVALID_PC";
178 |
179 | case CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE:
180 | return "CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE";
181 |
182 | case CUDA_ERROR_NOT_PERMITTED:
183 | return "CUDA_ERROR_NOT_PERMITTED";
184 |
185 | case CUDA_ERROR_NOT_SUPPORTED:
186 | return "CUDA_ERROR_NOT_SUPPORTED";
187 |
188 | case CUDA_ERROR_UNKNOWN:
189 | return "CUDA_ERROR_UNKNOWN";
190 | }
191 |
192 | return "";
193 | }
194 |
195 | static const char *cudaErrorToString(cudaError_t error)
196 | {
197 | switch (error)
198 | {
199 | case cudaSuccess:
200 | return "cudaSuccess";
201 |
202 | case cudaErrorMissingConfiguration:
203 | return "cudaErrorMissingConfiguration";
204 |
205 | case cudaErrorMemoryAllocation:
206 | return "cudaErrorMemoryAllocation";
207 |
208 | case cudaErrorInitializationError:
209 | return "cudaErrorInitializationError";
210 |
211 | case cudaErrorLaunchFailure:
212 | return "cudaErrorLaunchFailure";
213 |
214 | case cudaErrorPriorLaunchFailure:
215 | return "cudaErrorPriorLaunchFailure";
216 |
217 | case cudaErrorLaunchTimeout:
218 | return "cudaErrorLaunchTimeout";
219 |
220 | case cudaErrorLaunchOutOfResources:
221 | return "cudaErrorLaunchOutOfResources";
222 |
223 | case cudaErrorInvalidDeviceFunction:
224 | return "cudaErrorInvalidDeviceFunction";
225 |
226 | case cudaErrorInvalidConfiguration:
227 | return "cudaErrorInvalidConfiguration";
228 |
229 | case cudaErrorInvalidDevice:
230 | return "cudaErrorInvalidDevice";
231 |
232 | case cudaErrorInvalidValue:
233 | return "cudaErrorInvalidValue";
234 |
235 | case cudaErrorInvalidPitchValue:
236 | return "cudaErrorInvalidPitchValue";
237 |
238 | case cudaErrorInvalidSymbol:
239 | return "cudaErrorInvalidSymbol";
240 |
241 | case cudaErrorMapBufferObjectFailed:
242 | return "cudaErrorMapBufferObjectFailed";
243 |
244 | case cudaErrorUnmapBufferObjectFailed:
245 | return "cudaErrorUnmapBufferObjectFailed";
246 |
247 | case cudaErrorInvalidHostPointer:
248 | return "cudaErrorInvalidHostPointer";
249 |
250 | case cudaErrorInvalidDevicePointer:
251 | return "cudaErrorInvalidDevicePointer";
252 |
253 | case cudaErrorInvalidTexture:
254 | return "cudaErrorInvalidTexture";
255 |
256 | case cudaErrorInvalidTextureBinding:
257 | return "cudaErrorInvalidTextureBinding";
258 |
259 | case cudaErrorInvalidChannelDescriptor:
260 | return "cudaErrorInvalidChannelDescriptor";
261 |
262 | case cudaErrorInvalidMemcpyDirection:
263 | return "cudaErrorInvalidMemcpyDirection";
264 |
265 | case cudaErrorAddressOfConstant:
266 | return "cudaErrorAddressOfConstant";
267 |
268 | case cudaErrorTextureFetchFailed:
269 | return "cudaErrorTextureFetchFailed";
270 |
271 | case cudaErrorTextureNotBound:
272 | return "cudaErrorTextureNotBound";
273 |
274 | case cudaErrorSynchronizationError:
275 | return "cudaErrorSynchronizationError";
276 |
277 | case cudaErrorInvalidFilterSetting:
278 | return "cudaErrorInvalidFilterSetting";
279 |
280 | case cudaErrorInvalidNormSetting:
281 | return "cudaErrorInvalidNormSetting";
282 |
283 | case cudaErrorMixedDeviceExecution:
284 | return "cudaErrorMixedDeviceExecution";
285 |
286 | case cudaErrorCudartUnloading:
287 | return "cudaErrorCudartUnloading";
288 |
289 | case cudaErrorUnknown:
290 | return "cudaErrorUnknown";
291 |
292 | case cudaErrorNotYetImplemented:
293 | return "cudaErrorNotYetImplemented";
294 |
295 | case cudaErrorMemoryValueTooLarge:
296 | return "cudaErrorMemoryValueTooLarge";
297 |
298 | case cudaErrorInvalidResourceHandle:
299 | return "cudaErrorInvalidResourceHandle";
300 |
301 | case cudaErrorNotReady:
302 | return "cudaErrorNotReady";
303 |
304 | case cudaErrorInsufficientDriver:
305 | return "cudaErrorInsufficientDriver";
306 |
307 | case cudaErrorSetOnActiveProcess:
308 | return "cudaErrorSetOnActiveProcess";
309 |
310 | case cudaErrorInvalidSurface:
311 | return "cudaErrorInvalidSurface";
312 |
313 | case cudaErrorNoDevice:
314 | return "cudaErrorNoDevice";
315 |
316 | case cudaErrorECCUncorrectable:
317 | return "cudaErrorECCUncorrectable";
318 |
319 | case cudaErrorSharedObjectSymbolNotFound:
320 | return "cudaErrorSharedObjectSymbolNotFound";
321 |
322 | case cudaErrorSharedObjectInitFailed:
323 | return "cudaErrorSharedObjectInitFailed";
324 |
325 | case cudaErrorUnsupportedLimit:
326 | return "cudaErrorUnsupportedLimit";
327 |
328 | case cudaErrorDuplicateVariableName:
329 | return "cudaErrorDuplicateVariableName";
330 |
331 | case cudaErrorDuplicateTextureName:
332 | return "cudaErrorDuplicateTextureName";
333 |
334 | case cudaErrorDuplicateSurfaceName:
335 | return "cudaErrorDuplicateSurfaceName";
336 |
337 | case cudaErrorDevicesUnavailable:
338 | return "cudaErrorDevicesUnavailable";
339 |
340 | case cudaErrorInvalidKernelImage:
341 | return "cudaErrorInvalidKernelImage";
342 |
343 | case cudaErrorNoKernelImageForDevice:
344 | return "cudaErrorNoKernelImageForDevice";
345 |
346 | case cudaErrorIncompatibleDriverContext:
347 | return "cudaErrorIncompatibleDriverContext";
348 |
349 | case cudaErrorPeerAccessAlreadyEnabled:
350 | return "cudaErrorPeerAccessAlreadyEnabled";
351 |
352 | case cudaErrorPeerAccessNotEnabled:
353 | return "cudaErrorPeerAccessNotEnabled";
354 |
355 | case cudaErrorDeviceAlreadyInUse:
356 | return "cudaErrorDeviceAlreadyInUse";
357 |
358 | case cudaErrorProfilerDisabled:
359 | return "cudaErrorProfilerDisabled";
360 |
361 | case cudaErrorProfilerNotInitialized:
362 | return "cudaErrorProfilerNotInitialized";
363 |
364 | case cudaErrorProfilerAlreadyStarted:
365 | return "cudaErrorProfilerAlreadyStarted";
366 |
367 | case cudaErrorProfilerAlreadyStopped:
368 | return "cudaErrorProfilerAlreadyStopped";
369 |
370 | /* Since CUDA 4.0*/
371 | case cudaErrorAssert:
372 | return "cudaErrorAssert";
373 |
374 | case cudaErrorTooManyPeers:
375 | return "cudaErrorTooManyPeers";
376 |
377 | case cudaErrorHostMemoryAlreadyRegistered:
378 | return "cudaErrorHostMemoryAlreadyRegistered";
379 |
380 | case cudaErrorHostMemoryNotRegistered:
381 | return "cudaErrorHostMemoryNotRegistered";
382 |
383 | /* Since CUDA 5.0 */
384 | case cudaErrorOperatingSystem:
385 | return "cudaErrorOperatingSystem";
386 |
387 | case cudaErrorPeerAccessUnsupported:
388 | return "cudaErrorPeerAccessUnsupported";
389 |
390 | case cudaErrorLaunchMaxDepthExceeded:
391 | return "cudaErrorLaunchMaxDepthExceeded";
392 |
393 | case cudaErrorLaunchFileScopedTex:
394 | return "cudaErrorLaunchFileScopedTex";
395 |
396 | case cudaErrorLaunchFileScopedSurf:
397 | return "cudaErrorLaunchFileScopedSurf";
398 |
399 | case cudaErrorSyncDepthExceeded:
400 | return "cudaErrorSyncDepthExceeded";
401 |
402 | case cudaErrorLaunchPendingCountExceeded:
403 | return "cudaErrorLaunchPendingCountExceeded";
404 |
405 | case cudaErrorNotPermitted:
406 | return "cudaErrorNotPermitted";
407 |
408 | case cudaErrorNotSupported:
409 | return "cudaErrorNotSupported";
410 |
411 | /* Since CUDA 6.0 */
412 | case cudaErrorHardwareStackError:
413 | return "cudaErrorHardwareStackError";
414 |
415 | case cudaErrorIllegalInstruction:
416 | return "cudaErrorIllegalInstruction";
417 |
418 | case cudaErrorMisalignedAddress:
419 | return "cudaErrorMisalignedAddress";
420 |
421 | case cudaErrorInvalidAddressSpace:
422 | return "cudaErrorInvalidAddressSpace";
423 |
424 | case cudaErrorInvalidPc:
425 | return "cudaErrorInvalidPc";
426 |
427 | case cudaErrorIllegalAddress:
428 | return "cudaErrorIllegalAddress";
429 |
430 | /* Since CUDA 6.5*/
431 | case cudaErrorInvalidPtx:
432 | return "cudaErrorInvalidPtx";
433 |
434 | case cudaErrorInvalidGraphicsContext:
435 | return "cudaErrorInvalidGraphicsContext";
436 |
437 | case cudaErrorStartupFailure:
438 | return "cudaErrorStartupFailure";
439 |
440 | case cudaErrorApiFailureBase:
441 | return "cudaErrorApiFailureBase";
442 |
443 | /* Since CUDA 8.0*/
444 | case cudaErrorNvlinkUncorrectable :
445 | return "cudaErrorNvlinkUncorrectable";
446 |
447 | /* Since CUDA 8.5*/
448 | case cudaErrorJitCompilerNotFound :
449 | return "cudaErrorJitCompilerNotFound";
450 |
451 | /* Since CUDA 9.0*/
452 | case cudaErrorCooperativeLaunchTooLarge :
453 | return "cudaErrorCooperativeLaunchTooLarge";
454 |
455 | }
456 |
457 | return "";
458 | }
459 |
460 | template< typename T >
461 | void _cudaCheckError(T result, char const *const func, const char *const file, int const line)
462 | {
463 | if (result)
464 | {
465 | fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
466 | file, line, static_cast(result), cudaErrorToString(result), func);
467 | CUcontext ctx;
468 | cuCtxGetCurrent(&ctx);
469 | cuCtxDestroy(ctx);
470 | exit(EXIT_FAILURE);
471 | }
472 | }
473 | #define cudaCheckError(val) _cudaCheckError ( (val), #val, __FILE__, __LINE__ )
474 |
--------------------------------------------------------------------------------
/test/util/init_cuda.h:
--------------------------------------------------------------------------------
1 | #pragma once
#include <cuda.h>
#include <stdio.h>
#include <assert.h>
5 | #include "cuda_error.h"
6 | void init_cuda()
7 | {
8 | // disable CUDA from caching SASS programs
9 | setenv("CUDA_CACHE_DISABLE", "1", 1);
10 |
11 | CUcontext context;
12 | CUdevice device;
13 | cudaCheckError(cuInit(0));
14 | cudaCheckError(cuDeviceGet(&device, 0));
15 | cudaCheckError(cuCtxCreate(&context, 0, device));
16 |
17 | char name[256];
18 | int major = 0, minor = 0;
19 | int compute_mode = -1;
20 | cudaCheckError(cuDeviceGetName(name, 100, device));
21 | cudaCheckError(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
22 | cudaCheckError(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
23 | cudaCheckError(cuDeviceGetAttribute(&compute_mode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, device));
24 | assert(compute_mode != CU_COMPUTEMODE_PROHIBITED && "Device is running in Compute Mode Prohibited");
25 | printf("Using CUDA device %s: Compute SM %d.%d\n", name, major, minor);
26 | }
27 |
--------------------------------------------------------------------------------
/test/util/profiler.h:
--------------------------------------------------------------------------------
1 | #pragma once
#include <stdio.h>
#include <string.h>
4 |
5 | #ifdef _WIN32
6 | #ifndef WIN32_LEAN_AND_MEAN
7 | #define WIN32_LEAN_AND_MEAN
8 | #endif
#include <windows.h>
10 |
// High-resolution tick source (Windows): QueryPerformanceCounter.
LARGE_INTEGER perf_get_tick()
{
    LARGE_INTEGER result;
    QueryPerformanceCounter(&result);
    return result;
}

// Seconds between two ticks (end - begin), scaled by the counter frequency.
// Queries the frequency on every call; it is constant after boot.
float perf_seconds_elapsed(LARGE_INTEGER begin, LARGE_INTEGER end)
{
    LARGE_INTEGER frequency;
    QueryPerformanceFrequency(&frequency);
    return (float)(end.QuadPart - begin.QuadPart) /
           (float)frequency.QuadPart;
}

// Per-label timing record (Windows variant; layout mirrors the POSIX one).
struct perf_TimingInfo
{
    const char *label;    // identity; compared by strcmp in TIMING()
    LARGE_INTEGER begin;  // tick at the opening TIMING() call
    LARGE_INTEGER end;    // tick at the closing TIMING() call
    bool counting;        // true while between an open and a close
    float t_sum;          // accumulated seconds over all completed intervals
    float t_last;         // seconds of the most recent interval
    int hits;             // number of completed intervals
};
36 |
37 | #else // ifdef _WIN32
#include <time.h>
39 |
// High-resolution tick source (POSIX): CLOCK_REALTIME via clock_gettime.
timespec perf_get_tick()
{
    timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
    return now;
}
46 |
// Seconds between two ticks from perf_get_tick (end - begin).
// Accumulates in double to keep nanosecond resolution before
// narrowing to float.
float perf_seconds_elapsed(timespec begin, timespec end)
{
    double whole = (double)(end.tv_sec - begin.tv_sec);
    double frac  = (double)(end.tv_nsec - begin.tv_nsec) / 1000000000.0;
    return (float)(whole + frac);
}
54 |
// Per-label timing record (POSIX variant; layout mirrors the Windows one).
struct perf_TimingInfo
{
    const char *label;  // identity; compared by strcmp in TIMING()
    timespec begin;     // tick at the opening TIMING() call
    timespec end;       // tick at the closing TIMING() call
    bool counting;      // true while between an open and a close
    float t_sum;        // accumulated seconds over all completed intervals
    float t_last;       // seconds of the most recent interval
    int hits;           // number of completed intervals
};
65 |
66 | #endif
67 |
68 | #ifdef ENABLE_TIMING
69 | static perf_TimingInfo perf_timing_blocks[1024];
70 | static int perf_count = 0;
71 |
72 | void TIMING(const char *label)
73 | {
74 | perf_TimingInfo *block = 0;
75 | for (int i = 0; i < perf_count; i++)
76 | {
77 | if (strcmp(label, perf_timing_blocks[i].label) == 0)
78 | {
79 | block = &perf_timing_blocks[i];
80 | break;
81 | }
82 | }
83 | if (!block)
84 | {
85 | block = &perf_timing_blocks[perf_count];
86 | perf_count++;
87 | block->hits = 0;
88 | block->t_sum = 0.0f;
89 | block->t_last = 0.0f;
90 | block->label = label;
91 | }
92 | if (block->counting)
93 | {
94 | block->hits++;
95 | block->end = perf_get_tick();
96 | float elapsed = perf_seconds_elapsed(block->begin, block->end);
97 | block->t_sum += elapsed;
98 | block->t_last = elapsed;
99 | block->counting = false;
100 | }
101 | else
102 | {
103 | block->counting = true;
104 | block->begin = perf_get_tick();
105 | }
106 | }
107 |
108 | void TIMING_CLEAR() { perf_count = 0; }
109 |
110 | void TIMING_SUMMARY()
111 | {
112 | printf("AVG \tLAST \tHITS\tNAME\n");
113 | for (int i = 0; i < perf_count; i++)
114 | {
115 | perf_TimingInfo block = perf_timing_blocks[i];
116 | int hits = block.hits;
117 | float avg = 1000.0f * block.t_sum / block.hits;
118 | float last = 1000.0f * block.t_last;
119 | printf("%.2f\t%.2f\t%04d\t%s\n", avg, last, hits, block.label);
120 | }
121 | }
122 |
123 | float TIMING_GET_AVG(const char *label)
124 | {
125 | perf_TimingInfo *block = 0;
126 | for (int i = 0; i < perf_count; i++)
127 | {
128 | if (strcmp(label, perf_timing_blocks[i].label) == 0)
129 | {
130 | block = &perf_timing_blocks[i];
131 | break;
132 | }
133 | }
134 | if (!block)
135 | return -1.0f;
136 | return block->t_sum / block->hits;
137 | }
138 |
139 | #else
// No-op stand-ins used when ENABLE_TIMING is not defined.
void TIMING(const char *label) { (void)label; }
void TIMING_CLEAR() { }
void TIMING_SUMMARY() { }
// Fixed: must match the float return type of the real implementation;
// the previous 'void' stub broke any caller that uses the returned
// average. -1.0f mirrors the real version's "not found" result.
float TIMING_GET_AVG(const char *label) { (void)label; return -1.0f; }
144 | #endif
145 |
--------------------------------------------------------------------------------
/test/util/test_models.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "sdf_builder.h"
3 |
// Minimal test models: single primitives (01-04), a transformed primitive
// (05-08), and a blend of two primitives (09).
sdf_node_t *model_simple01() { return sdf_box(1.0f, 0.5f, 0.25f); }
sdf_node_t *model_simple02() { return sdf_cylinder(1.0f, 0.5f); }
sdf_node_t *model_simple03() { return sdf_sphere(0.98f); }
sdf_node_t *model_simple04() { return sdf_plane(0.98f); }
sdf_node_t *model_simple05() { return sdf_rotate(sdf_translate(sdf_box(0.98f, 0.63f, 0.33f), 0.1f,-0.2f,0.3f), 0.1f,0.2f,-0.3f); }
sdf_node_t *model_simple06() { return sdf_rotate(sdf_translate(sdf_sphere(0.98f), 0.1f,-0.2f,0.3f), 0.1f,0.2f,-0.3f); }
sdf_node_t *model_simple07() { return sdf_rotate(sdf_translate(sdf_cylinder(0.98f, 0.63f), 0.1f,-0.2f,0.3f), 0.1f,0.2f,-0.3f); }
sdf_node_t *model_simple08() { return sdf_rotate(sdf_translate(sdf_plane(0.98f), 0.1f,-0.2f,0.3f), 0.1f,0.2f,-0.3f); }
sdf_node_t *model_simple09() { return sdf_blend(0.4f, sdf_sphere(1.0f), sdf_cylinder(0.3f,1.0f)); }
// Union of a rotated/translated box and a rotated/translated sphere.
// NOTE(review): the return values of sdf_rotate/sdf_translate are
// discarded here, which only works if they transform their argument
// node in place — the other models use the returned node. Confirm
// against the builder header.
sdf_node_t *model_simple10() {
    sdf_node_t *d1 = sdf_box(0.98f, 0.63f, 0.33f);
    sdf_rotate(d1, -0.3f, 0.2f, -0.1f);
    sdf_translate(d1, 0.3f, -0.5f, 0.3f);
    sdf_node_t *d2 = sdf_sphere(0.63f);
    sdf_rotate(d2, 0.7f, 0.8f, -0.3f);
    sdf_translate(d2, -0.6f, +0.5f, 0.2f);
    sdf_node_t *d = sdf_union(d1, d2);
    return d;
}
// Simple binary CSG models: subtraction, union and intersection of
// two primitives.
sdf_node_t *model_simple11() { return sdf_subtract(sdf_box(1.0f,1.0f,1.0f), sdf_translate(sdf_sphere(0.5f), 0,1.0f,0)); }
sdf_node_t *model_simple12() { return sdf_subtract(sdf_rotate(sdf_box(1.0f,1.0f,1.0f), 0.77f,0.77f,0), sdf_sphere(0.5f)); }
sdf_node_t *model_simple13() { return sdf_subtract(sdf_box(1.0f,1.0f,1.0f), sdf_cylinder(0.5f,2.0f)); }
sdf_node_t *model_simple14() { return sdf_union(sdf_box(0.5f,0.5f,0.5f), sdf_translate(sdf_sphere(0.25f),0.5f,0,0)); }
sdf_node_t *model_simple15() { return sdf_intersect(sdf_box(0.5f,0.5f,0.5f), sdf_translate(sdf_sphere(0.25f),0.5f,0,0)); }
sdf_node_t *model_simple16() { return sdf_subtract(sdf_box(0.5f,0.5f,0.5f), sdf_translate(sdf_sphere(0.25f),0.5f,0,0)); }
29 |
30 | sdf_node_t *model_complex_2d_1()
31 | {
32 | auto *d1 = sdf_translate(sdf_box(0.9f, 0.1f, 0.5f), 0.0f, 0.5f, 0.0f);
33 | auto *d2 = sdf_translate(sdf_box(0.8f, 0.05f, 0.5f), 0.0f, -0.5f, 0.0f);
34 | auto *d3 = sdf_sphere(0.5f);
35 | auto *d4 = sdf_box(1.0f, 0.2f, 0.5f);
36 | return sdf_rotate(sdf_translate(sdf_union(sdf_union(d1, d2), sdf_subtract(d3, d4)), 0.1f, -0.2f, 0.0f), 0.0f, 0.0f, 0.2f);
37 | }
38 |
39 | sdf_node_t *model_complex02()
40 | {
41 | sdf_node_t *a1 = sdf_plane(0.3f);
42 | sdf_node_t *a2 = sdf_cylinder(0.2f, 0.3f);
43 | sdf_node_t *a3 = sdf_box(0.3f,0.3f,0.3f);
44 | sdf_node_t *a4 = sdf_sphere(0.5f);
45 | sdf_node_t *a5 = sdf_union(a1,a2);
46 | sdf_node_t *a6 = sdf_subtract(a3,a4);
47 | sdf_node_t *a7 = sdf_union(a5,a6);
48 | sdf_node_t *b1 = sdf_plane(0.3f);
49 | sdf_node_t *b2 = sdf_cylinder(0.2f, 0.3f);
50 | sdf_node_t *b3 = sdf_box(0.3f,0.3f,0.3f);
51 | sdf_node_t *b4 = sdf_sphere(0.5f);
52 | sdf_node_t *b5 = sdf_union(b1,b2);
53 | sdf_node_t *b6 = sdf_subtract(b3,b4);
54 | sdf_node_t *b7 = sdf_union(b5,b6);
55 | sdf_node_t *d = sdf_union(a7,b7);
56 | return d;
57 | }
58 |
// A classic CSG demo part at scale s: a sphere with three orthogonal
// cylindrical bores, clipped to a cube, plus a small sphere/cylinder
// cluster and a tilted cube attachment.
// NOTE(review): sdf_rotate/sdf_translate return values are discarded
// throughout — assumes in-place transformation; confirm against the
// builder header.
sdf_node_t *model_complex03()
{
    float s = 0.3f;
    sdf_node_t *d1 = sdf_sphere(1.0f*s);
    // Three bores along the three axes.
    sdf_node_t *c1 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(c1, 0,0,0);
    sdf_node_t *c2 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(c2, 1.54f,0,0);
    sdf_node_t *c3 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(c3, 0,0,1.54f);
    sdf_node_t *c12 = sdf_union(c1,c2);
    sdf_node_t *c123 = sdf_union(c12,c3);
    sdf_node_t *d2 = sdf_subtract(d1,c123);

    // Clip the bored sphere to a cube.
    sdf_node_t *b1 = sdf_box(0.74f*s,0.74f*s,0.74f*s);
    sdf_node_t *d3 = sdf_intersect(d2,b1);

    // Small sphere plus a two-cylinder cluster, tilted and shifted out.
    sdf_node_t *s2 = sdf_sphere(0.3f*s);
    sdf_node_t *c5 = sdf_cylinder(0.1f*s, 0.8f*s); sdf_rotate(c5, 1.54f,0,0);
    sdf_node_t *c6 = sdf_cylinder(0.1f*s, 0.8f*s); sdf_rotate(c6, 0,0,0);
    sdf_node_t *c56 = sdf_union(c5,c6); sdf_rotate(c56, 0.7f, 0.0f, 0.0f); sdf_translate(c56, 1.0f*s, 0.0f, 0.0f);
    sdf_node_t *s2c56 = sdf_union(s2,c56);
    sdf_node_t *d4 = sdf_union(d3, s2c56);

    // Tilted cube attachment on the opposite side.
    sdf_node_t *b2 = sdf_box(0.2f*s,0.2f*s,0.2f*s); sdf_translate(b2,-1.0f*s,0,0); sdf_rotate(b2,0.77f,0.77f,0);
    sdf_node_t *d5 = sdf_union(d4,b2);

    sdf_node_t *d = d5;
    return d;
}
86 |
// Larger variant of the bored-sphere part at scale s: three cylinders
// subtracted one at a time, clipped by six rotated planes (instead of
// a cube), with a small sphere and two thin cylinders unioned on.
// NOTE(review): sdf_rotate return values are discarded — assumes
// in-place transformation; confirm against the builder header.
sdf_node_t *model_complex04()
{
    float s = 1.5f;
    sdf_node_t *d1 = sdf_sphere(1.0f*s);
    // Three bores along the three axes.
    sdf_node_t *d2 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(d2, 0,0,0);
    sdf_node_t *d3 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(d3, 1.54f,0,0);
    sdf_node_t *d4 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(d4, 0,0,1.54f);
    sdf_node_t *d5 = sdf_subtract(d1,d2);
    sdf_node_t *d6 = sdf_subtract(d5,d3);
    sdf_node_t *d7 = sdf_subtract(d6,d4);
    // Six planes forming an (approximate) cube clip.
    sdf_node_t *d8 = sdf_plane(0.74f*s); sdf_rotate(d8, 0,0,1.54f);
    sdf_node_t *d9 = sdf_plane(0.74f*s); sdf_rotate(d9, 0,0,-1.54f);
    sdf_node_t *d10 = sdf_plane(0.74f*s); sdf_rotate(d10, 0,1.54f,0);
    sdf_node_t *d11 = sdf_plane(0.74f*s); sdf_rotate(d11, 0,-1.54f,0);
    sdf_node_t *d12 = sdf_plane(0.74f*s); sdf_rotate(d12, 0,0,0);
    sdf_node_t *d13 = sdf_plane(0.74f*s); sdf_rotate(d13, 0,0,3.14f);
    sdf_node_t *d14 = sdf_intersect(d7,d8);
    sdf_node_t *d15 = sdf_intersect(d14,d9);
    sdf_node_t *d16 = sdf_intersect(d15,d10);
    sdf_node_t *d17 = sdf_intersect(d16,d11);
    sdf_node_t *d18 = sdf_intersect(d17,d12);
    sdf_node_t *d19 = sdf_intersect(d18,d13);
    // Small sphere and two thin cylinders unioned onto the result.
    sdf_node_t *d20 = sdf_sphere(0.3f*s);
    sdf_node_t *d21 = sdf_union(d19, d20);
    sdf_node_t *d22 = sdf_cylinder(0.1f*s, 0.8f*s); sdf_rotate(d22, 1.54f,0,0);
    sdf_node_t *d23 = sdf_cylinder(0.1f*s, 0.8f*s); sdf_rotate(d23, 0,0,0);
    sdf_node_t *d24 = sdf_union(d21, d22);
    sdf_node_t *d25 = sdf_union(d24, d23);
    return d25;
}
117 |
118 | sdf_node_t *model_complex05()
119 | {
120 | float s = 1.5f;
121 | sdf_node_t *d1 = sdf_sphere(1.0f*s);
122 | sdf_node_t *d2 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(d2, 0,0,0);
123 | sdf_node_t *d3 = sdf_subtract(d1,d2);
124 | sdf_node_t *d4 = sdf_plane(0.44f*s); sdf_rotate(d4, 0,0,1.54f);
125 | sdf_node_t *d5 = sdf_plane(0.44f*s); sdf_rotate(d5, 0,0,-1.54f);
126 | sdf_node_t *d6 = sdf_intersect(d3,d4);
127 | sdf_node_t *d7 = sdf_intersect(d6,d5);
128 | return d7;
129 | }
130 |
// 2D chair silhouette: seat slab, two slightly splayed legs and a back
// post, assembled with unions. k scales the whole model.
sdf_node_t *model_chair1_2d()
{
    float k = 0.5f;
    sdf_node_t *seat = sdf_box(1.0f*k, 0.1f*k, 1.0f);
    sdf_node_t *leg1 = sdf_rotate(sdf_translate(sdf_box(0.1f*k, 1.0f*k, 1.0f), -1.0f*k,0,0), 0,0,-0.2f);
    sdf_node_t *leg2 = sdf_rotate(sdf_translate(sdf_box(0.1f*k, 1.0f*k, 1.0f), +1.0f*k,0,0), 0,0,+0.1f);
    sdf_node_t *legs = sdf_translate(sdf_union(leg1, leg2), 0,-1.0f*k,0);
    sdf_node_t *back = sdf_rotate(sdf_translate(sdf_box(0.1f*k, 1.0f*k, 1.0f), 1.0f*k,1.0f*k,0), 0,0,-0.1f);
    sdf_node_t *seat_and_legs = sdf_union(seat, legs);
    sdf_node_t *chair = sdf_union(seat_and_legs, back);
    return chair;
}
143 |
// Second 2D chair variant: tilted seat, two legs clipped against a
// rotated plane (flat floor cut), and a cross-bar between the legs.
sdf_node_t *model_chair2_2d()
{
    float k = 0.5f;
    sdf_node_t *seat = sdf_rotate(sdf_box(0.8f*k, 0.15f*k, 1.0f), 0,0,0.2f);
    sdf_node_t *leg1 = sdf_rotate(sdf_translate(sdf_box(0.1f*k, 1.0f*k, 1.0f), -0.75f*k,0,0), 0,0,-0.05f);
    sdf_node_t *leg2 = sdf_rotate(sdf_translate(sdf_box(0.1f*k, 1.0f*k, 1.0f), +0.8f*k,0.05f*k,0), 0,0,0.1f);
    sdf_node_t *mid = sdf_translate(sdf_box(0.8f*k, 0.05f*k, 1.0f), 0,-1.0f*k,0);
    // Intersect legs with a half-space so both end on the same floor line.
    sdf_node_t *legs = sdf_intersect(sdf_translate(sdf_union(leg1, leg2), 0,-1.0f*k,0),
                                     sdf_rotate(sdf_plane(1.9f*k), 0,0,-3.14f/2.0f));
    sdf_node_t *seat_and_legs = sdf_union(seat, legs);
    sdf_node_t *chair = sdf_union(seat_and_legs, mid);
    return chair;
}
157 |
158 | sdf_node_t *model_translated_sphere()
159 | {
160 | return
161 | sdf_translate(sdf_sphere(1.0f), -0.5f,0.0f,0.0f);
162 | }
163 |
164 | sdf_node_t *model_intersection()
165 | {
166 | return
167 | sdf_intersect(sdf_translate(sdf_sphere(0.5f), -0.2f,0.0f,0.0f),
168 | sdf_translate(sdf_sphere(0.5f), +0.2f,0.0f,0.0f));
169 | }
170 |
171 | sdf_node_t *model_two_spheres()
172 | {
173 | return
174 | sdf_union(sdf_translate(sdf_sphere(0.1f), -0.5f,0.0f,0.0f),
175 | sdf_translate(sdf_sphere(0.5f), +0.3f,0.0f,0.0f));
176 | }
177 |
178 | sdf_node_t *model_two_spheres_equal()
179 | {
180 | return
181 | sdf_union(sdf_translate(sdf_sphere(0.3f), -0.4f,0.0f,0.0f),
182 | sdf_translate(sdf_sphere(0.3f), +0.4f,0.0f,0.0f));
183 | }
184 |
185 | sdf_node_t *model_four_spheres()
186 | {
187 | return
188 | sdf_union(
189 | sdf_union(
190 | sdf_translate(sdf_sphere(0.2f), 0.0f,0.7f,0.0f),
191 | sdf_translate(sdf_sphere(0.2f), 0.0f,-0.7f,0.0f)),
192 | sdf_union(
193 | sdf_translate(sdf_sphere(0.4f), -0.5f,0.0f,0.0f),
194 | sdf_translate(sdf_sphere(0.4f), +0.5f,0.0f,0.0f)));
195 | }
196 |
197 | sdf_node_t *model_scissor()
198 | {
199 | return
200 | sdf_union(
201 | sdf_translate(sdf_sphere(0.4f), 0.0f,0.6f,0.0f),
202 | sdf_intersect(
203 | sdf_translate(sdf_sphere(0.8f), -0.5f,0.0f,0.0f),
204 | sdf_translate(sdf_sphere(0.8f), +0.5f,0.0f,0.0f)));
205 | }
206 |
207 | sdf_node_t *model_fillet()
208 | {
209 | return
210 | sdf_union
211 | (
212 | sdf_translate(sdf_sphere(0.25f), 0.25f,0.25f,0.0f),
213 | sdf_intersect
214 | (
215 | sdf_rotate(sdf_plane(0.53f), 0.0f,0.0f,3.1415f/4.0f),
216 | sdf_box(0.5f, 0.5f, 0.5f)
217 | )
218 | );
219 | }
220 |
221 | sdf_node_t *model_two_box()
222 | {
223 | return
224 | sdf_union
225 | (
226 | sdf_translate(sdf_box(0.55f,0.05f,1.0f), 0.25f,0.5f,0.0f),
227 | sdf_translate(sdf_box(0.05f,0.55f,1.0f), -0.25f,0.0f,0.0f)
228 | );
229 | }
230 |
231 | sdf_node_t *model_two_box_unequal()
232 | {
233 | return
234 | sdf_union
235 | (
236 | sdf_translate(sdf_box(0.35f,0.05f,1.0f), 0.15f,0.5f,0.0f),
237 | sdf_translate(sdf_box(0.05f,0.55f,1.0f), -0.25f,0.0f,0.0f)
238 | );
239 | }
240 |
241 | sdf_node_t *model_offset_box()
242 | {
243 | return sdf_rotate(sdf_translate(sdf_box(0.5f,0.5f,0.5f), 0.2f, -0.2f, 0.0f), 0.0f, 0.0f, -0.5f);
244 | }
245 |
// Two poses of the same scene (a two-box cluster plus a sphere) for
// motion tests; 'which' selects the pose. The branches differ only in
// the cluster's final placement: pose 0 translates it, pose 1 also
// applies an extra rotation.
sdf_node_t *model_motion0(int which)
{
    if (which == 0) {
        auto *d1 = sdf_box(0.3f, 0.3f, 0.3f);
        auto *d2 = sdf_box(0.2f, 0.2f, 0.2f);
        d2 = sdf_rotate(sdf_translate(d2, +0.3f, +0.2f, 0.0f), 0.0f, 0.0f, 0.3f);
        auto *d5 = sdf_union(d1, d2);
        d5 = sdf_rotate(d5, 0.0f, 0.0f, 0.2f);
        d5 = sdf_translate(d5, 0.45f, -0.5f, 0.0f);
        auto *d6 = sdf_sphere(0.3f);
        d6 = sdf_translate(d6, -0.4f, +0.2f, 0.0);
        return sdf_union(d5, d6);
    } else {
        auto *d1 = sdf_box(0.3f, 0.3f, 0.3f);
        auto *d2 = sdf_box(0.2f, 0.2f, 0.2f);
        d2 = sdf_rotate(sdf_translate(d2, +0.3f, +0.2f, 0.0f), 0.0f, 0.0f, 0.3f);
        auto *d5 = sdf_union(d1, d2);
        d5 = sdf_rotate(d5, 0.0f, 0.0f, 0.2f);
        d5 = sdf_rotate(sdf_translate(d5, 0.45f, -0.1f, 0.0f), 0.0f, 0.0f, -0.3f);
        auto *d6 = sdf_sphere(0.3f);
        d6 = sdf_translate(d6, -0.4f, +0.2f, 0.0);
        return sdf_union(d5, d6);
    }
}
270 |
// Two poses of a three-box cluster plus a sphere for motion tests;
// 'which' selects the pose. The branches differ only in the cluster's
// final translation.
sdf_node_t *model_motion1(int which)
{
    if (which == 0) {
        auto *d1 = sdf_box(0.3f, 0.3f, 0.3f);
        auto *d2 = sdf_box(0.2f, 0.2f, 0.2f);
        auto *d3 = sdf_box(0.2f, 0.2f, 0.2f);
        d2 = sdf_rotate(sdf_translate(d2, +0.3f, +0.2f, 0.0f), 0.0f, 0.0f, 0.3f);
        d3 = sdf_rotate(sdf_translate(d3, -0.3f, -0.2f, 0.0f), 0.0f, 0.0f, 0.7f);
        auto *d4 = sdf_union(d2, d3);
        auto *d5 = sdf_union(d1, d4);
        d5 = sdf_rotate(d5, 0.0f, 0.0f, 0.2f);
        d5 = sdf_translate(d5, 0.4f, -0.2f, 0.0f);
        auto *d6 = sdf_sphere(0.3f);
        d6 = sdf_translate(d6, -0.3f, +0.2f, 0.0);
        return sdf_union(d5, d6);
    } else {
        auto *d1 = sdf_box(0.3f, 0.3f, 0.3f);
        auto *d2 = sdf_box(0.2f, 0.2f, 0.2f);
        auto *d3 = sdf_box(0.2f, 0.2f, 0.2f);
        d2 = sdf_rotate(sdf_translate(d2, +0.3f, +0.2f, 0.0f), 0.0f, 0.0f, 0.3f);
        d3 = sdf_rotate(sdf_translate(d3, -0.3f, -0.2f, 0.0f), 0.0f, 0.0f, 0.7f);
        auto *d4 = sdf_union(d2, d3);
        auto *d5 = sdf_union(d1, d4);
        d5 = sdf_rotate(d5, 0.0f, 0.0f, 0.2f);
        d5 = sdf_translate(d5, 0.45f, -0.1f, 0.0f);
        auto *d6 = sdf_sphere(0.3f);
        d6 = sdf_translate(d6, -0.3f, +0.2f, 0.0);
        return sdf_union(d5, d6);
    }
}
301 |
--------------------------------------------------------------------------------