├── LICENSE.txt
├── README.md
├── doc
└── overview-small.svg
├── src
├── backend_glsl.h
├── backend_ptx.h
├── backend_sass.h
├── frep.h
├── frep_builder.h
├── frep_eval.h
├── frep_serialize.h
└── sass_6_x
│ ├── backend.h
│ ├── blocks.h
│ ├── bytecode.h
│ ├── cubin.h
│ ├── instruction.h
│ ├── registers.h
│ ├── scheduler.h
│ └── simulator.h
└── test
├── backend_glsl.cpp
├── backend_ptx.cpp
├── backend_sass_6_x.cpp
├── backend_sass_6_x_mock.cpp
├── linker.cpp
├── test1.cubin
├── test1.ptx
├── test2.cubin
├── test2.ptx
└── util
├── cuda_error.h
├── init_cuda.h
├── profiler.h
└── test_models.h
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015-2019 Simen Haugo
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # fast-csg
2 |
3 | 
4 |
5 | A compiler for functional representations (see e.g. OpenSCAD, libfive, Hyperfun) that directly generates executable bytecode instructions for GPU architectures.
6 |
7 | It gives you the benefit of fast tree evaluation as the tree structure is compiled into optimized machine code instructions (which makes the program compute-limited, not memory bandwidth-limited), while avoiding the long compile times that you would get by compiling to an intermediate target, such as GLSL, PTX, CUDA or NVVM IR.
8 |
9 | Unlike NVIDIA's closed-source compiler chain, we focus on compilation speed, aiming for sub-millisecond time from compilation start to having the kernel uploaded to the GPU and ready to run.
10 |
11 | Possible applications:
12 |
13 | * Fast and parallelized hypothesis generation and testing, for e.g. program synthesis or 3D reconstruction.
14 | * GPU-accelerated visualization where you can live edit the CSG tree structure with instant feedback
15 |
16 | ## Project status
17 |
18 | This project is currently in limbo. I'm open-sourcing it in the event that anyone finds some parts of it useful. To that end, here's a list of stuff that's in here:
19 |
20 | * CSG tree grammar and interpreter (see [src/frep.h](src/frep.h))
21 | * Complete CSG->GLSL compiler (see [src/backend_glsl.h](src/backend_glsl.h))
22 | * Complete CSG->PTX compiler (see [src/backend_ptx.h](src/backend_ptx.h))
23 | * Partial CSG->SASS 6.x compiler (missing Cubin linking stage)
24 |
Naturally, implementing a custom SASS compiler is difficult, as NVIDIA does not publicly document the ISA and their PTX compiler is closed-source. With the help of Scott Gray's MaxAs (a reverse engineering of the Maxwell SASS), I was able to implement a rudimentary compiler for compute capability 6.x devices (Maxwell, Pascal families). Although the succeeding families Volta and Turing have not made huge changes to the ISA, it's a tedious task to implement backends for all of them.
26 |
27 | Nonetheless, you can find
28 | * Scheduler and register allocation ([src/sass_6_x/backend.h](src/sass_6_x/backend.h))
29 | * Bytecode generation ([src/sass_6_x/bytecode.h](src/sass_6_x/bytecode.h))
30 |
--------------------------------------------------------------------------------
/doc/overview-small.svg:
--------------------------------------------------------------------------------
1 |
2 |
68 |
--------------------------------------------------------------------------------
/src/backend_glsl.h:
--------------------------------------------------------------------------------
1 | // Developed by Simen Haugo.
2 | // See LICENSE.txt for copyright and licensing details (standard MIT License).
3 |
4 | // This is the code generation backend for GLSL (GL Shading Language).
5 | // The output is a stripped GLSL source code, meaning you must insert
6 | // it into a GLSL shader as necessary for your application.
7 |
8 | #pragma once
9 | #include "frep.h"
#include <stdio.h>
11 |
12 | // Generates a null-terminated string of GLSL code that computes
13 | //
14 | // Variables are expected to be defined:
15 | // vec3 p0;
16 | //
17 | // Output is stored in:
18 | // float d1 = f(p0.x, p0.y, p0.z);
19 | //
20 | // The following functions must be declared and linked into the GLSL:
21 | // float fBox(vec3 p, vec3 dim);
22 | // float fBoxCheap(vec3 p, vec3 dim);
23 | // float fCylinder(vec3 p, float r, float h);
// float fSphere(vec3 p, float r);
25 | //
26 | char *frep_compile_to_glsl(frep_t *f);
27 |
28 | //////////////////////////////////////////////////////////////////
29 | // Implementation
30 | //////////////////////////////////////////////////////////////////
31 |
32 | namespace backend_glsl {
33 |
// Code-generation state threaded through the recursive GLSL emitter.
struct glsl_t
{
    int destination;  // next unused index for naming emitted p%d / d%d variables
    char *stream;     // write cursor into the output buffer (advanced past each sprintf)
};
39 |
40 | int _frep_compile_to_glsl(frep_t *node,
41 | glsl_t &s,
42 | frep_mat3_t R_root_to_parent=frep_identity_3x3,
43 | frep_vec3_t T_parent_rel_root=frep_null_3x1)
44 | {
45 | assert(node);
46 |
47 | frep_mat3_t R_root_to_this;
48 | frep_vec3_t T_this_rel_root;
49 | frep_get_global_transform(node, &R_root_to_this, &T_this_rel_root, R_root_to_parent, T_parent_rel_root);
50 |
51 | int my_index = s.destination++;
52 |
53 | // p^this = R_root_to_this*(p^0 - T_this_rel_root)
54 | // = R_root_to_this*p^0 + (-R_root_to_this*T_this_rel_root)
55 | {
56 | #define R(row,col) R_root_to_this.at(row,col)
57 | #define T(i) T_this_rel_root[i]
58 | float dtx = -(R(0,0)*T(0) + R(0,1)*T(1) + R(0,2)*T(2));
59 | float dty = -(R(1,0)*T(0) + R(1,1)*T(1) + R(1,2)*T(2));
60 | float dtz = -(R(2,0)*T(0) + R(2,1)*T(1) + R(2,2)*T(2));
61 | s.stream += sprintf(s.stream,
62 | "vec3 p%d = "
63 | "vec3(%f,%f,%f)*p0.x + "
64 | "vec3(%f,%f,%f)*p0.y + "
65 | "vec3(%f,%f,%f)*p0.z + "
66 | "vec3(%f,%f,%f);\n",
67 | my_index,
68 | R(0,0), R(1,0), R(2,0),
69 | R(0,1), R(1,1), R(2,1),
70 | R(0,2), R(1,2), R(2,2),
71 | dtx, dty, dtz
72 | );
73 | #undef R
74 | #undef T
75 | }
76 |
77 | if (frep_is_boolean(node))
78 | {
79 | assert(node->left);
80 | assert(node->right);
81 |
82 | int i_left = _frep_compile_to_glsl(node->left, s, R_root_to_this, T_this_rel_root);
83 | int i_right = _frep_compile_to_glsl(node->right, s, R_root_to_this, T_this_rel_root);
84 |
85 | s.stream += sprintf(s.stream, "float d%d = ", my_index);
86 |
87 | switch (node->opcode)
88 | {
89 | case FREP_UNION: s.stream += sprintf(s.stream, "min(d%d,d%d);\n", i_left, i_right); break;
90 | case FREP_INTERSECT: s.stream += sprintf(s.stream, "max(d%d,d%d);\n", i_left, i_right); break;
91 | case FREP_SUBTRACT: s.stream += sprintf(s.stream, "max(d%d,-d%d);\n", i_left, i_right); break;
92 | case FREP_BLEND: s.stream += sprintf(s.stream, "%f*d%d + %f*d%d;\n", node->blend.alpha, i_left, 1.0f-node->blend.alpha, i_right); break;
93 | default: assert(false && "Unexpected opcode");
94 | }
95 | }
96 | else if (frep_is_primitive(node))
97 | {
98 | s.stream += sprintf(s.stream, "float d%d = ", my_index);
99 |
100 | switch (node->opcode)
101 | {
102 | case FREP_BOX: s.stream += sprintf(s.stream, "fBox(p%d, vec3(%f, %f, %f));\n", my_index, node->box.width, node->box.height, node->box.depth); break;
103 | case FREP_BOX_CHEAP: s.stream += sprintf(s.stream, "fBoxCheap(p%d, vec3(%f, %f, %f));\n", my_index, node->box.width, node->box.height, node->box.depth); break;
104 | case FREP_SPHERE: s.stream += sprintf(s.stream, "fSphere(p%d, %f);\n", my_index, node->sphere.radius); break;
105 | case FREP_CYLINDER: s.stream += sprintf(s.stream, "fCylinder(p%d, %f, %f);\n", my_index, node->cylinder.radius, node->cylinder.height); break;
106 | case FREP_PLANE: s.stream += sprintf(s.stream, "p%d.x - %f;\n", my_index, node->plane.offset); break;
107 | default: assert(false && "Unexpected opcode");
108 | }
109 | }
110 | else
111 | {
112 | assert(false && "Unexpected node type");
113 | }
114 | return my_index;
115 | }
116 |
117 | }
118 |
// Compiles the tree rooted at 'node' into a null-terminated GLSL fragment.
// Returns a pointer to a static 10 MiB buffer: the caller must NOT free it,
// the contents are overwritten by the next call, and the function is not
// thread-safe. NOTE(review): the emitted length is not bounds-checked
// against the buffer size — confirm upstream that trees stay small enough.
char *frep_compile_to_glsl(frep_t *node)
{
    using namespace backend_glsl;
    static char *buffer = (char*)malloc(10*1024*1024);
    assert(buffer && "Failed to allocate buffer to contain GLSL output");
    glsl_t s;
    s.stream = buffer;
    s.destination = 1; // index 0 is reserved for the caller-provided input point p0
    _frep_compile_to_glsl(node, s);
    return buffer;
}
130 |
--------------------------------------------------------------------------------
/src/backend_ptx.h:
--------------------------------------------------------------------------------
1 | // Developed by Simen Haugo.
2 | // See LICENSE.txt for copyright and licensing details (standard MIT License).
3 |
4 | /*
5 | This is the code generation backend for NVIDIA PTX, which is not
6 | a machine code target, but a fake assembly language (stored as text)
7 | which gets compiled into native target-architecture instructions by
8 | the CUDA driver. Note that this compilation can take a long time.
9 | If you need to be able to rapidly compile and upload trees to the
10 | GPU, look at the SASS backend, where we implement our own native
11 | machine code generation.
12 | */
13 |
14 | #pragma once
15 |
#include "frep.h"
#include <stdint.h>
#include <stdio.h>
#include <string.h>
19 |
20 | /*
Generates a string containing newline-separated PTX instructions
22 | which evaluate f(x0, y0, z0) and stores the result in a register
23 | named "f%d" % result_register (e.g. "f3"). The input coordinates
24 | are assumed to be in registers named "x0", "y0", and "z0".
25 |
26 | See test/backend_ptx.cpp for an example of a complete PTX program
27 | that uses the generated output.
28 | */
29 | char *frep_compile_to_ptx(frep_t *f, int *result_register);
30 |
31 | //////////////////////////////////////////////////////////////////
32 | // Implementation
33 | //////////////////////////////////////////////////////////////////
34 |
35 | namespace backend_ptx {
36 |
37 | /*
38 | Nodes in the FRep AST have constants (such as sphere radius) that
39 | are involved in the expression for that node's membership function.
40 | When generating code to execute the membership function, constants
41 | can either be placed in Constants Memory (and must be fetched with
42 | an additional load), or be baked directly into the instructions.
43 |
44 | For example, the PTX instruction
45 | add.ftz.f32 x, x, 0f3F000000; // x <- x + 0.5
46 | uses +0.5 as an immediate value. In the generated machine code for
47 | e.g. Maxwell architectures, this instruction may look like this:
48 | 0x3858503f00070409
49 | ^^^^^
50 | immediate value (note that last 12 bits are truncated).
51 |
52 | However, not all instructions can use full 32-bit floating point
53 | immediate values. Notably, min, max and fused-multiply-add (FFMA)
54 | on Maxwell/Pascal target architectures. But all do support 20-bit
55 | floating point immediates, where the last 12 bits of the mantissa
56 | are truncated (assumed to be zero).
57 |
58 | You can choose whether you want to preserve 32-bit floating point
59 | constants at the expense of speed, or if you want to truncate the
60 | last 12 bits and use 20-bit floating point constants.
61 | */
// Encodes a 32-bit float as the bit pattern used for PTX "0f%08x" immediates.
// With PTX_FP20_IMMEDIATE the low 12 mantissa bits are truncated (assumed
// zero) so the constant fits the 20-bit immediate form supported by
// min/max/FFMA on Maxwell/Pascal (see the discussion above).
uint32_t encode_f32(float x)
{
    // FIX: type-pun via memcpy instead of *(uint32_t*)&x — the pointer cast
    // violates strict aliasing (undefined behavior); memcpy compiles to the
    // same single register move.
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);
#if defined(PTX_FP32_IMMEDIATE)
    return bits;
#elif defined(PTX_FP20_IMMEDIATE)
    // Note: PTX immediate values preserve their sign bit, unlike
    // SASS immediate values, which encode the sign bit elsewhere
    // in the instruction.
    return bits & 0xFFFFF000;
#else
    #error "You must #define either PTX_FP32_IMMEDIATE or PTX_FP20_IMMEDIATE before including this file."
#endif
}
75 |
// Code-generation state threaded through the recursive PTX emitter.
struct ptx_t
{
    int next_register;  // next unused virtual register index (names f0, f1, ...)
    char *stream;       // write cursor into the output buffer (advanced past each sprintf)
};
81 |
// Emits PTX that transforms the root-frame point (x0,y0,z0) into this node's
// local frame. Allocates three consecutive registers; after the call they are
// s.next_register-3 (x), -2 (y), -1 (z).
void emit_transform(ptx_t &s, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/)
{
    // emit transform code: p_this = R_root_to_this*(p_root - T_this_rel_root)
    int x = s.next_register++;
    int y = s.next_register++;
    int z = s.next_register++;

    // compute R_root_to_this*(-T_this_rel_root)
    float tx = -(R.at(0,0)*T[0] + R.at(0,1)*T[1] + R.at(0,2)*T[2]);
    float ty = -(R.at(1,0)*T[0] + R.at(1,1)*T[1] + R.at(1,2)*T[2]);
    float tz = -(R.at(2,0)*T[0] + R.at(2,1)*T[1] + R.at(2,2)*T[2]);

    // emit instructions for R_root_to_this*p_root + R_root_to_this*(-T_this_rel_root)
    // Each component accumulates one column of R at a time via FFMA:
    // first the x0 column (seeded with the translation), then y0, then z0.
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, x0, 0f%08x, 0f%08x;\n", x, encode_f32(R.at(0,0)), encode_f32(tx));
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, x0, 0f%08x, 0f%08x;\n", y, encode_f32(R.at(1,0)), encode_f32(ty));
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, x0, 0f%08x, 0f%08x;\n", z, encode_f32(R.at(2,0)), encode_f32(tz));
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, y0, 0f%08x, f%d;\n", x, encode_f32(R.at(0,1)), x);
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, y0, 0f%08x, f%d;\n", y, encode_f32(R.at(1,1)), y);
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, y0, 0f%08x, f%d;\n", z, encode_f32(R.at(2,1)), z);
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, z0, 0f%08x, f%d;\n", x, encode_f32(R.at(0,2)), x);
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, z0, 0f%08x, f%d;\n", y, encode_f32(R.at(1,2)), y);
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d, z0, 0f%08x, f%d;\n", z, encode_f32(R.at(2,2)), z);
}
105 |
// Placeholder: the exact (corner-rounded) box distance is not implemented in
// the PTX backend yet; use FREP_BOX_CHEAP or the GLSL backend instead.
int emit_box(ptx_t &s, frep_t *node, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/)
{
    assert(false && "Box is not implemented in PTX backend yet");
    return 0;
}
111 |
// Emits PTX for the "cheap" box distance (max of per-axis distances, no
// corner rounding). Returns the register index holding the result.
int emit_box_cheap(ptx_t &s, frep_t *node, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/)
{
    // mathematical expression: Box(p, width,height,depth)
    //   (x,y,z) = R*(p - T)
    //   d = max( |x|-width, |y|-height, |z|-depth )

    // ptx template:
    //
    //   abs.ftz.f32 x, x;
    //   abs.ftz.f32 y, y;
    //   abs.ftz.f32 z, z;
    //   sub.ftz.f32 x, x, (width);
    //   sub.ftz.f32 y, y, (height);
    //   sub.ftz.f32 z, z, (depth);
    //   max.ftz.f32 d, x, y;
    //   max.ftz.f32 d, d, z;

    // emitted instructions:
    emit_transform(s, R, T); // todo: inline here and optimize for each primitive
    int x = s.next_register - 3; // the three registers just allocated by emit_transform
    int y = s.next_register - 2;
    int z = s.next_register - 1;
    int d = s.next_register++;
    s.stream += sprintf(s.stream, "abs.ftz.f32 f%d,f%d;\n", x, x);
    s.stream += sprintf(s.stream, "abs.ftz.f32 f%d,f%d;\n", y, y);
    s.stream += sprintf(s.stream, "abs.ftz.f32 f%d,f%d;\n", z, z);
    // the template's "sub" is emitted as "add" with a negated immediate
    s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", x, x, encode_f32(-node->box.width));
    s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", y, y, encode_f32(-node->box.height));
    s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", z, z, encode_f32(-node->box.depth));
    s.stream += sprintf(s.stream, "max.ftz.f32 f%d,f%d,f%d;\n", d, x, y);
    s.stream += sprintf(s.stream, "max.ftz.f32 f%d,f%d,f%d;\n", d, d, z);
    return d;
}
145 |
// Emits PTX for a sphere. The rotation part of the transform is skipped
// entirely: length(R*(p - T)) == length(p - T) for any rotation R, so only
// the translation is applied. Returns the register index holding the result.
int emit_sphere(ptx_t &s, frep_t *node, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/)
{
    // mathematical expression:
    //   d = length(p_this) - r
    //     = length(R*(p_root - T)) - r
    //     = length(p_root - T) - r

    // ptx template:
    //   add.ftz.f32 x, x0, (-tx);
    //   add.ftz.f32 y, y0, (-ty);
    //   add.ftz.f32 z, z0, (-tz);
    //   mul.ftz.f32 d, x, x;
    //   fma.rn.ftz.f32 d, y, y, d;
    //   fma.rn.ftz.f32 d, z, z, d;
    //   sqrt.approx.ftz.f32 d, d;
    //   sub.f32 d, d, (r);

    // emitted instructions:
    int x = s.next_register++;
    int y = s.next_register++;
    int z = s.next_register++;
    int d = s.next_register++;
    s.stream += sprintf(s.stream, "add.ftz.f32 f%d,x0,0f%08x;\n", x, encode_f32(-T[0])); // x <- x0 - (Tx)
    s.stream += sprintf(s.stream, "add.ftz.f32 f%d,y0,0f%08x;\n", y, encode_f32(-T[1])); // y <- y0 - (Ty)
    s.stream += sprintf(s.stream, "add.ftz.f32 f%d,z0,0f%08x;\n", z, encode_f32(-T[2])); // z <- z0 - (Tz)
    s.stream += sprintf(s.stream, "mul.ftz.f32 f%d,f%d,f%d;\n", d, x, x); // d <- x*x
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,f%d,f%d,f%d;\n", d, y, y, d); // d <- y*y + d
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,f%d,f%d,f%d;\n", d, z, z, d); // d <- z*z + d
    s.stream += sprintf(s.stream, "sqrt.approx.ftz.f32 f%d,f%d;\n", d, d); // d <- sqrt(d)
    s.stream += sprintf(s.stream, "add.f32 f%d,f%d,0f%08x;\n", d, d, encode_f32(-node->sphere.radius)); // d <- d - (r)
    return d;
}
178 |
// Emits PTX for a capped cylinder whose axis is the local y axis.
// Returns the register index holding the result.
int emit_cylinder(ptx_t &s, frep_t *node, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/)
{
    // mathematical expression: cylinder(p, 2*height, radius)
    //   (x,y,z) = R*(p - T)
    //   d = max( sqrt(x*x + z*z) - radius, abs(y) - height )

    // ptx template
    //
    //   mul.ftz.f32 d, x, x;
    //   fma.rn.ftz.f32 d, z, z, d;
    //   sqrt.approx.ftz.f32 d, d;
    //   abs.ftz.f32 y, y;
    //   add.ftz.f32 y, y, (-height);
    //   add.ftz.f32 d, d, (-radius);
    //   max.ftz.f32 d, d, y;

    // emitted instructions:
    emit_transform(s, R, T); // todo: inline here and optimize for each primitive
    int x = s.next_register - 3; // the three registers just allocated by emit_transform
    int y = s.next_register - 2;
    int z = s.next_register - 1;
    int d = s.next_register++;
    s.stream += sprintf(s.stream, "mul.ftz.f32 f%d,f%d,f%d;\n", d, x, x);
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,f%d,f%d,f%d;\n", d, z, z, d);
    s.stream += sprintf(s.stream, "sqrt.approx.ftz.f32 f%d,f%d;\n", d, d);
    s.stream += sprintf(s.stream, "abs.ftz.f32 f%d,f%d;\n", y, y);
    s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", y, y, encode_f32(-node->cylinder.height));
    s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", d, d, encode_f32(-node->cylinder.radius));
    s.stream += sprintf(s.stream, "max.ftz.f32 f%d,f%d,f%d;\n", d, d, y);
    return d;
}
210 |
211 | int emit_plane(ptx_t &s, frep_t *node, frep_mat3_t R/*root_to_this*/, frep_vec3_t T/*this_rel_root*/)
212 | {
213 | // mathematical expression:
214 | // (x,y,z) = R*(p - T)
215 | // d = x - plane.x
216 | // = R00*(x0 - Tx) + R01*(y0 - Ty) + R02*(z0 - Tz) - plane.x
217 | // = R00*x0 + R01*y0 + R02*z0 + (-plane.x - R00*Tx - R01*Ty - R02*Tz)
218 | // = R00*x0 + R01*y0 + R02*z0 + k
219 |
220 | // ptx template:
221 | // mul.ftz.f32 d, x0, (R00);
222 | // fma.rn.ftz.f32 d, y0, (R01), d;
223 | // fma.rn.ftz.f32 d, z0, (R02), d;
224 | // add.ftz.f32 d, d, (k)
225 |
226 | // emitted instructions:
227 | float k = -(R.at(0,0)*T[0] + R.at(0,1)*T[1] + R.at(0,2)*T[2] + node->plane.offset);
228 | int d = s.next_register++;
229 | s.stream += sprintf(s.stream, "mul.ftz.f32 f%d,x0,0f%08x;\n", d, encode_f32(R.at(0,0)));
230 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,y0,0f%08x,f%d;\n", d, encode_f32(R.at(0,1)), d);
231 | s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,z0,0f%08x,f%d;\n", d, encode_f32(R.at(0,2)), d);
232 | s.stream += sprintf(s.stream, "add.ftz.f32 f%d,f%d,0f%08x;\n", d, d, encode_f32(k));
233 | return d;
234 | }
235 |
236 | int emit_union(ptx_t &s, int left, int right)
237 | {
238 | int d = s.next_register++;
239 | s.stream += sprintf(s.stream, "min.ftz.f32 f%d,f%d,f%d;\n", d, left, right);
240 | return d;
241 | }
242 |
243 | int emit_intersect(ptx_t &s, int left, int right)
244 | {
245 | int d = s.next_register++;
246 | s.stream += sprintf(s.stream, "max.ftz.f32 f%d,f%d,f%d;\n", d, left, right);
247 | return d;
248 | }
249 |
// Subtraction A\B: d = max(d_left, -d_right). The right operand's register
// is negated IN PLACE; this is safe here because each subtree's result
// register is consumed by exactly one boolean node in this emitter.
int emit_subtract(ptx_t &s, int left, int right)
{
    int d = s.next_register++;
    s.stream += sprintf(s.stream, "neg.ftz.f32 f%d,f%d;\n", right, right);
    s.stream += sprintf(s.stream, "max.ftz.f32 f%d,f%d,f%d;\n", d, left, right);
    return d;
}
257 |
// Linear blend of two distance fields: d = alpha*d_left + (1-alpha)*d_right.
int emit_blend(ptx_t &s, int left, int right, float blend_alpha)
{
    int d = s.next_register++;
    s.stream += sprintf(s.stream, "mul.ftz.f32 f%d,f%d,0f%08x;\n", d, left, encode_f32(blend_alpha));
    s.stream += sprintf(s.stream, "fma.rn.ftz.f32 f%d,f%d,0f%08x,f%d;\n", d, right, encode_f32(1.0f-blend_alpha), d);
    return d;
}
265 |
// Recursively emits PTX for the subtree rooted at 'node' and returns the
// index of the register holding its distance value. R_root_to_parent /
// T_parent_rel_root accumulate the rigid-body transform down to this node's
// parent. Note: the switches intentionally have no default — any unhandled
// opcode falls through to the assert at the bottom.
int _frep_compile_to_ptx(
    frep_t *node,
    ptx_t &state,
    frep_mat3_t R_root_to_parent=frep_identity_3x3,
    frep_vec3_t T_parent_rel_root=frep_null_3x1)
{
    assert(node);

    frep_mat3_t R_root_to_this;
    frep_vec3_t T_this_rel_root;
    frep_get_global_transform(node, &R_root_to_this, &T_this_rel_root, R_root_to_parent, T_parent_rel_root);

    int result = -1;
    if (frep_is_boolean(node))
    {
        assert(node->left);
        assert(node->right);
        // Children first: their result registers must exist before combining.
        int left = _frep_compile_to_ptx(node->left, state, R_root_to_this, T_this_rel_root);
        int right = _frep_compile_to_ptx(node->right, state, R_root_to_this, T_this_rel_root);
        switch (node->opcode)
        {
            case FREP_UNION: return emit_union(state, left, right);
            case FREP_INTERSECT: return emit_intersect(state, left, right);
            case FREP_SUBTRACT: return emit_subtract(state, left, right);
            case FREP_BLEND: return emit_blend(state, left, right, node->blend.alpha);
        }
    }
    else if (frep_is_primitive(node))
    {
        switch (node->opcode)
        {
            case FREP_BOX: return emit_box(state, node, R_root_to_this, T_this_rel_root);
            case FREP_BOX_CHEAP: return emit_box_cheap(state, node, R_root_to_this, T_this_rel_root);
            case FREP_SPHERE: return emit_sphere(state, node, R_root_to_this, T_this_rel_root);
            case FREP_CYLINDER: return emit_cylinder(state, node, R_root_to_this, T_this_rel_root);
            case FREP_PLANE: return emit_plane(state, node, R_root_to_this, T_this_rel_root);
        }
    }

    assert(false && "Unexpected node opcode");
    return -1;
}
308 |
309 | }
310 |
// Compiles the tree to newline-separated PTX text; the register index
// holding the final value is stored in *result_register.
// Returns a pointer to a static 10 MiB buffer: the caller must NOT free it,
// the contents are overwritten by the next call, and the function is not
// thread-safe. NOTE(review): the emitted length is not bounds-checked
// against the buffer size — confirm upstream that trees stay small enough.
char *frep_compile_to_ptx(frep_t *node, int *result_register)
{
    using namespace backend_ptx;
    static char *buffer = (char*)malloc(10*1024*1024);
    assert(buffer && "Failed to allocate buffer to contain PTX output");
    ptx_t s;
    s.stream = buffer;
    s.next_register = 0;
    *result_register = _frep_compile_to_ptx(node, s);
    return buffer;
}
322 |
--------------------------------------------------------------------------------
/src/backend_sass.h:
--------------------------------------------------------------------------------
1 | #if defined(COMPUTE_CAPABILITY_3_X)
2 | // Kepler
3 | #error "Target devices of compute capability 3.x are not supported by the SASS backend."
4 |
5 | #elif defined(COMPUTE_CAPABILITY_5_X) || defined(COMPUTE_CAPABILITY_6_X)
6 | // Maxwell, Pascal (e.g. GTX 1080, Titan X)
7 | #include "sass_6_x/backend.h"
8 |
9 | #elif defined(COMPUTE_CAPABILITY_7_X)
10 | // Volta, Turing (e.g. RTX Titan, 2080)
11 | #error "Target devices of compute capability 7.x are not supported by the SASS backend."
12 |
13 | #else
14 | #error "Missing #define. Specify the compute capability target for the SASS backend."
15 | #endif
16 |
--------------------------------------------------------------------------------
/src/frep.h:
--------------------------------------------------------------------------------
1 | #pragma once
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
7 |
8 | typedef int frep_opcode_t;
9 | enum frep_opcode_ {
10 | FREP_INVALID = 0,
11 |
12 | FREP_BOX,
13 | FREP_BOX_CHEAP,
14 | FREP_SPHERE,
15 | FREP_CYLINDER,
16 | FREP_PLANE,
17 | FREP_UNION,
18 | FREP_INTERSECT,
19 | FREP_SUBTRACT,
20 | FREP_BLEND,
21 | };
22 |
23 | struct frep_box_t { float width, height, depth; };
24 | struct frep_sphere_t { float radius; };
25 | struct frep_cylinder_t { float radius, height; };
26 | struct frep_plane_t { float sign, offset; };
27 | struct frep_blend_t { float alpha; };
28 |
29 | /*
30 | Each frep node has a rigid-body transform associated with it.
31 | It can be the identity. If so, it gets optimized out in the
32 | backend. The transformation parameters relate the point argument
33 | of the child node to its parent node by:
34 |
35 | p^parent = Rx(rx)*Ry(ry)*Rz(rz)*p^child + (tx,ty,tz)
36 |
37 | */
struct frep_t {
    frep_opcode_t opcode;    // FREP_* tag; selects which union member below is valid
    frep_t *left;            // child operands; used by boolean/blend operators
    frep_t *right;
    float rx,ry,rz,tx,ty,tz; // rigid-body transform relative to the parent (see comment above)
    union {                  // per-node parameters, selected by opcode
        frep_box_t box;
        frep_sphere_t sphere;
        frep_cylinder_t cylinder;
        frep_plane_t plane;
        frep_blend_t blend;
    };
};
51 |
/*
    Node creation and deletion utilities
*/
// Allocates an UNINITIALIZED node; the caller must fill every field.
// Prefer frep_calloc for zero-initialized nodes. NOTE(review): allocation
// failure is not checked — callers dereference the result immediately.
frep_t *frep_malloc() {
    frep_t *f = (frep_t*)malloc(sizeof(frep_t));
    return f;
}
// Allocates a zero-initialized node (opcode FREP_INVALID, NULL children,
// all transform parameters zero).
frep_t *frep_calloc() {
    frep_t *f = (frep_t*)calloc(1, sizeof(frep_t));
    return f;
}
// Recursively frees an entire tree; safe to call with NULL.
void frep_free(frep_t *f) {
    if (!f) return;
    frep_free(f->left);
    frep_free(f->right);
    free(f);
}
// Deep-copies a tree (NULL in, NULL out). The caller owns the copy and
// releases it with frep_free.
frep_t *frep_copy(frep_t *f) {
    if (!f) return NULL;
    frep_t *f1 = frep_malloc();
    *f1 = *f;
    f1->left = frep_copy(f->left);
    f1->right = frep_copy(f->right);
    return f1;
}
77 |
78 | /*
79 | Other utilities
80 | */
81 | bool frep_is_primitive(frep_t *f) {
82 | return f->opcode == FREP_BOX ||
83 | f->opcode == FREP_BOX_CHEAP ||
84 | f->opcode == FREP_SPHERE ||
85 | f->opcode == FREP_CYLINDER ||
86 | f->opcode == FREP_PLANE;
87 | }
88 | bool frep_is_boolean(frep_t *f) {
89 | return f->opcode == FREP_UNION ||
90 | f->opcode == FREP_INTERSECT ||
91 | f->opcode == FREP_SUBTRACT;
92 | }
93 | int frep_get_num_nodes(frep_t *f) {
94 | if (!f) return 0;
95 | return 1 + frep_get_num_nodes(f->left) + frep_get_num_nodes(f->right);
96 | }
97 |
98 | int frep_get_depth(frep_t *f) {
99 | if (!f) return 0;
100 | int l = frep_get_depth(f->left);
101 | int r = frep_get_depth(f->right);
102 | int max_lr = (l > r ? l : r);
103 | return 1 + max_lr;
104 | }
105 | frep_t *frep_find_node(frep_t *a, int find_i, frep_t **out_parent, int *out_depth, frep_t *parent=NULL, int depth=0)
106 | {
107 | assert(a);
108 | assert(find_i >= 0);
109 |
110 | static int i = 0;
111 | if (!parent) i = 0;
112 | else i++;
113 |
114 | if (i == find_i)
115 | {
116 | *out_depth = depth;
117 | *out_parent = parent;
118 | return a;
119 | }
120 | else if (frep_is_boolean(a))
121 | {
122 | frep_t *left = frep_find_node(a->left, find_i, out_parent, out_depth, a, depth+1);
123 | if (left) return left;
124 | frep_t *right = frep_find_node(a->right, find_i, out_parent, out_depth, a, depth+1);
125 | if (right) return right;
126 | }
127 | return NULL;
128 | }
129 |
130 | /*
131 | Utility routines for computing rigid-body transform from root node to a specific child.
132 | */
133 | struct frep_mat3_t { float d[3*3]; float &at(int row, int col) { return d[col + row*3]; } };
134 | struct frep_vec3_t { float d[3]; float &operator[](int i) { return d[i]; } };
135 | static frep_mat3_t frep_identity_3x3 = { 1,0,0, 0,1,0, 0,0,1 };
136 | static frep_vec3_t frep_null_3x1 = { 0,0,0 };
137 |
138 | // d = a*b
139 | frep_mat3_t frep_mat_mul(frep_mat3_t a, frep_mat3_t b) {
140 | frep_mat3_t d = {0};
141 | for (int row = 0; row < 3; row++)
142 | for (int col = 0; col < 3; col++)
143 | {
144 | d.at(row,col) = 0.0f;
145 | for (int i = 0; i < 3; i++)
146 | d.at(row,col) += a.at(row,i)*b.at(i,col);
147 | }
148 | return d;
149 | }
150 |
151 | // d = transpose(a) * b
152 | frep_vec3_t frep_mat_mul_transpose(frep_mat3_t a, frep_vec3_t b) {
153 | frep_vec3_t d = {0};
154 | for (int row = 0; row < 3; row++)
155 | {
156 | d[row] = 0.0f;
157 | for (int i = 0; i < 3; i++)
158 | d[row] += a.at(i,row)*b[i];
159 | }
160 | return d;
161 | }
162 |
163 | frep_vec3_t frep_mat_add(frep_vec3_t a, frep_vec3_t b) {
164 | frep_vec3_t d = { a[0]+b[0], a[1]+b[1], a[2]+b[2] };
165 | return d;
166 | }
167 |
// Accumulates the rigid-body transform from the root frame to 'node', given
// the parent's accumulated transform. Outputs:
//   *R_root_to_this  — rotation taking root-frame vectors into this frame
//   *T_this_rel_root — this node's origin expressed in the root frame
// Convention (see frep_t): p^parent = Rx(rx)*Ry(ry)*Rz(rz)*p^child + (tx,ty,tz).
void frep_get_global_transform(frep_t *node,
    frep_mat3_t *R_root_to_this,
    frep_vec3_t *T_this_rel_root,
    frep_mat3_t R_root_to_parent,
    frep_vec3_t T_parent_rel_root) {
    // Angles are negated because we build the inverse (parent->this) rotation.
    float cx = cosf(-node->rx); float sx = sinf(-node->rx);
    float cy = cosf(-node->ry); float sy = sinf(-node->ry);
    float cz = cosf(-node->rz); float sz = sinf(-node->rz);

    // R_this_to_parent = Rx(rx)*Ry(ry)*Rz(rz)
    // -> R_parent_to_this = Rz(-rz)*Ry(-ry)*Rx(-rx)
    frep_mat3_t R_parent_to_this =
    {
        cy*cz, cz*sx*sy - cx*sz, sx*sz + cx*cz*sy,
        cy*sz, cx*cz + sx*sy*sz, cx*sy*sz - cz*sx,
        -sy, cy*sx, cx*cy
    };
    frep_vec3_t T_this_rel_parent = { node->tx, node->ty, node->tz };

    // Compose with the parent's accumulated transform:
    //   R_root_to_this = R_parent_to_this * R_root_to_parent
    *R_root_to_this = frep_mat_mul(R_parent_to_this,R_root_to_parent);
    // transpose(R_root_to_parent) = R_parent_to_root rotates the
    // parent-relative offset into the root frame before adding.
    *T_this_rel_root = frep_mat_add(T_parent_rel_root, frep_mat_mul_transpose(R_root_to_parent, T_this_rel_parent));
}
190 |
191 |
--------------------------------------------------------------------------------
/src/frep_builder.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "frep.h"
3 |
/*
    FRep primitives
    Each constructor allocates a zero-initialized node (identity transform,
    no children) and fills in the opcode plus shape parameters. The caller
    owns the returned tree and releases it with frep_free.
*/
// Exact box; width/height/depth are half-extents along x/y/z
// (frep_eval computes fabsf(coord) - extent).
frep_t *fBox(float width, float height, float depth) {
    frep_t *f = frep_calloc();
    f->opcode = FREP_BOX;
    f->box.width = width;
    f->box.height = height;
    f->box.depth = depth;
    return f;
}
// Cheap box: max of per-axis distances, no corner rounding.
frep_t *fBoxCheap(float width, float height, float depth) {
    frep_t *f = frep_calloc();
    f->opcode = FREP_BOX_CHEAP;
    f->box.width = width;
    f->box.height = height;
    f->box.depth = depth;
    return f;
}
// Sphere of the given radius centered at the local origin.
frep_t *fSphere(float radius) {
    frep_t *f = frep_calloc();
    f->opcode = FREP_SPHERE;
    f->sphere.radius = radius;
    return f;
}
// Cylinder with axis along local y; 'height' is the half-height
// (frep_eval computes fabsf(y) - height).
frep_t *fCylinder(float radius, float height) {
    frep_t *f = frep_calloc();
    f->opcode = FREP_CYLINDER;
    f->cylinder.radius = radius;
    f->cylinder.height = height;
    return f;
}
// Half-space: d = sign*x - offset (see frep_eval).
frep_t *fPlane(float sign, float offset) {
    frep_t *f = frep_calloc();
    f->opcode = FREP_PLANE;
    f->plane.sign = sign;
    f->plane.offset = offset;
    return f;
}
43 |
/*
    Function operators
    Boolean combinations of two subtrees. The returned node takes ownership
    of both children (frep_free releases them recursively).
*/
// Union: d = min(d_left, d_right).
frep_t *fOpUnion(frep_t *left, frep_t *right) {
    frep_t *f = frep_calloc();
    f->opcode = FREP_UNION;
    f->left = left;
    f->right = right;
    return f;
}
// Subtraction left\right: d = max(d_left, -d_right).
frep_t *fOpSubtract(frep_t *left, frep_t *right) {
    frep_t *f = frep_calloc();
    f->opcode = FREP_SUBTRACT;
    f->left = left;
    f->right = right;
    return f;
}
// Intersection: d = max(d_left, d_right).
frep_t *fOpIntersect(frep_t *left, frep_t *right) {
    frep_t *f = frep_calloc();
    f->opcode = FREP_INTERSECT;
    f->left = left;
    f->right = right;
    return f;
}
68 |
/*
    Spatial operators
    These mutate the node in place and return it so calls can be chained.
    The parameters define the child->parent transform (see frep_t):
        p^parent = Rx(rx)*Ry(ry)*Rz(rz)*p^child + (tx,ty,tz)
*/
// Sets the node's rotation (radians; cosf/sinf are applied in frep.h).
frep_t *pOpRotate(frep_t *f, float rx, float ry, float rz) {
    f->rx = rx;
    f->ry = ry;
    f->rz = rz;
    return f;
}
// Sets the node's translation relative to its parent frame.
frep_t *pOpTranslate(frep_t *f, float tx, float ty, float tz) {
    f->tx = tx;
    f->ty = ty;
    f->tz = tz;
    return f;
}
84 |
--------------------------------------------------------------------------------
/src/frep_eval.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "frep.h"
#include <assert.h>
#include <math.h>
5 |
// Evaluates the signed distance (or a distance bound, for the "cheap"
// primitives) of the frep tree rooted at f at the point (x, y, z).
// Recurses into left/right children for the boolean operators.
float frep_eval(frep_t *f, float x, float y, float z)
{
    assert(f);

    // Map the query point into the node's local frame: undo the node's
    // translation, then undo its rotation by rotating through the
    // negated angles, applied about x, then y, then z.
    x -= f->tx;
    y -= f->ty;
    z -= f->tz;

    if (f->rx != 0.0f)
    {
        // Inverse rotation about x (y and z change, x is fixed)
        float cx = cosf(-f->rx);
        float sx = sinf(-f->rx);
        float zz = cx*z + sx*y;
        y = cx*y - sx*z;
        z = zz;
    }
    if (f->ry != 0.0f)
    {
        // Inverse rotation about y (x and z change, y is fixed)
        float cy = cosf(-f->ry);
        float sy = sinf(-f->ry);
        float xx = cy*x + sy*z;
        z = cy*z - sy*x;
        x = xx;
    }
    if (f->rz != 0.0f)
    {
        // Inverse rotation about z (x and y change, z is fixed)
        float cz = cosf(-f->rz);
        float sz = sinf(-f->rz);
        float xx = cz*x - sz*y;
        y = cz*y + sz*x;
        x = xx;
    }

    switch (f->opcode)
    {
        case FREP_BOX:
        {
            // Exact box distance: outside contribution is the length of
            // the componentwise positive excess; inside contribution b is
            // the largest (closest to zero) of the negative components.
            float dx = fabsf(x) - f->box.width;
            float dy = fabsf(y) - f->box.height;
            float dz = fabsf(z) - f->box.depth;
            float dbx = (dx < 0.0f) ? dx : 0.0f; float b = dbx;
            float dby = (dy < 0.0f) ? dy : 0.0f; if (dby > b) b = dby;
            float dbz = (dz < 0.0f) ? dz : 0.0f; if (dbz > b) b = dbz;
            if (dx < 0.0f) dx = 0.0f;
            if (dy < 0.0f) dy = 0.0f;
            if (dz < 0.0f) dz = 0.0f;
            return sqrtf(dx*dx + dy*dy + dz*dz) + b;
        }
        case FREP_BOX_CHEAP:
        {
            // Cheap box bound: max of the per-axis distances. Not the
            // exact Euclidean distance outside edges/corners, but cheaper
            // to evaluate.
            float dx = fabsf(x) - f->box.width;
            float dy = fabsf(y) - f->box.height;
            float dz = fabsf(z) - f->box.depth;
            float d = dx;
            if (dy > d) d = dy;
            if (dz > d) d = dz;
            return d;
        }
        case FREP_SPHERE:
        {
            return sqrtf(x*x + y*y + z*z) - f->sphere.radius;
        }
        case FREP_CYLINDER:
        {
            // Cylinder with axis along local y: radial distance vs.
            // distance to the end caps, combined with max.
            float a = sqrtf(x*x + z*z) - f->cylinder.radius;
            float b = fabsf(y) - f->cylinder.height;
            return a > b ? a : b;
        }
        case FREP_PLANE:
        {
            // Half-space: signed distance along local x.
            return f->plane.sign*x - f->plane.offset;
        }
        case FREP_UNION:
        {
            // Union = min of child distances.
            float f1 = frep_eval(f->left, x, y, z);
            float f2 = frep_eval(f->right, x, y, z);
            return f1 < f2 ? f1 : f2;
        }
        case FREP_INTERSECT:
        {
            // Intersection = max of child distances.
            float f1 = frep_eval(f->left, x, y, z);
            float f2 = frep_eval(f->right, x, y, z);
            return f1 > f2 ? f1 : f2;
        }
        case FREP_SUBTRACT:
        {
            // Subtraction = max(left, -right).
            float f1 = frep_eval(f->left, x, y, z);
            float f2 = -frep_eval(f->right, x, y, z);
            return f1 > f2 ? f1 : f2;
        }
        #if 0
        case FREP_BLEND:
        {
            // Linear blend of the child distances (disabled).
            float f1 = frep_eval(f->left, x, y, z);
            float f2 = frep_eval(f->right, x, y, z);
            return f->blend.alpha*f1 + (1.0f - f->blend.alpha)*f2;
        }
        #endif
        default:
        {
            assert(false && "invalid node type");
        }
    }
    // Unreachable when the opcode is valid; keeps non-void return happy.
    return 0.0f;
}
111 |
--------------------------------------------------------------------------------
/src/frep_serialize.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "ast.h"
#include <stdio.h>
#include <assert.h>
4 |
5 | #ifdef _MSC_VER
6 | // Note: MSVC version returns -1 on overflow, but glibc returns total count (which may be >= buf_size)
7 | #define snprintf _snprintf
8 | #endif
9 |
10 | static char *ast__to_string(ast_t *a, char *stream, size_t sizeof_buffer)
11 | {
12 | if (!a) return stream;
13 | if (a->type == AST_BOX) stream += snprintf(stream, sizeof_buffer, "b[%g,%g,%g]", a->box.w, a->box.h, a->box.d);
14 | else if (a->type == AST_SPHERE) stream += snprintf(stream, sizeof_buffer, "s[%g]", a->sphere.r);
15 | else if (a->type == AST_CYLINDER) stream += snprintf(stream, sizeof_buffer, "c[%g,%g]", a->cylinder.r, a->cylinder.h);
16 | else if (a->type == AST_PLANE) stream += snprintf(stream, sizeof_buffer, "p[%g]", a->plane.x);
17 | else if (a->type == AST_UNION) stream += snprintf(stream, sizeof_buffer, "U");
18 | else if (a->type == AST_INTERSECT) stream += snprintf(stream, sizeof_buffer, "I");
19 | else if (a->type == AST_SUBTRACT) stream += snprintf(stream, sizeof_buffer, "S");
20 | else if (a->type == AST_BLEND) stream += snprintf(stream, sizeof_buffer, "B[%g]", a->blend.alpha);
21 | stream += snprintf(stream, sizeof_buffer, "[%g,%g,%g]", a->rx, a->ry, a->rz);
22 | stream += snprintf(stream, sizeof_buffer, "[%g,%g,%g]", a->tx, a->ty, a->tz);
23 | stream = ast__to_string(a->left, stream, sizeof_buffer);
24 | stream = ast__to_string(a->right, stream, sizeof_buffer);
25 | return stream;
26 | }
27 |
// Parses one node (and, recursively, its children) from the serialized
// form produced by ast__to_string, advancing *inout_stream past the
// consumed text. Malformed input is handled by assert, not by error
// returns — inputs are expected to come from ast_to_string.
static ast_t *ast__from_string(char **inout_stream)
{
    char *stream = *inout_stream;
    if (!stream) return NULL;
    if (*stream == '\0') return NULL; // end of stream: no more children

    ast_t *a = ast_new();

    // Advance the cursor just past the next '[' so sscanf can read the
    // comma-separated floats that follow it.
    #define next_bracket() { while (*stream && *stream != '[') stream++; assert(*stream); stream++; assert(*stream); }
    if (*stream == 'b') { a->type = AST_BOX; next_bracket(); assert(3 == sscanf(stream, "%f,%f,%f", &a->box.w, &a->box.h, &a->box.d)); next_bracket(); }
    else if (*stream == 's') { a->type = AST_SPHERE; next_bracket(); assert(1 == sscanf(stream, "%f", &a->sphere.r )); next_bracket(); }
    else if (*stream == 'c') { a->type = AST_CYLINDER; next_bracket(); assert(2 == sscanf(stream, "%f,%f", &a->cylinder.r, &a->cylinder.h )); next_bracket(); }
    else if (*stream == 'p') { a->type = AST_PLANE; next_bracket(); assert(1 == sscanf(stream, "%f", &a->plane.x )); next_bracket(); }
    else if (*stream == 'U') { a->type = AST_UNION; next_bracket(); }
    else if (*stream == 'I') { a->type = AST_INTERSECT; next_bracket(); }
    else if (*stream == 'S') { a->type = AST_SUBTRACT; next_bracket(); }
    else if (*stream == 'B') { a->type = AST_BLEND; next_bracket(); assert(1 == sscanf(stream, "%f", &a->blend.alpha )); next_bracket(); }
    else assert(false && "invalid node type");
    // Rotation block [rx,ry,rz], then translation block [tx,ty,tz].
    assert(3 == sscanf(stream, "%f,%f,%f", &a->rx, &a->ry, &a->rz));
    next_bracket();
    assert(3 == sscanf(stream, "%f,%f,%f", &a->tx, &a->ty, &a->tz));
    // Skip past the closing ']' of the translation block.
    while (*stream && *stream != ']') stream++;
    assert(*stream);
    stream++;
    #undef next_bracket

    // Children are serialized immediately after the node (left first);
    // a node with no serialized children gets NULL for both.
    a->left = ast__from_string(&stream);
    a->right = ast__from_string(&stream);
    *inout_stream = stream;
    return a;
}
59 |
// Serializes the whole tree and returns a pointer to a static,
// process-wide buffer (1 MB). NOTE: not reentrant or thread-safe —
// each call overwrites the previous result; do not free the pointer.
char *ast_to_string(ast_t *a)
{
    static char buffer[1024*1024];
    ast__to_string(a, buffer, sizeof(buffer));
    return buffer;
}
66 |
// Parses a tree from the format produced by ast_to_string. Takes a
// local copy of the stream pointer so the caller's pointer is not
// advanced. Returns NULL for an empty/NULL stream.
ast_t *ast_from_string(char *stream)
{
    return ast__from_string(&stream);
}
71 |
72 | #ifdef _MSC_VER
73 | #undef snprintf
74 | #endif
75 |
--------------------------------------------------------------------------------
/src/sass_6_x/backend.h:
--------------------------------------------------------------------------------
1 | // Developed by Simen Haugo.
2 | // See LICENSE.txt for copyright and licensing details (standard MIT License).
3 | //
4 | // This file contains the machine code generation backend for NVIDIA SASS (Shader
5 | // Assembly) ISA. Unlike the PTX backend, this directly outputs to binary code that
6 | // can be patched into a Cubin binary module and loaded immediately with the Cuda
7 | // Driver API (see NVRTC example in SDK). This avoids the slow PTX compiler provided
8 | // in CUDA.
9 | //
10 | // This backend is for devices of compute capability 6.x, such as the Maxwell and
11 | // Pascal GPU families. It does not support Volta or Turing families (which have
12 | // compute capability 7.x).
13 | //
14 | // SASS code generation consists of the following major steps
15 | //
16 | // 1. Generate instruction blocks
17 | // the input frep tree is parsed to produce independent sequences of temporary
18 | // SASS instructions (not binary). These are assigned virtual register names,
19 | // which must be assigned to physical registers in the next step.
20 | //
21 | // 2. Schedule instructions and assign physical registers
22 | //
23 | // 3. Generate SASS binary
24 | // With the physical registers assigned, we can now generate the actual binary
25 | // instructions that go into the final ELF executable.
26 | //
27 | // 4. Link SASS ELF executable (a "Cubin" module)
28 | //
29 |
30 | #pragma once
31 | #include "../frep.h"
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
36 | #include "registers.h"
37 | #include "instruction.h"
38 | #include "scheduler.h"
39 | #include "blocks.h"
40 | #include "bytecode.h"
41 |
42 | #if 0
// Sketch of the per-instruction control word (scheduling metadata that
// accompanies each SASS instruction on Maxwell/Pascal). The locals below
// are field placeholders only: nothing is packed or returned yet, and
// this whole region is compiled out by the surrounding #if 0.
uint64_t get_ctrl_segment(instruction_t i)
{
    uint8_t ra,rb,rc,rd;
    uint8_t reuse; // register reuse flags
    uint8_t yield; // can relinquish control to other warp or not
    uint8_t stall; // number of cycles to wait before continuing
    uint8_t wrtdb; // write dependencies
    uint8_t readb; // read dependencies
    uint8_t watdb; // wait dependencies
}
53 |
// Draft of the full frep -> cubin pipeline (compiled out by #if 0).
// Intent: emit the pre-baked ELF header, append the encoded SASS
// instructions, then the section-table footer, and return the buffer.
void *frep_compile_to_sass(frep_t *tree, size_t *length)
{
    // Pre-baked cubin ELF prefix captured from a compiled kernel:
    // ELF header, program header, string/symbol tables and .nv.info data
    // for a kernel named "tree".
    static const uint8_t header[] = {
        0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x33, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x01, 0x00, 0xbe, 0x00, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x3c, 0x05, 0x3c, 0x00, 0x40, 0x00, 0x38, 0x00, 0x00, 0x00, 0x40, 0x00, 0x09, 0x00, 0x01, 0x00,
        0x00, 0x2e, 0x73, 0x68, 0x73, 0x74, 0x72, 0x74, 0x61, 0x62, 0x00, 0x2e, 0x73, 0x74, 0x72, 0x74,
        0x61, 0x62, 0x00, 0x2e, 0x73, 0x79, 0x6d, 0x74, 0x61, 0x62, 0x00, 0x2e, 0x73, 0x79, 0x6d, 0x74,
        0x61, 0x62, 0x5f, 0x73, 0x68, 0x6e, 0x64, 0x78, 0x00, 0x2e, 0x6e, 0x76, 0x2e, 0x69, 0x6e, 0x66,
        0x6f, 0x00, 0x2e, 0x74, 0x65, 0x78, 0x74, 0x2e, 0x74, 0x72, 0x65, 0x65, 0x00, 0x2e, 0x6e, 0x76,
        0x2e, 0x69, 0x6e, 0x66, 0x6f, 0x2e, 0x74, 0x72, 0x65, 0x65, 0x00, 0x2e, 0x6e, 0x76, 0x2e, 0x63,
        0x61, 0x6c, 0x6c, 0x67, 0x72, 0x61, 0x70, 0x68, 0x00, 0x2e, 0x6e, 0x76, 0x2e, 0x70, 0x72, 0x6f,
        0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x00, 0x00, 0x2e, 0x73, 0x68, 0x73, 0x74, 0x72, 0x74, 0x61,
        0x62, 0x00, 0x2e, 0x73, 0x74, 0x72, 0x74, 0x61, 0x62, 0x00, 0x2e, 0x73, 0x79, 0x6d, 0x74, 0x61,
        0x62, 0x00, 0x2e, 0x73, 0x79, 0x6d, 0x74, 0x61, 0x62, 0x5f, 0x73, 0x68, 0x6e, 0x64, 0x78, 0x00,
        0x2e, 0x6e, 0x76, 0x2e, 0x69, 0x6e, 0x66, 0x6f, 0x00, 0x74, 0x72, 0x65, 0x65, 0x00, 0x2e, 0x74,
        0x65, 0x78, 0x74, 0x2e, 0x74, 0x72, 0x65, 0x65, 0x00, 0x2e, 0x6e, 0x76, 0x2e, 0x69, 0x6e, 0x66,
        0x6f, 0x2e, 0x74, 0x72, 0x65, 0x65, 0x00, 0x23, 0x66, 0x66, 0x66, 0x66, 0x00, 0x2e, 0x6e, 0x76,
        0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x67, 0x72, 0x61, 0x70, 0x68, 0x00, 0x2e, 0x6e, 0x76, 0x2e, 0x70,
        0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x03, 0x00, 0x08, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x56, 0x00, 0x00, 0x00, 0x03, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, 0x03, 0x00, 0x07, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x32, 0x00, 0x00, 0x00, 0x12, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x2f, 0x08, 0x00, 0x04, 0x00, 0x00, 0x00,
        0x07, 0x00, 0x00, 0x00, 0x04, 0x23, 0x08, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x04, 0x11, 0x08, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x30, 0x00, 0x00,
        0x01, 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
        0xfe, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0xfd, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
        0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    };

    // Pre-baked cubin ELF suffix (section header table and padding) that
    // follows the code section.
    static const uint8_t footer[] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x67, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x0b, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x13, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
        0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x98, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x3d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xbc, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
        0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x4b, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc4, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xdc, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x32, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x07,
        0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    };

    using namespace backend_sass;
    instruction_blocks_t blocks = generate_sass_blocks(tree);

    int num_instructions;
    instruction_t *instructions = schedule_blocks(blocks, &num_instructions);

    // NOTE(review): 'sizeof_sass' is never defined in this draft —
    // presumably num_instructions*sizeof(uint64_t) plus control words.
    size_t sizeof_cubin = sizeof(header) + sizeof(footer) + sizeof_sass;
    uint8_t *cubin = (uint8_t*)malloc(sizeof_cubin);
    memcpy(cubin, header, sizeof(header));
    uint64_t *instruction_bin = cubin + sizeof(header);
    for (size_t i = 0; i < num_instructions; i++)
    {
        instruction_t i1 = instructions[i];
        instruction_t i2 = instructions[i];
        instruction_t i3 = instructions[i];
        instruction_t instruction = instructions[i];
        // NOTE(review): d, a, b, c and imm_b are never declared or loaded
        // from 'instruction', and the encoded words below are discarded
        // instead of being stored into instruction_bin — the encoding
        // loop is unfinished.
        switch (instruction.type)
        {
            case INSTRUCTION_FFMA: FFMA(d, a, b, c, FFMA_FTZ); break;
            case INSTRUCTION_FMUL: FMUL(d, a, b, FMUL_FTZ); break;
            case INSTRUCTION_FADD: FADD(d, a, b, FADD_FTZ); break;
            case INSTRUCTION_FFMA20I: FFMA20I(d, a, imm_b, c, FFMA_FTZ); break;
            case INSTRUCTION_FMUL20I: FMUL20I(d, a, imm_b, FMUL_FTZ); break;
            case INSTRUCTION_FADD20I: FADD20I(d, a, imm_b, FADD_FTZ); break;
            case INSTRUCTION_FADD20I_ABS_A: FADD20I(d, a, imm_b, FADD_FTZ|FADD_ABS_A); break;
            case INSTRUCTION_FMIN: FMIN(d, a, b, FMNMX_FTZ); break;
            case INSTRUCTION_FMAX: FMAX(d, a, b, FMNMX_FTZ); break;
            // NOTE(review): emit_subtract uses FMAX_NEG_B to compute
            // max(d_left, -d_right), but this encodes FMIN — looks like
            // it should be FMAX; confirm before enabling this path.
            case INSTRUCTION_FMAX_NEG_B: FMIN(d, a, b, FMNMX_FTZ|FMNMX_NEG_B); break;
            case INSTRUCTION_SQRT: MUFU_SQRT(d, a); break;
            default: assert(false && "Unknown instruction type");
        }
    }
    // NOTE(review): this copies the footer over the START of the buffer,
    // clobbering the header — presumably it was meant to go at
    // cubin + sizeof(header) + <size of emitted code>.
    memcpy(cubin, footer, sizeof(footer));

    assert(cubin);
    // NOTE(review): missing 'return cubin;' and *length is never written.
}
166 | #endif
167 |
--------------------------------------------------------------------------------
/src/sass_6_x/blocks.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | namespace backend_sass {
4 |
5 | #define CLEAR() memset(&block->instructions[block->num_instructions], 0, sizeof(instruction_t))
6 | #define TYPE(Expression) block->instructions[block->num_instructions].type = INSTRUCTION_##Expression
7 | #define RA(Expression) block->instructions[block->num_instructions].a = REGISTER_##Expression
8 | #define RB(Expression) block->instructions[block->num_instructions].b = REGISTER_##Expression
9 | #define RC(Expression) block->instructions[block->num_instructions].c = REGISTER_##Expression
10 | #define RD(Expression) block->instructions[block->num_instructions].d = REGISTER_##Expression
11 | #define STALL(Expression) block->instructions[block->num_instructions].stall = Expression;
12 | #define IMMB(Expression) block->instructions[block->num_instructions].imm_b = Expression;
13 | #define NEXT() block->num_instructions++; assert(block->num_instructions <= MAX_INSTRUCTIONS_PER_BLOCK);
14 |
15 | #if 0 // sequential transform code
16 | // (x,y,z) = R_root_to_this*((x0,y0,z0) - T_this_rel_root)
17 | // = Rz(rz)*Ry(ry)*Rx(rx)*((x0-tx, y0-ty, z0-tz))
// (x,y,z) = R_root_to_this*((x0,y0,z0) - T_this_rel_root)
//         = Rz(rz)*Ry(ry)*Rx(rx)*((x0-tx, y0-ty, z0-tz))
// Reference implementation kept under #if 0: translates, then applies
// the three Euler rotations one axis at a time, using W as scratch.
void emit_transform(instruction_block_t *block, frep_mat3_t R_root_to_this, frep_vec3_t T_this_rel_root)
{
    // Convert to final rotation into euler angles
    // (need less registers to do three sequential
    //  euler rotations, than a full 3x3 matrix multiply, I think...?)
    float rx,ry,rz;
    frep_so3_to_ypr(R_root_to_this, &rz, &ry, &rx);
    float tx = T_this_rel_root[0];
    float ty = T_this_rel_root[1];
    float tz = T_this_rel_root[2];
    float cx = cosf(rx); float sx = sinf(rx);
    float cy = cosf(ry); float sy = sinf(ry);
    float cz = cosf(rz); float sz = sinf(rz);
    // translate:
    CLEAR(); TYPE(FADD20I); RD(X); RA(X0); IMMB(-tx); NEXT(); // FADD x, x0, (-tx)
    CLEAR(); TYPE(FADD20I); RD(Y); RA(Y0); IMMB(-ty); NEXT(); // FADD y, y0, (-ty)
    CLEAR(); TYPE(FADD20I); RD(Z); RA(Z0); IMMB(-tz); NEXT(); // FADD z, z0, (-tz)
    // rotate_x: x=x, y=c*y - s*z, z=s*y + c*z
    CLEAR(); TYPE(FMUL20I); RD(W); RA(Y); IMMB(+sx); NEXT();          // FMUL w, y, (s)
    CLEAR(); TYPE(FMUL20I); RD(Y); RA(Y); IMMB(+cx); NEXT();          // FMUL y, y.reuse, (c)
    CLEAR(); TYPE(FFMA20I); RD(Y); RA(Z); IMMB(-sx); RC(Y); NEXT();   // FFMA y, z, (-s), y
    CLEAR(); TYPE(FFMA20I); RD(Z); RA(Z); IMMB(+cx); RC(W); NEXT();   // FFMA z, z.reuse, (c), w
    // rotate_y: x=c*x + s*z, y=y, z=-s*x + c*z
    CLEAR(); TYPE(FMUL20I); RD(W); RA(X); IMMB(-sy); NEXT();          // FMUL w, x, (-s)
    CLEAR(); TYPE(FMUL20I); RD(X); RA(X); IMMB(+cy); NEXT();          // FMUL x, x.reuse, (c)
    CLEAR(); TYPE(FFMA20I); RD(X); RA(Z); IMMB(+sy); RC(X); NEXT();   // FFMA x, z, (s), x
    CLEAR(); TYPE(FFMA20I); RD(Z); RA(Z); IMMB(+cy); RC(W); NEXT();   // FFMA z, z.reuse, (c), w
    // rotate_z: x=c*x - s*y, y=s*x + c*y, z=z
    CLEAR(); TYPE(FMUL20I); RD(W); RA(X); IMMB(+sz); NEXT();          // FMUL w, x, (s)
    CLEAR(); TYPE(FMUL20I); RD(X); RA(X); IMMB(+cz); NEXT();          // FMUL x, x.reuse, (c)
    CLEAR(); TYPE(FFMA20I); RD(X); RA(Y); IMMB(-sz); RC(X); NEXT();   // FFMA x, y, (-s), x
    CLEAR(); TYPE(FFMA20I); RD(Y); RA(Y); IMMB(+cz); RC(W); NEXT();   // FFMA y, y.reuse, (c), w
}
51 | #else
// Emits instructions that transform the root-frame point (x0,y0,z0)
// into this node's local frame, leaving the result in registers X,Y,Z.
void emit_transform(instruction_block_t *block, frep_mat3_t R/*_root_to_this*/, frep_vec3_t T/*_this_rel_root*/)
{
    // This path is a stall-count optimized version of the above.
    // The generated code computes the following:
    // (x,y,z) = R_root_to_this*((x0,y0,z0) - T_this_rel_root)
    //       x = R00*(x0-Tx) + R01*(y0-Ty) + R02*(z0-Tz)
    //         = R00*x0 + R01*y0 + R02*z0 + (-R00*Tx - R01*Ty - R02*Tz)
    //         = R00*x0 + R01*y0 + R02*z0 + dx
    // etc...

    // Fold the rotated translation into one constant per axis, so the
    // whole transform becomes 3 immediate loads + 9 FFMAs. The trailing
    // numbers in the comments are the STALL cycle counts.
    float dx = -(R.at(0,0)*T[0] + R.at(0,1)*T[1] + R.at(0,2)*T[2]);
    float dy = -(R.at(1,0)*T[0] + R.at(1,1)*T[1] + R.at(1,2)*T[2]);
    float dz = -(R.at(2,0)*T[0] + R.at(2,1)*T[1] + R.at(2,2)*T[2]);

    CLEAR(); TYPE(FADD20I); RD(X); RA(RZ); IMMB(dx); STALL(1); NEXT();               // 1 FADD x, RZ, dx
    CLEAR(); TYPE(FADD20I); RD(Y); RA(RZ); IMMB(dy); STALL(1); NEXT();               // 1 FADD y, RZ, dy
    CLEAR(); TYPE(FADD20I); RD(Z); RA(RZ); IMMB(dz); STALL(4); NEXT();               // 4 FADD z, RZ, dz
    CLEAR(); TYPE(FFMA20I); RD(X); RA(X0); IMMB(R.at(0,0)); RC(X); STALL(1); NEXT(); // 1 FFMA x, x0, (R00), x // Q) Why not have dx here?
    CLEAR(); TYPE(FFMA20I); RD(Y); RA(X0); IMMB(R.at(1,0)); RC(Y); STALL(1); NEXT(); // 1 FFMA y, x0, (R10), y
    CLEAR(); TYPE(FFMA20I); RD(Z); RA(X0); IMMB(R.at(2,0)); RC(Z); STALL(4); NEXT(); // 4 FFMA z, x0, (R20), z
    CLEAR(); TYPE(FFMA20I); RD(X); RA(Y0); IMMB(R.at(0,1)); RC(X); STALL(1); NEXT(); // 1 FFMA x, y0, (R01), x
    CLEAR(); TYPE(FFMA20I); RD(Y); RA(Y0); IMMB(R.at(1,1)); RC(Y); STALL(1); NEXT(); // 1 FFMA y, y0, (R11), y
    CLEAR(); TYPE(FFMA20I); RD(Z); RA(Y0); IMMB(R.at(2,1)); RC(Z); STALL(4); NEXT(); // 4 FFMA z, y0, (R21), z
    CLEAR(); TYPE(FFMA20I); RD(X); RA(Z0); IMMB(R.at(0,2)); RC(X); STALL(1); NEXT(); // 1 FFMA x, z0, (R02), x
    CLEAR(); TYPE(FFMA20I); RD(Y); RA(Z0); IMMB(R.at(1,2)); RC(Y); STALL(1); NEXT(); // 1 FFMA y, z0, (R12), y
    CLEAR(); TYPE(FFMA20I); RD(Z); RA(Z0); IMMB(R.at(2,2)); RC(Z); STALL(4); NEXT(); // 4 FFMA z, z0, (R22), z
}
79 | #endif
80 |
// cylinder: max(sqrt(x*x + z*z) - R, abs(y)-H)
// Emits the transform followed by the cylinder distance; result in D.
void emit_cylinder(instruction_block_t *block, frep_mat3_t R, frep_vec3_t T, float r, float h)
{
    emit_transform(block, R, T);
    CLEAR(); TYPE(FMUL); RD(W); RA(X); RB(X); NEXT();            // FMUL w, x, x
    CLEAR(); TYPE(FFMA); RD(W); RA(Z); RB(Z); RC(W); NEXT();     // FFMA w, z, z, w
    // NOTE(review): RB(W) is set on the SQRT although it reads one
    // operand — presumably ignored by the encoder; confirm.
    CLEAR(); TYPE(SQRT); RD(W); RA(W); RB(W); NEXT();            // SQRT w, w
    CLEAR(); TYPE(FADD20I_ABS_A); RD(Y); RA(Y); IMMB(-h); NEXT(); // FADD y, |y|, -H
    CLEAR(); TYPE(FADD20I); RD(W); RA(W); IMMB(-r); NEXT();      // FADD w, w, -R
    CLEAR(); TYPE(FMAX); RD(D); RA(W); RB(Y); NEXT();            // FMAX d, w, y
}
92 |
// sphere: sqrt(x*x + y*y + z*z) - R
// Emits the sphere distance; result in D.
void emit_sphere(instruction_block_t *block, frep_mat3_t R, frep_vec3_t T, float r)
{
#if 1
    // Fast path: applies only the translation part of the transform.
    // Skipping the rotation R is valid here because a sphere is
    // rotationally symmetric about its own center.
    CLEAR(); TYPE(FADD20I); RD(X); RA(X0); IMMB(-T[0]); STALL(1); NEXT(); // 1 FADD x, x0, (-tx)
    CLEAR(); TYPE(FADD20I); RD(Y); RA(Y0); IMMB(-T[1]); STALL(1); NEXT(); // 1 FADD y, y0, (-ty)
    CLEAR(); TYPE(FADD20I); RD(Z); RA(Z0); IMMB(-T[2]); STALL(4); NEXT(); // 4 FADD z, z0, (-tz)
    CLEAR(); TYPE(FMUL); RD(W); RA(X); RB(X); NEXT();                     // 6 FMUL w, x, x
    CLEAR(); TYPE(FFMA); RD(W); RA(Y); RB(Y); RC(W); NEXT();              // 6 FFMA w, y, y, w
    CLEAR(); TYPE(FFMA); RD(W); RA(Z); RB(Z); RC(W); NEXT();              // 6 FFMA w, z, z, w
    CLEAR(); TYPE(SQRT); RD(W); RA(W); RB(W); NEXT();                     // 8 SQRT w, w
    CLEAR(); TYPE(FADD20I); RD(D); RA(W); IMMB(-r); NEXT();               // 6 FADD d, w, -R
#else
    // General path: full transform (rotation + translation).
    emit_transform(block, R, T);
    CLEAR(); TYPE(FMUL); RD(W); RA(X); RB(X); NEXT();         // FMUL w, x, x
    CLEAR(); TYPE(FFMA); RD(W); RA(Y); RB(Y); RC(W); NEXT();  // FFMA w, y, y, w
    CLEAR(); TYPE(FFMA); RD(W); RA(Z); RB(Z); RC(W); NEXT();  // FFMA w, z, z, w
    CLEAR(); TYPE(SQRT); RD(W); RA(W); RB(W); NEXT();         // SQRT w, w
    CLEAR(); TYPE(FADD20I); RD(D); RA(W); IMMB(-r); NEXT();   // FADD d, w, -R
#endif
}
114 |
// Exact box distance (see FREP_BOX in frep_eval.h) — not implemented
// for this backend yet; use fBoxCheap / emit_box_cheap instead.
void emit_box(instruction_block_t *block, frep_mat3_t R, frep_vec3_t T, float bx, float by, float bz)
{
    assert(false && "fBox is not implemented yet");
}
119 |
// box: max(max(|x|-wx, |y|-wy), |z|-wz)
// Cheap box bound (matches FREP_BOX_CHEAP in frep_eval.h); result in D.
void emit_box_cheap(instruction_block_t *block, frep_mat3_t R, frep_vec3_t T, float bx, float by, float bz)
{
    emit_transform(block, R, T);
    CLEAR(); TYPE(FADD20I_ABS_A); RD(X); RA(X); IMMB(-bx); STALL(1); NEXT(); // 1 FADD x, |x|, -wx
    CLEAR(); TYPE(FADD20I_ABS_A); RD(Y); RA(Y); IMMB(-by); STALL(1); NEXT(); // 1 FADD y, |y|, -wy
    CLEAR(); TYPE(FADD20I_ABS_A); RD(Z); RA(Z); IMMB(-bz); STALL(5); NEXT(); // 5 FADD z, |z|, -wz
    CLEAR(); TYPE(FMAX); RD(W); RA(X); RB(Y); NEXT();                        // 6 FMAX w, x, y
    CLEAR(); TYPE(FMAX); RD(D); RA(W); RB(Z); NEXT();                        // 6 FMAX d, w, z
}
130 |
// Half-space along local x: d = x - px (after the transform); result in D.
void emit_plane(instruction_block_t *block, frep_mat3_t R, frep_vec3_t T, float px)
{
#if 0
    // optimized version: only the x row of the rotation is needed for a
    // plane, so the transform collapses to one FMUL + two FFMA + one FADD.
    float rx,ry,rz;
    frep_so3_to_ypr(R, &rz, &ry, &rx);
    float cx = cosf(rx); float sx = sinf(rx);
    float cy = cosf(ry); float sy = sinf(ry);
    float cz = cosf(rz); float sz = sinf(rz);
    float rtx = -((cy*cz)*T[0] + (cz*sx*sy - cx*sz)*T[1] + (sx*sz + cx*cz*sy)*T[2]);

    CLEAR(); TYPE(FMUL20I); RD(X); RA(X0); IMMB((cy*cz)); NEXT();                  // 6 FMUL x, x0, (cy*cz)
    CLEAR(); TYPE(FFMA20I); RD(X); RA(Y0); IMMB((cz*sx*sy-cx*sz)); RC(X); NEXT();  // 6 FFMA x, y0, (cz*sx*sy-cx*sz), x
    CLEAR(); TYPE(FFMA20I); RD(X); RA(Z0); IMMB((sx*sz + cx*cz*sy)); RC(X); NEXT(); // 6 FFMA x, z0, (sx*sz + cx*cz*sy), x
    CLEAR(); TYPE(FADD20I); RD(D); RA(X); IMMB(rtx-px); NEXT();                    // 6 FADD d, x, rtx-px
#else
    emit_transform(block, R, T);
    // plane: x - px
    CLEAR(); TYPE(FADD20I); RD(D); RA(X); IMMB(-px); NEXT(); // FADD d, x, -px
#endif
}
152 |
// Boolean combiners: each emits one instruction combining the child
// distances D_LEFT/D_RIGHT into D (registers assigned by _generate_blocks).
// union: min(l,r); intersect: max(l,r); subtract: max(l,-r) via negated b.
void emit_union(instruction_block_t *block) { CLEAR(); TYPE(FMIN); RD(D); RA(D_LEFT); RB(D_RIGHT); NEXT(); }
void emit_intersect(instruction_block_t *block) { CLEAR(); TYPE(FMAX); RD(D); RA(D_LEFT); RB(D_RIGHT); NEXT(); }
void emit_subtract(instruction_block_t *block) { CLEAR(); TYPE(FMAX_NEG_B); RD(D); RA(D_LEFT); RB(D_RIGHT); NEXT(); }
// Linear blend of the child distances; result in D.
void emit_blend(instruction_block_t *block, float alpha)
{
    // blend: alpha*d_left + (1-alpha)*d_right
    CLEAR(); TYPE(FMUL20I); RD(D); RA(D_LEFT); IMMB(alpha); NEXT();              // FMUL d, d_left, (alpha)
    CLEAR(); TYPE(FFMA20I); RD(D); RA(D_RIGHT); IMMB(1.0f-alpha); RC(D); NEXT(); // FFMA d, d_right, (1-alpha), d
}
162 |
163 | #undef TYPE
164 | #undef RA
165 | #undef RB
166 | #undef RC
167 | #undef RD
168 | #undef IMMB
169 | #undef NEXT
170 | #undef STALL
171 | #undef CLEAR
172 |
// Recursively walks the frep tree, appending one instruction block per
// node to s. 'destination' is the virtual register index that this
// node's distance value is written to; a boolean node evaluates its
// left child into 'destination' and its right child into 'destination+1',
// then combines them back into 'destination'. The accumulated rotation/
// translation are threaded down so each primitive gets its root-to-node
// transform.
void _generate_blocks(
    instruction_blocks_t *s,
    frep_t *node,
    int destination=0,
    frep_mat3_t R_root_to_parent=frep_identity_3x3,
    frep_vec3_t T_parent_rel_root=frep_null_3x1)
// You can do much smarter register allocation here. The register allocation
// may also need to change if we do smarter scheduling. E.g. block reordering.
{
    assert(node);

    frep_mat3_t R_root_to_this;
    frep_vec3_t T_this_rel_root;
    frep_get_global_transform(node, &R_root_to_this, &T_this_rel_root, R_root_to_parent, T_parent_rel_root);

    if (frep_is_boolean(node))
    {
        assert(node->left);
        assert(node->right);

        // Children first (post-order): their results land in d_left and
        // d_right, which the combiner block then merges into destination.
        int d_left = destination;
        int d_right = destination+1;
        _generate_blocks(s, node->left, d_left, R_root_to_this, T_this_rel_root);
        _generate_blocks(s, node->right, d_right, R_root_to_this, T_this_rel_root);

        instruction_block_t *b = &s->blocks[s->num_blocks++];
        b->num_instructions = 0;
        b->d_left = d_left;
        b->d_right = d_right;
        b->d = destination;
        if (node->opcode == FREP_UNION) emit_union(b);
        else if (node->opcode == FREP_INTERSECT) emit_intersect(b);
        else if (node->opcode == FREP_SUBTRACT) emit_subtract(b);
        else if (node->opcode == FREP_BLEND) emit_blend(b, node->blend.alpha);
        assert(s->num_blocks <= MAX_INSTRUCTION_BLOCKS);
    }
    else if (frep_is_primitive(node))
    {
        // Leaf: one block that evaluates the primitive into 'destination'.
        instruction_block_t *b = &s->blocks[s->num_blocks++];
        b->num_instructions = 0;
        frep_mat3_t R = R_root_to_this;
        frep_vec3_t T = T_this_rel_root;
        b->d = destination;
        if (node->opcode == FREP_BOX) emit_box(b, R, T, node->box.width, node->box.height, node->box.depth);
        else if (node->opcode == FREP_BOX_CHEAP) emit_box_cheap(b, R, T, node->box.width, node->box.height, node->box.depth);
        else if (node->opcode == FREP_SPHERE) emit_sphere(b, R, T, node->sphere.radius);
        else if (node->opcode == FREP_CYLINDER) emit_cylinder(b, R, T, node->cylinder.radius, node->cylinder.height);
        else if (node->opcode == FREP_PLANE) emit_plane(b, R, T, node->plane.offset);
        assert(s->num_blocks <= MAX_INSTRUCTION_BLOCKS);
    }
    else
    {
        assert(false && "Unexpected node type");
    }
}
228 |
instruction_blocks_t generate_blocks(frep_t *node)
// This function generates a list of instruction blocks that evaluates the
// tree and stores the resulting distance value in register[0]. Each block
// is assigned registers during the recursive tree parsing.
//
// NOTE: the returned blocks point into a function-static array, so the
// result is only valid until the next call and this is not thread-safe.
{
    assert(node);

    static instruction_block_t _blocks[MAX_INSTRUCTION_BLOCKS];
    instruction_blocks_t s = {0};
    s.blocks = _blocks;
    s.num_blocks = 0;

    _generate_blocks(&s, node);

    return s;
}
245 |
246 | }
247 |
--------------------------------------------------------------------------------
/src/sass_6_x/bytecode.h:
--------------------------------------------------------------------------------
#pragma once
#include <stdint.h> // uint8_t, uint32_t, uint64_t
#include <string.h> // memcpy (bit-exact float reinterpretation)

3 | namespace backend_sass {
4 |
//
// Instruction flags
//
// Meaning:
// FTZ   = Flush to zero
// NEG_A = Use negated value of a operand
// NEG_B = Use negated value of b operand
// ABS_A = Use absolute value of a operand
// ABS_B = Use absolute value of b operand
//
// Declared 'static const' so this header can be included from multiple
// translation units without multiple-definition/ODR errors (the original
// plain uint64_t definitions had external linkage).
static const uint64_t FADD_FTZ       = 0x0000100000000000;
static const uint64_t FADD_NEG_A     = 0x0001000000000000;
static const uint64_t FADD_NEG_B     = 0x0000200000000000;
static const uint64_t FADD_ABS_A     = 0x0000400000000000;
static const uint64_t FADD_ABS_B     = 0x0002000000000000;
static const uint64_t FADD32I_FTZ    = 0x0080000000000000;
static const uint64_t FADD32I_ABS_A  = 0x0040000000000000;
static const uint64_t FMUL_FTZ       = 0x0000100000000000;
static const uint64_t FMUL_NEG_B     = 0x0001000000000000;
static const uint64_t FMUL32I_FTZ    = 0x0020000000000000;
static const uint64_t FMNMX_FTZ      = 0x0000100000000000;
static const uint64_t FMNMX_NEG_A    = 0x0001000000000000;
static const uint64_t FMNMX_NEG_B    = 0x0000200000000000;
static const uint64_t FMNMX_ABS_A    = 0x0000400000000000;
static const uint64_t FMNMX_ABS_B    = 0x0002000000000000;
static const uint64_t FFMA_FTZ       = 0x0020000000000000;
static const uint64_t FFMA_NEG_B     = 0x0001000000000000;
static const uint64_t FFMA_NEG_C     = 0x0002000000000000;
32 |
// FADD d, a, b
// d = a+b (register + register)
// Returns the 64-bit encoded instruction word; 'flags' is an OR of the
// FADD_* modifier bits. 'static inline' so the header-defined function
// does not produce duplicate symbols across translation units.
static inline uint64_t FADD(uint8_t d, uint8_t a, uint8_t b, uint64_t flags) {
    uint64_t RD = (uint64_t)(d) << 0;  // destination register field
    uint64_t RA = (uint64_t)(a) << 8;  // first source register field
    uint64_t RB = (uint64_t)(b) << 20; // second source register field
    return 0x5c58000000070000 | flags | RB | RA | RD;
}
41 |
// FADD d, -a, -RZ
// d = -a (negation encoded as an FADD against the zero register)
uint64_t NEG(uint8_t d, uint8_t a, uint64_t flags) {
    // The base word carries RZ (0xff) in the b slot plus the NEG modifiers.
    // todo: why is NEG_B flag set? (-RZ is still zero, so result is unaffected)
    uint64_t word = 0x5c5930000ff70000;
    word |= flags | ((uint64_t)(a) << 8) | (uint64_t)(d);
    return word;
}
50 |
// FADD d, a, b immediate
// d = a+b
// Encodes b's sign bit separately and keeps only the upper 19 bits of the
// exponent/mantissa field (the low 12 mantissa bits are truncated).
uint64_t FADD20I(uint8_t d, uint8_t a, float b, uint64_t flags) {
    // memcpy the 4-byte float bit pattern. The previous *(uint64_t*)&b
    // read 8 bytes from a 4-byte object: out-of-bounds and a strict-aliasing
    // violation (undefined behavior).
    uint32_t b_bits;
    memcpy(&b_bits, &b, sizeof(b_bits));
    uint64_t NEG_B = (b_bits & 0x80000000u) ? 0x0100000000000000 : 0x0;
    uint64_t B = ((uint64_t)((b_bits & 0x7FFFF000u) >> 12)) << 20;
    uint64_t RA = (uint64_t)(a) << 8;
    uint64_t RD = (uint64_t)(d) << 0;
    return 0x3858000000070000 | flags | NEG_B | B | RA | RD;
}
62 |
// FADD32I d, a, b immediate
// d = a+b
// Full 32-bit immediate form: the entire float (minus sign, encoded separately)
// is placed in the instruction word, so no mantissa precision is lost.
uint64_t FADD32I(uint8_t d, uint8_t a, float b, uint64_t flags) {
    // memcpy the float's bits; *(uint64_t*)&b was an 8-byte read of a
    // 4-byte object (undefined behavior).
    uint32_t b_bits;
    memcpy(&b_bits, &b, sizeof(b_bits));
    uint64_t NEG_B = (b_bits & 0x80000000u) ? 0x0008000000000000 : 0x0;
    uint64_t B = ((uint64_t)(b_bits & 0x7FFFFFFFu)) << 20;
    uint64_t RA = (uint64_t)(a) << 8;
    uint64_t RD = (uint64_t)(d) << 0;
    return 0x0880000000070000 | flags | NEG_B | B | RA | RD;
}
74 |
// FTF.FTZ.F32.F32.FLOOR d, b
// d = floor(b)
uint64_t FLOOR32F(uint8_t d, uint8_t b) {
    return 0x5ca8148000070a00 | ((uint64_t)(b) << 20) | (uint64_t)(d);
}
82 |
// FMUL32I d, a, b immediate
// d = a*b
// Full 32-bit immediate form (no mantissa truncation); sign encoded separately.
uint64_t FMUL32I(uint8_t d, uint8_t a, float b, uint64_t flags) {
    // memcpy the float's bits; *(uint64_t*)&b was an 8-byte read of a
    // 4-byte object (undefined behavior).
    uint32_t b_bits;
    memcpy(&b_bits, &b, sizeof(b_bits));
    uint64_t NEG_B = (b_bits & 0x80000000u) ? 0x0008000000000000 : 0x0;
    uint64_t B = ((uint64_t)(b_bits & 0x7FFFFFFFu)) << 20;
    uint64_t RA = (uint64_t)(a) << 8;
    uint64_t RD = (uint64_t)(d) << 0;
    return 0x1e00000000070000 | flags | NEG_B | B | RA | RD;
}
94 |
// FMUL d, a, b immediate
// d = a*b
// 20-bit immediate form: sign bit encoded separately, low 12 mantissa bits truncated.
uint64_t FMUL20I(uint8_t d, uint8_t a, float b, uint64_t flags) {
    // memcpy the float's bits; *(uint64_t*)&b was an 8-byte read of a
    // 4-byte object (undefined behavior).
    uint32_t b_bits;
    memcpy(&b_bits, &b, sizeof(b_bits));
    uint64_t NEG_B = (b_bits & 0x80000000u) ? 0x0100000000000000 : 0x0;
    uint64_t B = ((uint64_t)((b_bits & 0x7FFFF000u) >> 12)) << 20;
    uint64_t RA = (uint64_t)(a) << 8;
    uint64_t RD = (uint64_t)(d) << 0;
    return 0x3868000000070000 | flags | NEG_B | B | RA | RD;
}
106 |
// FMUL d, a, b
// d = a*b (register-register form)
uint64_t FMUL(uint8_t d, uint8_t a, uint8_t b, uint64_t flags) {
    uint64_t word = 0x5c68000000070000;
    word |= flags;
    word |= (uint64_t)(b) << 20; // RB
    word |= (uint64_t)(a) << 8;  // RA
    word |= (uint64_t)(d) << 0;  // RD
    return word;
}
115 |
// FFMA d, a, b, c
// d = a*b + c (fused multiply-add, register form)
uint64_t FFMA(uint8_t d, uint8_t a, uint8_t b, uint8_t c, uint64_t flags) {
    uint64_t word = 0x5980000000070000;
    word |= flags;
    word |= (uint64_t)(c) << 39; // RC
    word |= (uint64_t)(b) << 20; // RB
    word |= (uint64_t)(a) << 8;  // RA
    word |= (uint64_t)(d) << 0;  // RD
    return word;
}
125 |
// FFMA d, a, b immediate, c
// d = a*b + c
// 20-bit immediate in the b slot (low 12 mantissa bits truncated, sign separate).
uint64_t FFMA20I(uint8_t d, uint8_t a, float b, uint8_t c, uint64_t flags) {
    // memcpy the float's bits; *(uint64_t*)&b was an 8-byte read of a
    // 4-byte object (undefined behavior).
    uint32_t b_bits;
    memcpy(&b_bits, &b, sizeof(b_bits));
    uint64_t NEG_B = (b_bits & 0x80000000u) ? 0x0100000000000000 : 0x0;
    uint64_t B = ((uint64_t)((b_bits & 0x7FFFF000u) >> 12)) << 20;
    uint64_t RC = (uint64_t)(c) << 39;
    uint64_t RA = (uint64_t)(a) << 8;
    uint64_t RD = (uint64_t)(d) << 0;
    return 0x3280000000070000 | flags | NEG_B | RC | B | RA | RD;
}
138 |
// FMNMX d, a, b, !PT
// d = max(a,b)
uint64_t FMAX(uint8_t d, uint8_t a, uint8_t b, uint64_t flags) {
    uint64_t word = 0x5c60078000070000; // !PT predicate selects max
    word |= flags;
    word |= (uint64_t)(b) << 20; // RB
    word |= (uint64_t)(a) << 8;  // RA
    word |= (uint64_t)(d) << 0;  // RD
    return word;
}
147 |
// FMNMX d, a, b, PT
// d = min(a,b)
uint64_t FMIN(uint8_t d, uint8_t a, uint8_t b, uint64_t flags) {
    uint64_t word = 0x5c60038000070000; // PT predicate selects min
    word |= flags;
    word |= (uint64_t)(b) << 20; // RB
    word |= (uint64_t)(a) << 8;  // RA
    word |= (uint64_t)(d) << 0;  // RD
    return word;
}
156 |
// FMNMX d, a, b immediate, !PT
// d = max(a,b)   (the original comment incorrectly said min; !PT selects max,
// consistent with FMAX above and the notes at the bottom of this file)
uint64_t FMAX20I(uint8_t d, uint8_t a, float b, uint64_t flags) {
    // memcpy the float's bits; *(uint64_t*)&b was an 8-byte read of a
    // 4-byte object (undefined behavior).
    uint32_t b_bits;
    memcpy(&b_bits, &b, sizeof(b_bits));
    uint64_t NEG_B = (b_bits & 0x80000000u) ? 0x0100000000000000 : 0x0;
    uint64_t B = ((uint64_t)((b_bits & 0x7FFFF000u) >> 12)) << 20;
    uint64_t RA = (uint64_t)(a) << 8;
    uint64_t RD = (uint64_t)(d) << 0;
    return 0x3860078000070000 | NEG_B | flags | B | RA | RD;
}
168 |
// FMNMX d, a, b immediate, PT
// d = min(a,b)
uint64_t FMIN20I(uint8_t d, uint8_t a, float b, uint64_t flags) {
    // memcpy the float's bits; *(uint64_t*)&b was an 8-byte read of a
    // 4-byte object (undefined behavior).
    uint32_t b_bits;
    memcpy(&b_bits, &b, sizeof(b_bits));
    uint64_t NEG_B = (b_bits & 0x80000000u) ? 0x0100000000000000 : 0x0;
    uint64_t B = ((uint64_t)((b_bits & 0x7FFFF000u) >> 12)) << 20;
    uint64_t RA = (uint64_t)(a) << 8;
    uint64_t RD = (uint64_t)(d) << 0;
    return 0x3860038000070000 | NEG_B | flags | B | RA | RD;
}
180 |
// MUFU.SQRT d, a
// d = sqrt(a) (multi-function unit; variable latency)
uint64_t MUFU_SQRT(uint8_t d, uint8_t a) {
    return 0x5080000000870000 | ((uint64_t)(a) << 8) | (uint64_t)(d);
}
188 |
// NOP: must be issued along with --:-:-:Y:0 control codes (yield set, no stall).
uint64_t NOP() { return 0x50b0000000070f00; }
// RET: must be issued along with --:-:-:-:f control codes (maximum stall).
uint64_t RET() { return 0xe32000000007000f; }
193 |
struct control_flags_t
// Scheduling control fields for one of the three instructions that share a
// 64-bit control word (packed by CTRL() as watdb:readb:wrtdb:yield:stall [reuse]).
{
    uint8_t reuse; // register reuse cache flags, one bit per operand slot (4 bits)
    uint8_t yield; // 0 = may yield to another warp, 1 = don't (see yield() below)
    uint8_t stall; // cycles to stall after issue (4 bits)
    uint8_t wrtdb; // write dependency barrier index, 7 = none
    uint8_t readb; // read dependency barrier index, 7 = none
    uint8_t watdb; // bitmask of barriers to wait on (6 bits)
};

// Control state for the three instructions of the current control word,
// mutated by the setters below and packed by CTRL().
static control_flags_t ctrl[3];
205 |
// watdb:readb:wrtdb:yield:stall [reuse]
// The setters below edit the control fields of instruction `op` (0..2).
// Read and write barriers are numbered 1...6 by callers and stored zero-based.
void wait_on_barrier(uint8_t op, uint8_t barrier_number) {
    // watdb is a bitmask, so an instruction can wait on several barriers.
    ctrl[op].watdb |= (1 << (barrier_number-1));
}
void set_write_barrier(uint8_t op, uint8_t barrier_number) {
    ctrl[op].wrtdb = barrier_number-1;
}
void set_read_barrier(uint8_t op, uint8_t barrier_number) {
    ctrl[op].readb = barrier_number-1;
}
void yield(uint8_t op) { // enables yield on instruction number op
    ctrl[op].yield = 0; // zero means enable
}
void stall(uint8_t op, uint8_t count) {
    ctrl[op].stall = count;
}
void reuse(uint8_t op, bool ra, bool rb, bool rc, bool rd) {
    // One reuse-cache flag per operand slot; LSB corresponds to the a slot.
    ctrl[op].reuse = 0;
    if (ra) ctrl[op].reuse |= 0x1;
    if (rb) ctrl[op].reuse |= 0x2;
    if (rc) ctrl[op].reuse |= 0x4;
    if (rd) ctrl[op].reuse |= 0x8;
}
230 | void reset_ctrl() {
231 | for (int op = 0; op < 3; op++)
232 | {
233 | ctrl[op].watdb = 0x00;
234 | ctrl[op].readb = 7;
235 | ctrl[op].wrtdb = 7;
236 | ctrl[op].yield = 1;
237 | ctrl[op].stall = 0;
238 | }
239 | }
240 | uint64_t CTRL() {
241 | uint64_t ret = 0;
242 | for (int op = 0; op < 3; op++) {
243 | uint64_t stall = (((uint64_t)ctrl[op].stall) & 0x0f) << 0;
244 | uint64_t yield = (((uint64_t)ctrl[op].yield) & 0x01) << 4;
245 | uint64_t wrtdb = (((uint64_t)ctrl[op].wrtdb) & 0x07) << 5;
246 | uint64_t readb = (((uint64_t)ctrl[op].readb) & 0x07) << 8;
247 | uint64_t watdb = (((uint64_t)ctrl[op].watdb) & 0x3f) << 11;
248 | uint64_t reuse = (((uint64_t)ctrl[op].reuse) & 0x0f) << 17;
249 | uint64_t ctrl = reuse|watdb|readb|wrtdb|yield|stall;
250 | ret |= ctrl << (op*21);
251 | }
252 | return ret;
253 | }
254 |
// Decodes and prints one 17-bit control segment in the conventional
// watdb:readb:wrtdb:yield:stall notation (e.g. "--:-:1:Y:6").
// Barrier indices print 1-based; '-' means "no dependency".
void print_ctrl_segment(uint64_t x) {
    uint8_t stall = (uint8_t)((x & 0x0000f) >> 0);
    uint8_t yield = (uint8_t)((x & 0x00010) >> 4);
    uint8_t wrtdb = (uint8_t)((x & 0x000e0) >> 5); // 7 = no dependency
    uint8_t readb = (uint8_t)((x & 0x00700) >> 8); // 7 = no dependency
    uint8_t watdb = (uint8_t)((x & 0x1f800) >> 11);
    if (watdb) printf("%02x:", watdb); else printf("--:");
    if (readb==7) printf("-:"); else printf("%d:", readb+1);
    if (wrtdb==7) printf("-:"); else printf("%d:", wrtdb+1);
    if (yield) printf("-:"); else printf("Y:");
    printf("%x", stall);
}
267 |
268 | void print_ctrl(uint64_t x) {
269 | uint64_t ctrl1 = (x & 0x000000000001ffff) >> 0;
270 | uint64_t ctrl2 = (x & 0x0000003fffe00000) >> 21;
271 | uint64_t ctrl3 = (x & 0x07fffc0000000000) >> 42;
272 | uint64_t reuse1 = (x & 0x00000000001e0000) >> 17;
273 | uint64_t reuse2 = (x & 0x000003c000000000) >> 38;
274 | uint64_t reuse3 = (x & 0x7800000000000000) >> 59;
275 | print_ctrl_segment(ctrl1); printf(" | ");
276 | print_ctrl_segment(ctrl2); printf(" | ");
277 | print_ctrl_segment(ctrl3);
278 | }
279 |
280 | }
281 |
282 | /*
283 | Notes
284 |
285 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
286 | IMMEDIATE VALUES
287 |
288 | FADD20I, FMUL20I and FFMA20I are immediate versions of their respective instructions,
289 | except the rightmost 12 bits of the single-precision mantissa are masked to zero. If
290 | you need full 23-bit mantissa precision you can use FADD32I and FMUL32I, which encode
291 | the entire float. FFMA does not have a 32-bit immediate version, but it can load from
292 | constant memory.
293 |
294 | *20I appear to be treated the same (flag-wise) as their non-immediate counterparts.
295 |
296 | FMNMX d, a, b, !PT -> MAX(a,b)
297 | FMNMX d, a, b, PT -> MIN(a,b)
298 |
299 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
300 | REGISTER BANKS
301 |
302 | Maxwell has four register banks per thread. The assignment of registers to banks is easy:
303 | Bank = Register number mod 4 (e.g. R0 and R4 are bank0, R3 and R7 are bank3)
304 | On Maxwell and Pascal, instructions can only access one value from each memory bank?
305 |
306 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
307 | REGISTER REUSE
308 |
309 | Maxwell and Pascal have 4 register reuse caches and 4 source operand slots. Each of the
310 | 4 reuse flag bits correspond to one of the 8-byte slots. The LSB in reuse flags controls
311 | the cache for the first source operand slot (a?), while the MSB is for the fourth.
312 | e.g. instruction dst, op0 ("first"), op1, op2, op3 ("last")
313 | e.g. FFMA.FTZ R3, R4, R4, R0.reuse -> has reuse flag 0100
314 | e.g. FFMA.FTZ R3, R4.reuse, R4, R0 -> has reuse flag 0001
315 | */
316 |
317 |
--------------------------------------------------------------------------------
/src/sass_6_x/cubin.h:
--------------------------------------------------------------------------------
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
5 |
struct elf64_hdr_t
// ELF64 file header (Elf64_Ehdr with e_ident split into named fields).
{
    uint8_t magic[4];   // 0x7f 'E' 'L' 'F'
    uint8_t fileClass;  // 2 = 64-bit (asserted by read_cubin)
    uint8_t encoding;   // byte order of the file
    uint8_t fileVersion;
    uint8_t padding[9]; // remainder of e_ident
    uint16_t type;
    uint16_t machine;
    uint32_t version;
    uint64_t entry;
    uint64_t phOffset;  // file offset of the program header table
    uint64_t shOffset;  // file offset of the section header table
    uint32_t flags;     // processor-specific; low byte holds the SM version (see read_cubin)
    uint16_t ehSize;
    uint16_t phEntSize; // size of one program header entry
    uint16_t phNum;     // number of program header entries
    uint16_t shEntSize; // size of one section header entry
    uint16_t shNum;     // number of section header entries
    uint16_t shStrIndx; // section index of the section-name string table
};

struct elf64_prg_hdr_t
// ELF64 program (segment) header (Elf64_Phdr).
{
    uint32_t type;
    uint32_t flags;
    uint64_t offset;
    uint64_t vaddr;
    uint64_t paddr;
    uint64_t fileSize;
    uint64_t memSize;
    uint64_t align;
};

struct elf64_sec_hdr_t
// ELF64 section header (Elf64_Shdr). For kernel code sections the top byte
// of `info` holds the register count (see cubin_function_t::register_count).
{
    uint32_t name;   // offset into .shstrtab
    uint32_t type;   // 2 = symbol table, 3 = string table (used by read_cubin)
    uint64_t flags;
    uint64_t addr;
    uint64_t offset; // file offset of the section's data
    uint64_t size;   // section size in bytes
    uint32_t link;
    uint32_t info;
    uint64_t align;
    uint64_t entSize; // entry size for table-like sections (e.g. symtab)
};

struct elf64_sym_ent_t
// ELF64 symbol table entry (Elf64_Sym). `info & 0x0f` is the symbol type
// (0x02 = function); `value` and `size` give the symbol's offset and byte size.
{
    uint32_t name; // offset into .strtab
    uint8_t info;
    uint8_t other;
    uint16_t shIndx; // index of the section containing the symbol
    uint64_t value;
    uint64_t size;
};
63 |
struct cubin_function_t
// A FUNC symbol in the cubin together with the section holding its code.
// All pointers alias the cubin_t::binary buffer (no ownership): `b` is the
// base of the file image, `h` the section header, `e` the symbol entry.
{
    char *name;
    char *b;
    elf64_sec_hdr_t *h;
    elf64_sym_ent_t *e;

    #if 0
    uint64_t *instructions() { return (uint64_t*)(b + h->offset); }
    int num_instructions() { return (int)(h->size / sizeof(uint64_t)); }
    void set_num_instructions(int n) { assert(n >= 0); h->size = n*sizeof(uint64_t); }
    #else
    // e->value is non-zero if the function is inlined, in which case it describes the
    // byte offset of the first instruction in the containing function's instructions.
    uint64_t *instructions() { return (uint64_t*)(b + h->offset + e->value); }
    int num_instructions() { return (int)(e->size/sizeof(uint64_t)); }
    void set_num_instructions(int n)
    {
        assert(n >= 0);
        assert(e->size == h->size && "The function appears to be an inline function. Changing the size of these is beyond the scope of this program.");
        e->size = ((uint64_t)n)*sizeof(uint64_t);
        h->size = ((uint64_t)n)*sizeof(uint64_t);
    }
    #endif

    // The kernel's register count lives in the top byte of the section's info field.
    uint8_t register_count() { return (h->info & 0xff000000)>>24; }
    // NOTE(review): n<<24 promotes n to signed int, so n >= 128 shifts into the
    // sign bit (undefined behavior); ((uint32_t)n)<<24 would be safe. Unchanged here.
    void set_register_count(uint8_t n) { h->info = (h->info & 0x00ffffff) | (n<<24); }
};
92 |
// Fixed parser capacities (asserted against the ELF header counts in read_cubin).
enum { cubin_max_prg_hdrs = 1024 };
enum { cubin_max_sec_hdrs = 1024 };
enum { cubin_max_functions = 1024 };
struct cubin_t
// An in-memory cubin (ELF64) image plus indices into it. `binary` is the
// heap buffer allocated by read_cubin; every header/function pointer aliases it.
{
    int sizeof_binary;
    char *binary;
    elf64_prg_hdr_t *prg_hdrs[cubin_max_prg_hdrs];
    int num_prg_hdrs;

    elf64_sec_hdr_t *sec_hdrs[cubin_max_sec_hdrs];
    int num_sec_hdrs;

    cubin_function_t functions[cubin_max_functions];
    int num_functions;

    // Linear search for a function symbol by (mangled) name; NULL if absent.
    cubin_function_t *get_function(const char *name)
    {
        for (int i = 0; i < num_functions; i++)
            if (strcmp(functions[i].name, name) == 0)
                return functions + i;
        return NULL;
    }
};
117 |
118 | cubin_t read_cubin(const char *filename)
119 | {
120 | {
121 | uint16_t x = 0xaabb;
122 | uint8_t *p = (uint8_t*)&x;
123 | assert(p[0] == 0xbb && "machine is not little (?) endian");
124 | }
125 |
126 | cubin_t cubin = {0};
127 | {
128 | FILE *f = fopen(filename, "rb");
129 | assert(f);
130 | fseek(f, 0, SEEK_END);
131 | long size = ftell(f);
132 | rewind(f);
133 | char *data = new char[size + 1];
134 | int ok = fread(data, 1, size, f);
135 | assert(ok);
136 | data[size] = 0;
137 | fclose(f);
138 |
139 | cubin.binary = data;
140 | cubin.sizeof_binary = size;
141 | }
142 | assert(cubin.binary);
143 | assert(cubin.sizeof_binary);
144 |
145 | elf64_hdr_t elf_hdr = *(elf64_hdr_t*)cubin.binary;
146 | assert(elf_hdr.fileClass == 2 && "assuming 64-bit ELF");
147 | assert((elf_hdr.flags & 0xff) == 60 && "assuming sm_60 architecture");
148 | assert(elf_hdr.flags & 0x400 && "assuming 64-bit addresses");
149 | assert(elf_hdr.phNum <= cubin_max_prg_hdrs);
150 | assert(elf_hdr.shNum <= cubin_max_sec_hdrs);
151 |
152 | // read program headers
153 | {
154 | char *b = cubin.binary + elf_hdr.phOffset;
155 | for (int i = 0; i < elf_hdr.phNum; i++)
156 | {
157 | cubin.prg_hdrs[cubin.num_prg_hdrs++] = (elf64_prg_hdr_t*)b;
158 | b += elf_hdr.phEntSize;
159 | }
160 | }
161 |
162 | // read section headers
163 | {
164 | char *b = cubin.binary + elf_hdr.shOffset;
165 | for (int i = 0; i < elf_hdr.shNum; i++)
166 | {
167 | cubin.sec_hdrs[cubin.num_sec_hdrs++] = (elf64_sec_hdr_t*)b;
168 | b += elf_hdr.shEntSize;
169 | }
170 | }
171 |
172 |
173 | // find section headers called strtab and shstrtab
174 | char *strtab = NULL;
175 | char *shstrtab = NULL;
176 | for (int i = 0; i < cubin.num_sec_hdrs; i++)
177 | {
178 | elf64_sec_hdr_t *sh = (elf64_sec_hdr_t*)cubin.sec_hdrs[i];
179 | if (sh->type == 3)
180 | {
181 | char *data = cubin.binary + sh->offset;
182 | char *name = data + sh->name;
183 | if (strcmp(name, ".strtab") == 0) strtab = data;
184 | else if (strcmp(name, ".shstrtab") == 0) shstrtab = data;
185 |
186 | printf("found section \"%s\"\ndata (%d bytes): ", name, sh->size);
187 | for (int j = 0; j < sh->size; j++)
188 | printf("%c", data[j] ? data[j] : ' ');
189 | printf("\n\n");
190 | }
191 | #if 0
192 | else
193 | {
194 | char *name = shstrtab + sh->name;
195 | uint8_t *data = (uint8_t*)(cubin.binary + sh->offset);
196 | printf("found section \"%s\" (type=%x)\ndata(%d bytes):", name, sh->type, sh->size);
197 | for (int j = 0; j < sh->size; j++)
198 | printf("%02x ", data[j]);
199 | printf("\n\n");
200 | }
201 | #endif
202 | }
203 | assert(strtab);
204 | assert(shstrtab);
205 |
206 | for (int i = 0; i < cubin.num_sec_hdrs; i++)
207 | {
208 | elf64_sec_hdr_t *sh = cubin.sec_hdrs[i];
209 | if (sh->type == 2) // look for symbol table
210 | {
211 | printf("found symbol table section with these symbols:\n");
212 | char *data = cubin.binary + sh->offset;
213 | uint64_t offset = 0;
214 | while (offset < sh->size) // go through each symbol entry
215 | {
216 | elf64_sym_ent_t *ent = (elf64_sym_ent_t*)(data + offset);
217 | offset += sh->entSize;
218 | char *name = strtab + ent->name;
219 |
220 | if ((ent->info & 0x0f) == 0x02) // look for symbols tagged FUNC
221 | {
222 | printf("(function) \"%s\"\n", name);
223 | assert(cubin.num_functions < cubin_max_functions);
224 | cubin_function_t func = {0};
225 | func.name = name;
226 | func.h = cubin.sec_hdrs[ent->shIndx];
227 | func.b = cubin.binary;
228 | func.e = ent;
229 | cubin.functions[cubin.num_functions++] = func;
230 |
231 | // elf64_sec_hdr_t *ent_sh = cubin.sec_hdrs[ent->shIndx];
232 | // printf("section header \"%s\"\n", strtab + ent_sh->name);
233 | }
234 | else
235 | {
236 | printf("(other) \"%s\"\n", name);
237 | }
238 |
239 | #if 0
240 | printf("\tinfo:0x%x\n", ent->info);
241 | printf("\tother:0x%x\n", ent->other);
242 | printf("\tvalue:0x%llx\n", ent->value);
243 | printf("\tsize:0x%llx (%llu)\n", ent->size, ent->size);
244 | #endif
245 | }
246 | }
247 | }
248 |
249 | printf("\nfound %d functions\n", cubin.num_functions);
250 | for (int i = 0; i < cubin.num_functions; i++)
251 | {
252 | printf("\"%s\"\n", cubin.functions[i].name);
253 | printf("\tRegister count: %d\n", cubin.functions[i].register_count());
254 | printf("\tInstructions:\n");
255 | uint64_t *in = cubin.functions[i].instructions();
256 | int num_instructions = cubin.functions[i].num_instructions();
257 | for (int j = 0; j < 10 && j < num_instructions; j++)
258 | printf("\t0x%016llx\n", in[j]);
259 | if (num_instructions > 10)
260 | printf("\t... (%d more instructions)\n", num_instructions - 10);
261 | }
262 | return cubin;
263 | }
264 |
265 | void save_cubin(cubin_t *cubin, const char *filename)
266 | {
267 | FILE *f = fopen(filename, "wb+");
268 | assert(f);
269 | fwrite(cubin->binary, 1, cubin->sizeof_binary, f);
270 | fclose(f);
271 | }
272 |
--------------------------------------------------------------------------------
/src/sass_6_x/instruction.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | namespace backend_sass {
4 |
enum latency_constants_
{
    // All the 32-bit floating point instructions (except sqrt) take exactly
    // 6 cycles before the result is written to and valid. Subsequent instructions
    // that read from this result must therefore be executed atleast six cycles
    // after the first one began. The scheduler tries to fill the gap between one
    // instruction and one that depends on its results by looking for others that
    // do not depend on its results. We conveniently structure our input code into
    // 'blocks' that are entirely independent from other blocks, but the instructions
    // within a block cannot be reordered. If the scheduler can't find enough
    // instructions to fill the pipeline, it will have to insert 'stalls', which
    // do nothing for a given number of clock cycles.
    LATENCY_X32T = 6,

    // sqrt is a variable latency instruction and needs to set a write barrier
    // which dependent instructions must wait on. The later that instruction
    // actually does the wait, the more likely it is that the sqrt is finished,
    // and the barrier does not incur a stall. We work under the assumption that
    // sqrt finishes after 'LATENCY_SQRT' cycles.
    LATENCY_SQRT = 8,

    // Setting the write barrier takes non-zero clock cycles.
    LATENCY_WRTDB = 1,
};

// The instruction subset this backend emits. *20I variants carry a truncated
// 20-bit float immediate in the b slot; _ABS_A/_NEG_B variants bake an operand
// modifier into the opcode.
enum instruction_type_t
{
    INSTRUCTION_FFMA,
    INSTRUCTION_FMUL,
    INSTRUCTION_FADD,
    INSTRUCTION_FFMA20I,
    INSTRUCTION_FMUL20I,
    INSTRUCTION_FADD20I,
    INSTRUCTION_FADD20I_ABS_A,
    INSTRUCTION_FMIN,
    INSTRUCTION_FMAX,
    INSTRUCTION_FMAX_NEG_B,
    INSTRUCTION_SQRT
};

struct instruction_t
// One abstract instruction: named (virtual) operands before scheduling,
// physical registers and control codes after schedule_blocks has run.
{
    instruction_type_t type;
    named_register_t a,b,c; // source registers ("operands")
    named_register_t d; // destination register
    float imm_b; // immediate value in b-slot

    // filled in by scheduler
    uint8_t ra,rb,rc,rd; // physical register addresses (0-255)
    uint8_t reuse; // register reuse flags
    uint8_t yield; // can relinquish control to other warp or not
    uint8_t stall; // number of cycles to wait before continuing
    uint8_t wrtdb; // write dependencies
    uint8_t readb; // read dependencies
    uint8_t watdb; // wait dependencies
};

enum { MAX_INSTRUCTIONS_PER_BLOCK = 64 };
struct instruction_block_t
// An instruction block is a list of instructions that implements a single basic
// AST opcode, either a primitive or an operator. During code generation (parsing
// the AST), we create a list of instruction blocks, evaluating the AST bottom-up.
// During this, we assign to each block up to three register addresses.
// A destination register, where the output of the block is to be stored, and
// a left- and right-child register (for boolean operators).
{
    instruction_t instructions[MAX_INSTRUCTIONS_PER_BLOCK];
    int num_instructions;
    int d,d_left,d_right; // destination, left-child and right-child register slots
};

enum { MAX_INSTRUCTION_BLOCKS = 128 };
struct instruction_blocks_t
// Non-owning view over an array of instruction blocks.
{
    instruction_block_t *blocks;
    int num_blocks;
};
82 |
// Prints one scheduled instruction as a SASS-like mnemonic followed by its
// control codes in watdb:readb:wrtdb:yield:stall form (e.g. "--:-:1:Y:6").
// Barrier indices print 1-based; '-' means "no dependency".
void print_instruction(instruction_t in)
{
    int n = 0; // printed width, used to align the control-code column
    if (in.type==INSTRUCTION_FFMA) n+=printf("FFMA r%d, r%d , r%d, r%d", in.rd, in.ra, in.rb, in.rc);
    else if (in.type==INSTRUCTION_FMUL) n+=printf("FMUL r%d, r%d , r%d", in.rd, in.ra, in.rb);
    else if (in.type==INSTRUCTION_FADD) n+=printf("FADD r%d, r%d , r%d", in.rd, in.ra, in.rb);
    else if (in.type==INSTRUCTION_FFMA20I) n+=printf("FFMA r%d, r%d , %5.2ff, r%d", in.rd, in.ra, in.imm_b, in.rc);
    else if (in.type==INSTRUCTION_FMUL20I) n+=printf("FMUL r%d, r%d , %5.2ff", in.rd, in.ra, in.imm_b);
    else if (in.type==INSTRUCTION_FADD20I) n+=printf("FADD r%d, r%d , %5.2ff", in.rd, in.ra, in.imm_b);
    else if (in.type==INSTRUCTION_FADD20I_ABS_A) n+=printf("FADD r%d, |r%d|, %5.2ff", in.rd, in.ra, in.imm_b);
    else if (in.type==INSTRUCTION_FMIN) n+=printf("FMIN r%d, r%d , r%d", in.rd, in.ra, in.rb);
    else if (in.type==INSTRUCTION_FMAX) n+=printf("FMAX r%d, r%d , r%d", in.rd, in.ra, in.rb);
    else if (in.type==INSTRUCTION_FMAX_NEG_B) n+=printf("FMAX r%d, -r%d , r%d", in.rd, in.ra, in.rb);
    else if (in.type==INSTRUCTION_SQRT) n+=printf("SQRT r%d, r%d", in.rd, in.ra);
    else assert(false);

    // pad the mnemonic out to column 30
    for (int i = n; i < 30; i++)
        printf(" ");

    if (in.watdb) printf("%02x:", in.watdb); else printf("--:");
    if (in.readb==7) printf("-:"); else printf("%d:", in.readb+1);
    if (in.wrtdb==7) printf("-:"); else printf("%d:", in.wrtdb+1);
    if (in.yield) printf("-:"); else printf("Y:");
    printf("%x", in.stall);
    if (in.reuse)
        printf(" reuse: %s%s%s",
            (in.reuse & 1) ? "a" : " ",
            (in.reuse & 2) ? "b" : " ",
            (in.reuse & 4) ? "c" : " ");
    printf("\n");
}
114 |
115 | }
116 |
--------------------------------------------------------------------------------
/src/sass_6_x/registers.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | namespace backend_sass {
4 |
enum named_register_t
// Virtual register names used by code generation; the scheduler maps them
// to physical register addresses (see register_map in scheduler.h).
{
    // This is used to indicate immediate values
    // Note: this enum must be 0 because we use memset to clear instructions
    NO_REGISTER=0,

    // Input position coordinates
    REGISTER_X0,
    REGISTER_Y0,
    REGISTER_Z0,

    // Temporary calculations
    REGISTER_X,
    REGISTER_Y,
    REGISTER_Z,
    REGISTER_W,

    // Result registers (e.g. f(p))
    REGISTER_D, // result is to be stored here
    REGISTER_D_LEFT, // result from left child in tree is stored here
    REGISTER_D_RIGHT, // result from right child in tree is stored here

    // constant zero
    REGISTER_RZ,
    NUM_NAMED_REGISTERS
};
31 |
32 | }
33 |
--------------------------------------------------------------------------------
/src/sass_6_x/scheduler.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | namespace backend_sass {
4 |
instruction_t *
schedule_blocks(instruction_blocks_t blocks, int *return_num_instructions)
// This function performs physical register allocation and instruction scheduling.
// Register allocation maps the virtual register names used by each instruction to
// physical register addresses (0 to 255). Instruction scheduling makes sure that
// enough clock cycles passes between instructions so that the results are ready.
//
// Returns a pointer to a function-local static buffer: the result is only valid
// until the next call, and the function is not reentrant or thread-safe.
{
    enum { max_instructions = 1024 };
    static instruction_t out[max_instructions];
    int num_out = 0;

    enum { max_registers = 256 };
    enum { num_wait_barriers = 6 };
    enum { max_temp_registers = 24 };

    // Tracks which physical register (if any) each of the six scoreboard
    // barriers currently guards.
    struct wait_barrier_t
    {
        uint8_t barrier_on_register[max_registers]; // barrier index per register; 7 = none
        bool is_barrier_active[num_wait_barriers];
        void init()
        {
            for (int i = 0; i < num_wait_barriers; i++)
                is_barrier_active[i] = false;
            for (int i = 0; i < max_registers; i++)
                barrier_on_register[i] = 7;
        }
        bool is_set(uint8_t reg) { return barrier_on_register[reg] != 7; }
        uint8_t set(uint8_t reg) // return wrtdb flag
        {
            // Claim the first free barrier for this register.
            for (int i = 0; i < num_wait_barriers; i++)
            {
                if (!is_barrier_active[i])
                {
                    uint8_t barrier = (uint8_t)(i);
                    barrier_on_register[reg] = barrier;
                    is_barrier_active[i] = true;
                    return barrier;
                }
            }
            assert(false && "Ran out of wait barriers");
            return 7;
        }
        uint8_t wait(uint8_t reg) // return watdb flag (to be OR'd with current flag)
        {
            uint8_t barrier = barrier_on_register[reg];
            assert(barrier != 7 && "Tried to wait on a register that had no wait barrier set.");
            uint8_t watdb = 1 << barrier;
            is_barrier_active[barrier] = false; // waiting releases the barrier
            barrier_on_register[reg] = 7;
            return watdb;
        }
    };

    static wait_barrier_t wait_barrier;
    wait_barrier.init();

    for (int i = 0; i < blocks.num_blocks; i++)
    {
        int d = blocks.blocks[i].d;
        assert(d < max_temp_registers);
        int d_left = blocks.blocks[i].d_left;
        int d_right = blocks.blocks[i].d_right;

        // Map virtual register names to physical addresses. Registers
        // 0x00-0x06 are fixed; the block's result registers are placed at
        // 0x07 + slot. 0xff stands for RZ (constant zero).
        static uint8_t register_map[NUM_NAMED_REGISTERS] = {0};
        register_map[NO_REGISTER] = 0xff;
        register_map[REGISTER_X0] = 0x00;
        register_map[REGISTER_Y0] = 0x01;
        register_map[REGISTER_Z0] = 0x02;
        register_map[REGISTER_X] = 0x03;
        register_map[REGISTER_Y] = 0x04;
        register_map[REGISTER_Z] = 0x05;
        register_map[REGISTER_W] = 0x06;
        register_map[REGISTER_D] = 0x07 + d;
        register_map[REGISTER_D_LEFT] = 0x07 + d_left;
        register_map[REGISTER_D_RIGHT] = 0x07 + d_right;
        register_map[REGISTER_RZ] = 0xff;

        for (int j = 0; j < blocks.blocks[i].num_instructions; j++)
        {
            instruction_t *in = &blocks.blocks[i].instructions[j];
            in->ra = register_map[in->a];
            in->rb = register_map[in->b];
            in->rc = register_map[in->c];
            in->rd = register_map[in->d];
            in->reuse = 0;
            in->watdb = 0;
            in->readb = 7;
            in->wrtdb = 7;
            in->yield = 0;
            // If a pending variable-latency write targets one of our source
            // registers, wait on (and release) its barrier first.
            if (in->a != NO_REGISTER && wait_barrier.is_set(in->ra)) { in->watdb |= wait_barrier.wait(in->ra); }
            if (in->b != NO_REGISTER && wait_barrier.is_set(in->rb)) { in->watdb |= wait_barrier.wait(in->rb); }
            if (in->c != NO_REGISTER && wait_barrier.is_set(in->rc)) { in->watdb |= wait_barrier.wait(in->rc); }

            // If the instruction doesn't have a stall count set already
            // we set it to the latency of the instruction.
            if (in->stall == 0)
            {
                if (in->type == INSTRUCTION_SQRT) in->stall = 1+LATENCY_WRTDB;
                else in->stall = LATENCY_X32T;
            }

            // sqrt has variable latency, so its readers synchronize on a barrier.
            if (in->type == INSTRUCTION_SQRT) in->wrtdb = wait_barrier.set(in->rd);

            // simple reuse tactic: flag an operand slot for the reuse cache when
            // the previous instruction read the same register and didn't write it
#if 1
            if (j > 0)
            {
                instruction_t *last = &blocks.blocks[i].instructions[j-1];
                if (last->a != NO_REGISTER && last->ra == in->ra && last->rd != in->ra) in->reuse |= 1 << 0;
                if (last->b != NO_REGISTER && last->rb == in->rb && last->rd != in->rb) in->reuse |= 1 << 1;
                if (last->c != NO_REGISTER && last->rc == in->rc && last->rd != in->rc) in->reuse |= 1 << 2;
            }
#endif

            out[num_out++] = *in;
            assert(num_out <= max_instructions);
        }
    }

    *return_num_instructions = num_out;
    return out;
}
127 |
128 | }
129 |
--------------------------------------------------------------------------------
/src/sass_6_x/simulator.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | namespace backend_sass {
4 |
5 | struct sass_simulator_t
6 | {
7 | bool debug;
8 | int t;
9 | float reg[256];
10 |
11 | // writes in progress
12 | struct job_t
13 | {
14 | uint8_t dst;
15 | float val;
16 | int t_write;
17 | };
18 | enum { max_write_jobs = 1024 };
19 | job_t writes[max_write_jobs];
20 | int num_writes_waiting;
21 |
22 | // barriers
23 | enum { num_write_barriers = 6 };
24 | int register_on_barrier[num_write_barriers];
25 |
    // Resets the simulator. `_debug` enables diagnostic prints
    // (read-before-write conflicts, barrier waits).
    void init(bool _debug)
    {
        reg[REGISTER_RZ] = 0.0f;
        num_writes_waiting = 0;
        t = 0;
        debug = _debug;
        for (int i = 0; i < num_write_barriers; i++)
            register_on_barrier[i] = -1; // -1 = barrier unused
    }
    // Advances simulated time by `cycles` and retires any in-flight writes
    // that complete by the new time (clearing their write barriers).
    void _step(int cycles)
    {
        t += cycles;
        for (int i = 0; i < num_writes_waiting; i++)
        {
            if (t >= writes[i].t_write)
            {
                reg[writes[i].dst] = writes[i].val;

                // if a write barrier was set on the register we can take it down
                for (int j = 0; j < 6; j++)
                {
                    if (register_on_barrier[j] == writes[i].dst)
                        register_on_barrier[j] = -1;
                }

                // swap-remove; re-examine the element swapped into slot i
                writes[i] = writes[--num_writes_waiting];
                i--;
            }
        }
    }
    // Guards register `reg` with scoreboard barrier `barrier`.
    void _set_write_barrier(uint8_t reg, uint8_t barrier)
    {
        assert(barrier >= 0 && barrier <= num_write_barriers-1);
        assert(register_on_barrier[barrier] == -1 && "overwrote an existing write barrier.");
        register_on_barrier[barrier] = reg;
    }
    // Advances time until the write guarded by `barrier` has landed.
    // A cleared barrier (-1) is a no-op.
    void _wait_on_barrier(uint8_t barrier)
    {
        if (register_on_barrier[barrier] == -1)
            return;
        assert(barrier >= 0 && barrier <= num_write_barriers-1);
        bool resolved = false;
        for (int i = 0; i < num_writes_waiting; i++)
        {
            if (writes[i].dst == (uint8_t)register_on_barrier[barrier])
            {
                int t_to_wait = writes[i].t_write - t;
                if (t_to_wait > 0)
                {
                    if (debug) printf("waited %d cycles on barrier\n", t_to_wait);
                    _step(t_to_wait);
                }
                resolved = true;
                register_on_barrier[barrier] = -1;
            }
        }
        assert(resolved && "waited on a barrier which is not resolved by any on-going writes.");
    }
    // Reads a register; in debug mode reports reads that race a pending write.
    float _read_reg(uint8_t src)
    {
        for (int i = 0; i < num_writes_waiting; i++)
            if (writes[i].dst == src && debug)
                printf("read-before-write conflict on r%d\n", src);
        return reg[src];
    }
    // Queues a register write that becomes visible `latency` cycles from now.
    void _write_reg(uint8_t dst, float val, int latency)
    {
        assert(num_writes_waiting+1 <= max_write_jobs);
        writes[num_writes_waiting].dst = dst;
        writes[num_writes_waiting].val = val;
        writes[num_writes_waiting].t_write = t + latency;
        num_writes_waiting++;
    }
// Execute one scheduled SASS instruction against the simulated register
// file: honor its wait-barrier mask, arm its write barrier, compute the
// result with the opcode's write-back latency, then advance the clock by
// the scheduler-assigned stall count.
void execute(instruction_t in)
{
    using namespace backend_sass;
    // 20-bit-immediate opcodes take operand b from the instruction word
    // rather than from a register.
    bool is_immediate =
        in.type == INSTRUCTION_FFMA20I ||
        in.type == INSTRUCTION_FMUL20I ||
        in.type == INSTRUCTION_FADD20I ||
        in.type == INSTRUCTION_FADD20I_ABS_A;

    // watdb is a 6-bit mask: wait on each flagged dependency barrier
    if (in.watdb)
    {
        if (in.watdb & 1) _wait_on_barrier(0);
        if (in.watdb & 2) _wait_on_barrier(1);
        if (in.watdb & 4) _wait_on_barrier(2);
        if (in.watdb & 8) _wait_on_barrier(3);
        if (in.watdb & 16) _wait_on_barrier(4);
        if (in.watdb & 32) _wait_on_barrier(5);
    }

    // wrtdb == 7 appears to encode "no write barrier" -- TODO confirm
    // against the encoder in sass_6_x/bytecode.h
    if (in.wrtdb != 7) _set_write_barrier(in.rd, in.wrtdb);

    float a = _read_reg(in.ra);
    float b = is_immediate ? in.imm_b : _read_reg(in.rb);
    float c = _read_reg(in.rc);

    // Compute the result and select the write-back latency per opcode.
    float d;
    int lat;
    if (in.type==INSTRUCTION_FFMA) { lat = LATENCY_X32T; d = a*b + c; }
    else if (in.type==INSTRUCTION_FMUL) { lat = LATENCY_X32T; d = a*b; }
    else if (in.type==INSTRUCTION_FADD) { lat = LATENCY_X32T; d = a + b; }
    else if (in.type==INSTRUCTION_FFMA20I) { lat = LATENCY_X32T; d = a*b + c; }
    else if (in.type==INSTRUCTION_FMUL20I) { lat = LATENCY_X32T; d = a*b; }
    else if (in.type==INSTRUCTION_FADD20I) { lat = LATENCY_X32T; d = a + b; }
    else if (in.type==INSTRUCTION_FADD20I_ABS_A) { lat = LATENCY_X32T; d = fabsf(a) + b; }
    else if (in.type==INSTRUCTION_FMIN) { lat = LATENCY_X32T; d = (a < b) ? a : b; }
    else if (in.type==INSTRUCTION_FMAX) { lat = LATENCY_X32T; d = (a > b) ? a : b; }
    else if (in.type==INSTRUCTION_FMAX_NEG_B) { lat = LATENCY_X32T; d = (a > -b) ? a : -b; }
    else if (in.type==INSTRUCTION_SQRT) { lat = LATENCY_SQRT; d = sqrtf(a); }
    else assert(false && "unhandled instruction");

    _write_reg(in.rd, d, lat);
    // consume the stall cycles encoded by the scheduler
    _step(in.stall);

    if (debug) print_instruction(in);
}
144 | };
145 |
146 | }
147 |
--------------------------------------------------------------------------------
/test/backend_glsl.cpp:
--------------------------------------------------------------------------------
1 | #include "../src/backend_glsl.h"
2 | #include "../src/frep_builder.h"
3 |
4 | int main() {
5 | frep_t *f = fBoxCheap(1.0f, 0.5f, 0.25f);
6 | f = fOpUnion(f, fBox(2.0f, 1.0f, 1.0f));
7 | char *s = frep_compile_to_glsl(f);
8 | printf("%s\n", s);
9 | }
10 |
--------------------------------------------------------------------------------
/test/backend_ptx.cpp:
--------------------------------------------------------------------------------
1 | // Example compilation instructions for Linux, g++:
2 | // (Replace include directory with your installation and version of CUDA)
3 | // $ g++ -std=c++11 backend_ptx.cpp -I/usr/local/cuda-10.1/include -lcuda
4 |
5 | #include
6 | #include
7 | #include
8 | #include "util/cuda_error.h"
9 | #include "util/init_cuda.h"
10 |
11 | #define PTX_FP20_IMMEDIATE
12 | #include "../src/frep.h"
13 | #include "../src/frep_eval.h"
14 | #include "../src/frep_builder.h"
15 | #include "../src/backend_ptx.h"
16 |
17 | // This generates a PTX program equivalent to:
18 | // float tree(float x, float y, float z) {
19 | // // generated PTX instructions
20 | // }
21 | // void main(vec4 *input, float *output) {
22 | // int tid = threadIdx.x + blockDim.x*blockIdx.x;
23 | // vec4 p = input[tid];
24 | // output[tid] = tree(p.x, p.y, p.z);
25 | // }
26 | // Note: out_length _DOES NOT_ include the null-terminator.
27 | char *generate_ptx_program(frep_t *f, size_t *out_length)
28 | {
29 | const char *ptx_template = R"str(
30 | .version 6.0
31 | .target sm_60
32 | .address_size 64
33 | .func (.reg.f32 f%d) tree(.reg.f32 x0, .reg.f32 y0, .reg.f32 z0) {
34 | .reg.f32 f<%d>;
35 | %s
36 | ret.uni;
37 | }
38 | .visible.entry main(.param.u64 param0, .param.u64 param1) {
39 | .reg.f32 x0;
40 | .reg.f32 y0;
41 | .reg.f32 z0;
42 | .reg.f32 w0;
43 | .reg.b32 r<5>;
44 | .reg.b64 rd<9>;
45 | .reg.f32 d;
46 | ld.param.u64 rd1, [param0];
47 | ld.param.u64 rd2, [param1];
48 | cvta.to.global.u64 rd3, rd2;
49 | cvta.to.global.u64 rd4, rd1;
50 | mov.u32 r1, %%tid.x; // threadIdx.x
51 | mov.u32 r2, %%ctaid.x; // blockIdx.x
52 | mov.u32 r3, %%ntid.x; // blockDim.x
53 | mad.lo.s32 r4, r3, r2, r1; // blockDim.x*blockIdx.x + threadIdx.x
54 | mul.wide.s32 rd5, r4, 16; // sizeof(vec4)*(blockDim.x*blockIdx.x + threadIdx.x)
55 | add.s64 rd6, rd4, rd5; // param0 + sizeof(vec4)*(blockDim.x*blockIdx.x + threadIdx.x)
56 | ld.global.v4.f32 {x0, y0, z0, w0}, [rd6];
57 | mul.wide.s32 rd7, r4, 4; // sizeof(float)*(blockDim.x*blockIdx.x + threadIdx.x)
58 | add.s64 rd8, rd3, rd7; // param1 + sizeof(float)*(blockDim.x*blockIdx.x + threadIdx.x)
59 | call.uni (d), tree, (x0,y0,z0);
60 | st.global.f32 [rd8], d;
61 | ret;
62 | }
63 | )str";
64 |
65 | static char buffer[10*1024*1024];
66 | char *stream = buffer;
67 | int result_register;
68 | char *ptx = frep_compile_to_ptx(f, &result_register);
69 | stream += sprintf(stream, ptx_template, result_register, result_register, ptx);
70 | *out_length = (stream - buffer);
71 | return buffer;
72 | }
73 |
// JIT-compile and link a PTX source string into a loaded CUmodule using
// the CUDA driver's linker. 'jit_optimization_level' must be in [0,4]
// (the ptxas -O level). Prints the driver's info log and link wall time.
// NOTE(review): on a failed cuLinkAddData this prints the error log but
// still proceeds to cuLinkComplete -- acceptable for a test harness.
CUmodule load_ptx_program(
    const char *ptx_source, size_t ptx_source_length,
    int jit_optimization_level)
{
    CUmodule module;
    void *cubin; size_t cubin_size;
    CUlinkState link_state;
    enum { num_options = 8 };
    CUjit_option options[num_options];
    void *option_values[num_options];
    float walltime; // filled in by the driver (milliseconds)
    char error_log[8192], info_log[8192];

    assert(jit_optimization_level >= 0 && jit_optimization_level <= 4);

    // see CUDA Driver API manual for these options (look up cuLinkCreate);
    // scalar option values are smuggled through the void* slot by casting
    options[0] = CU_JIT_WALL_TIME; option_values[0] = (void *) &walltime;
    options[1] = CU_JIT_INFO_LOG_BUFFER; option_values[1] = (void *) info_log;
    options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; option_values[2] = (void *) (long)sizeof(info_log);
    options[3] = CU_JIT_ERROR_LOG_BUFFER; option_values[3] = (void *) error_log;
    options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; option_values[4] = (void *) (long)sizeof(error_log);
    options[5] = CU_JIT_LOG_VERBOSE; option_values[5] = (void *) 1;
    options[6] = CU_JIT_TARGET; option_values[6] = (void *) CU_TARGET_COMPUTE_60;
    options[7] = CU_JIT_OPTIMIZATION_LEVEL; option_values[7] = (void *) (long)jit_optimization_level;
    cudaCheckError(cuLinkCreate(num_options, options, option_values, &link_state));

    // +1: the driver expects the length to include the null-terminator
    int err = cuLinkAddData(link_state, CU_JIT_INPUT_PTX, (void *)ptx_source, ptx_source_length+1, 0, 0, 0, 0);
    if (err != CUDA_SUCCESS)
        fprintf(stderr, "PTX Linker Error:\n%s\n", error_log);
    cudaCheckError(cuLinkComplete(link_state, &cubin, &cubin_size));
    printf("Linking done in %fms. Linker Output:\n%s\n", walltime, info_log);

    // the module must be loaded before the link state (which owns the
    // cubin memory) is destroyed
    cudaCheckError(cuModuleLoadData(&module, cubin)); assert(module);
    cudaCheckError(cuLinkDestroy(link_state));
    return module;
}
110 |
111 | void run_ptx_program(
112 | void *input, size_t sizeof_input,
113 | void *output, size_t sizeof_output,
114 | const char *ptx_source, size_t ptx_source_length, const char *entry_name,
115 | int num_blocks, int threads_per_block, int shared_memory_bytes=1024,
116 | int jit_optimization_level=1 /*allowed values = 0,1,2,3,4*/)
117 | {
118 | CUdeviceptr dev_input;
119 | CUdeviceptr dev_output;
120 | cudaCheckError(cuMemAlloc(&dev_input, sizeof_input)); assert(dev_input);
121 | cudaCheckError(cuMemAlloc(&dev_output, sizeof_output)); assert(dev_output);
122 | cudaCheckError(cuMemcpyHtoD(dev_input, input, sizeof_input));
123 | CUmodule module = load_ptx_program(ptx_source, ptx_source_length, jit_optimization_level);
124 | CUfunction kernel = 0;
125 | cudaCheckError(cuModuleGetFunction(&kernel, module, entry_name));
126 | uint64_t param0 = (uint64_t)(dev_input);
127 | uint64_t param1 = (uint64_t)(dev_output);
128 | void *kernel_params[] = { (void*)¶m0, (void*)¶m1 };
129 | cuLaunchKernel(kernel, num_blocks,1,1, threads_per_block,1,1, shared_memory_bytes, NULL, kernel_params, NULL);
130 | cudaCheckError(cuCtxSynchronize());
131 | cudaCheckError(cuMemcpyDtoH(output, dev_output, sizeof_output));
132 | cudaCheckError(cuMemFree(dev_output));
133 | cudaCheckError(cuMemFree(dev_input));
134 | cudaCheckError(cuModuleUnload(module));
135 | }
136 |
137 | void run_test(int test_number, frep_t *f)
138 | {
139 | printf("///////////////////////////////////////////////////\n");
140 | printf(" running test number %d\n", test_number);
141 |
142 | const int num_points_x = 4;
143 | const int num_points_y = 4;
144 | const int num_points_z = 4;
145 | const int num_threads = 32;
146 | const int num_points = num_points_x*num_points_y*num_points_z;
147 | const int num_blocks = num_points/num_threads;
148 | const int sizeof_input = num_points*4*sizeof(float);
149 | const int sizeof_output = num_points*1*sizeof(float);
150 |
151 | float *output = (float*)malloc(sizeof_output); assert(output);
152 | float *cpu_output = (float*)malloc(sizeof_output); assert(cpu_output);
153 | float *input = (float*)malloc(num_points*4*sizeof(float));
154 |
155 | // generate input array data (points sampled in regular grid)
156 | {
157 | float *p = input;
158 | for (int zi = 0; zi < num_points_z; zi++)
159 | for (int yi = 0; yi < num_points_y; yi++)
160 | for (int xi = 0; xi < num_points_x; xi++)
161 | {
162 | p[0] = (-1.0f + 2.0f*xi/num_points_x);
163 | p[1] = (-1.0f + 2.0f*yi/num_points_y);
164 | p[2] = (-1.0f + 2.0f*zi/num_points_z);
165 | p[3] = 0.0f;
166 | p += 4;
167 | }
168 | }
169 |
170 | // compute expected output using CPU-based evaluator
171 | {
172 | for (int i = 0; i < num_points; i++)
173 | {
174 | float x = input[4*i + 0];
175 | float y = input[4*i + 1];
176 | float z = input[4*i + 2];
177 | cpu_output[i] = frep_eval(f, x, y, z);
178 | }
179 | }
180 |
181 | // compute output using GPU
182 | {
183 | size_t ptx_length;
184 | char *ptx_source = generate_ptx_program(f, &ptx_length);
185 | run_ptx_program(
186 | input, sizeof_input,
187 | output, sizeof_output,
188 | ptx_source, ptx_length,
189 | "main",
190 | num_blocks, num_threads);
191 | }
192 |
193 | // verify that GPU output matches CPU output
194 | for (int i = 0; i < num_points; i++)
195 | {
196 | float d_cpu = cpu_output[i];
197 | float d_ptx = output[i];
198 | if (fabsf(d_cpu - d_ptx) > 0.01f)
199 | {
200 | float x = input[4*i + 0];
201 | float y = input[4*i + 1];
202 | float z = input[4*i + 2];
203 | printf("\nEvaluation mismatch!\n");
204 | printf("cpu: f(%.2f,%.2f,%.2f) = %f\n", x, y, z, d_cpu);
205 | printf("ptx: f(%.2f,%.2f,%.2f) = %f\n", x, y, z, d_ptx);
206 | exit(1);
207 | }
208 | }
209 |
210 | free(output);
211 | free(cpu_output);
212 | free(input);
213 | }
214 |
215 | int main(int argc, char **argv)
216 | {
217 | init_cuda();
218 |
219 | frep_t *f = fBoxCheap(1.0f, 0.5f, 0.25f);
220 | run_test(1, f);
221 |
222 | return 0;
223 | }
224 |
--------------------------------------------------------------------------------
/test/backend_sass_6_x.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "util/cuda_error.h"
5 | #include "util/init_cuda.h"
6 | #include "../src/frep.h"
7 | #include "../src/frep_eval.h"
8 | #include "../src/frep_builder.h"
9 | #include "../src/backend_sass.h"
10 |
11 | CUmodule link_sass(CUmodule *module,
12 | void *cubin1, size_t sizeof_cubin1,
13 | void *cubin2, size_t sizeof_cubin2);
14 |
// End-to-end test of the SASS backend: compile a host-side entry kernel
// with nvcc, compile a CSG tree directly to a relocatable cubin, link the
// two, and launch the result on the GPU.
int main(int argc, char **argv)
{
    // disable the driver's JIT cache so every run exercises the real linker
    setenv("CUDA_CACHE_DISABLE", "1", 1);
    init_cuda();

    // build the relocatable entry-point cubin from main.cu
    // NOTE(review): hardcoded CUDA 10.1 install path -- adjust per machine
    system("/usr/local/cuda-10.1/bin/nvcc "
           "--gpu-architecture=sm_60 "
           "--cubin "
           "--relocatable-device-code=true "
           "main.cu "
           "--output-file main.cubin");

    size_t sizeof_cubin_main;
    void *cubin_main = read_file("main.cubin", &sizeof_cubin_main);

    frep_t *tree = fBoxCheap(1.0f, 0.5f, 0.25f);

    // compile the CSG tree straight to a relocatable cubin (no ptxas pass)
    size_t sizeof_cubin_tree;
    void *cubin_tree = frep_compile_to_sass(tree, &sizeof_cubin_tree);

    CUmodule module = 0;
    link_sass(&module, cubin_main, sizeof_cubin_main, cubin_tree, sizeof_cubin_tree);

    CUfunction kernel;
    cudaCheckError(cuModuleGetFunction(&kernel, module, "main")); assert(kernel);

    //
    // finally we run the thing to make sure that it actually works.
    //
    int N = 32;
    size_t sizeof_input = 4*N*sizeof(float); // one vec4 per point
    size_t sizeof_output = N*sizeof(float);  // one result per point
    float *input = (float*)malloc(sizeof_input);
    float *output = (float*)malloc(sizeof_output);

    // every thread evaluates the same point (1,0,0)
    // NOTE(review): malloc results are unchecked here -- fine for a test
    for (int i = 0; i < N; i++)
    {
        input[4*i + 0] = 1.0f;
        input[4*i + 1] = 0.0f;
        input[4*i + 2] = 0.0f;
        input[4*i + 3] = 0.0f;
    }

    int num_blocks = 8;
    int num_threads = 4;
    int shared_memory_bytes = 1024;
    CUdeviceptr dev_input;
    CUdeviceptr dev_output;
    cudaCheckError(cuMemAlloc(&dev_input, sizeof_input)); assert(dev_input);
    cudaCheckError(cuMemAlloc(&dev_output, sizeof_output)); assert(dev_output);
    cudaCheckError(cuMemcpyHtoD(dev_input, input, sizeof_input));
    // kernel arguments are passed as an array of pointers to the values
    uint64_t param0 = (uint64_t)(dev_input);
    uint64_t param1 = (uint64_t)(dev_output);
    void *kernel_params[] = { (void*)&param0, (void*)&param1 };
    // NOTE(review): launch result is not checked, unlike the other calls
    cuLaunchKernel(kernel, num_blocks,1,1, num_threads,1,1, shared_memory_bytes, NULL, kernel_params, NULL);
    cudaCheckError(cuCtxSynchronize());
    cudaCheckError(cuMemcpyDtoH(output, dev_output, sizeof_output));
    cudaCheckError(cuMemFree(dev_output));
    cudaCheckError(cuMemFree(dev_input));

    cudaCheckError(cuModuleUnload(module));

    printf("output:\n");
    for (int i = 0; i < N; i++)
        printf("%f ", output[i]);

    return 0;
}
83 |
84 | void link_sass(CUmodule *module,
85 | void *cubin1, size_t sizeof_cubin1,
86 | void *cubin2, size_t sizeof_cubin2)
87 | {
88 | enum { num_options = 6 };
89 | CUjit_option options[num_options];
90 | void *option_values[num_options];
91 | char error_log[8192];
92 | char info_log[8192];
93 | options[0] = CU_JIT_INFO_LOG_BUFFER; option_values[0] = (void *) info_log;
94 | options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; option_values[1] = (void *) (long)sizeof(info_log);
95 | options[2] = CU_JIT_ERROR_LOG_BUFFER; option_values[2] = (void *) error_log;
96 | options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; option_values[3] = (void *) (long)sizeof(error_log);
97 | options[4] = CU_JIT_LOG_VERBOSE; option_values[4] = (void *) 1;
98 | options[5] = CU_JIT_TARGET; option_values[5] = (void *) CU_TARGET_COMPUTE_60;
99 | CUlinkState link_state;
100 | cudaCheckError(cuLinkCreate(num_options, options, option_values, &link_state));
101 |
102 | if (CUDA_SUCCESS != cuLinkAddData(link_state, CU_JIT_INPUT_CUBIN,
103 | (void *)cubin_main, sizeof_cubin_main, 0,0,0,0))
104 | fprintf(stderr, "nvlink error:\n%s\n", error_log);
105 |
106 | if (CUDA_SUCCESS != cuLinkAddData(link_state, CU_JIT_INPUT_CUBIN,
107 | (void *)cubin_tree, sizeof_cubin_tree, 0,0,0,0))
108 | fprintf(stderr, "nvlink error:\n%s\n", error_log);
109 |
110 | void *cubin;
111 | size_t cubin_size;
112 | cudaCheckError(cuLinkComplete(link_state, &cubin, &cubin_size));
113 | cudaCheckError(cuModuleLoadData(module, cubin)); assert(module);
114 | cudaCheckError(cuLinkDestroy(link_state));
115 | }
116 |
--------------------------------------------------------------------------------
/test/backend_sass_6_x_mock.cpp:
--------------------------------------------------------------------------------
1 | #define COMPUTE_CAPABILITY_6_X
2 | #include
3 | #include
4 | #include
5 | #include "../src/frep.h"
6 | #include "../src/frep_builder.h"
7 | #include "../src/frep_eval.h"
8 | #include "../src/backend_sass.h"
9 | #include "../src/sass_6_x/simulator.h"
10 |
11 | using namespace backend_sass;
12 |
13 | float frep_eval_sass(
14 | float x0, float y0, float z0,
15 | instruction_t *instructions, int num_instructions,
16 | bool debug=false)
17 | {
18 | static sass_simulator_t sim = {0};
19 | sim.init(debug);
20 | sim.reg[0x00] = x0;
21 | sim.reg[0x01] = y0;
22 | sim.reg[0x02] = z0;
23 | for (int i = 0; i < num_instructions; i++)
24 | sim.execute(instructions[i]);
25 | return sim.reg[0x07];
26 | }
27 |
28 | void run_test(int test_number, frep_t *tree)
29 | {
30 | instruction_blocks_t blocks = generate_blocks(tree);
31 |
32 | int num_instructions;
33 | instruction_t *instructions = schedule_blocks(blocks, &num_instructions);
34 |
35 | printf("///////////////////////////////////////////////////\n");
36 | printf(" test number %d\n", test_number);
37 |
38 | frep_eval_sass(0.0f,0.0f,0.0f, instructions, num_instructions, true);
39 |
40 | for (int i = -4; i <= 4; i++)
41 | for (int j = -4; j <= 4; j++)
42 | for (int k = -4; k <= 4; k++)
43 | {
44 | float x0 = i/4.0f;
45 | float y0 = j/4.0f;
46 | float z0 = k/4.0f;
47 | float f_sass = frep_eval_sass(x0,y0,z0, instructions, num_instructions);
48 | float f_true = frep_eval(tree, x0,y0,z0);
49 | if (fabsf(f_sass - f_true) > 0.00001f)
50 | {
51 | printf("\nEvaluation mismatch!\n");
52 | printf("true: f(%.2f,%.2f,%.2f) = %f\n", x0,y0,z0,f_true);
53 | printf("sass: f(%.2f,%.2f,%.2f) = %f\n", x0,y0,z0,f_sass);
54 | exit(1);
55 | }
56 | }
57 | printf("ok!\n");
58 | }
59 |
60 | int main()
61 | {
62 | frep_t *tree;
63 |
64 | tree = fBoxCheap(0.9f,0.6f,0.3f);
65 | run_test(0, tree);
66 |
67 | tree = fSphere(0.3f);
68 | run_test(1, tree);
69 |
70 | tree = fCylinder(0.6f,0.3f);
71 | run_test(2, tree);
72 |
73 | tree = fPlane(1.0f, 0.3f);
74 | pOpRotate(tree, 0.3f,0.5f,0.4f);
75 | pOpTranslate(tree, 0.2f,0.5f,0.4f);
76 | run_test(3, tree);
77 |
78 | frep_t *d1 = fBoxCheap(1.0f,0.5f,0.25f);
79 | pOpRotate(d1, 0.1f,0.4f,0.3f);
80 | pOpTranslate(d1, 0.5f,0.25f,0.25f);
81 | frep_t *d2 = fSphere(0.8f);
82 | pOpTranslate(d2, 1.0f,0,0);
83 | frep_t *d3 = fCylinder(0.4f, 0.2f);
84 | pOpTranslate(d3, 1.0f, 1.0f, 0.3f);
85 | tree = fOpUnion(fOpUnion(d1, d2), d3);
86 | run_test(4, tree);
87 | }
88 |
--------------------------------------------------------------------------------
/test/linker.cpp:
--------------------------------------------------------------------------------
1 | // This file tests the use of separate compilation to link together
2 | // pre-existing (relocatable) Cubin files. This is useful because we
3 | // can use the CUDA Driver API to generate an executable Cubin from
4 | // the output of our SASS backend and a user-provided Cubin containing
5 | // the entrypoint.
6 | //
7 | // To compile this file on Linux using g++:
8 | // $ g++ -std=c++11 linker.cpp -I/usr/local/cuda-10.1/include -lcuda
9 | //
10 | #include
11 | #include
12 | #include
13 | #include "util/cuda_error.h"
14 | #include "util/init_cuda.h"
15 | #include "util/read_file.h"
16 | #define ENABLE_TIMING
17 | #include "util/profiler.h"
18 |
19 | int main() {
20 | init_cuda();
21 |
22 | //
23 | // Generate relocatable SASS binaries by invoking the PTX assembler
24 | // on our two test files. Neither of these can be executed on their
25 | // own, so we will link them together into an actual executable using
26 | // the CUDA linker in the Driver API.
27 | //
28 | system("/usr/local/cuda-10.1/bin/ptxas --opt-level 1 --compile-only --gpu-name sm_60 test1.ptx --output-file test1.cubin");
29 | system("/usr/local/cuda-10.1/bin/ptxas --opt-level 1 --compile-only --gpu-name sm_60 test2.ptx --output-file test2.cubin");
30 |
31 | int sizeof_cubin1 = 0;
32 | void *cubin1 = (void*)read_file("test1.cubin", &sizeof_cubin1);
33 | assert(cubin1);
34 |
35 | int sizeof_cubin2 = 0;
36 | void *cubin2 = (void*)read_file("test2.cubin", &sizeof_cubin2);
37 | assert(cubin2);
38 |
39 | CUfunction kernel;
40 | CUmodule module;
41 | const char *entry_name = "main";
42 |
43 | // We do this 100 times and measure the time it takes the driver to
44 | // link together the Cubin file, and report the average in ms.
45 | for (int i = 0; i < 100; i++)
46 | {
47 | TIMING("linker");
48 |
49 | //
50 | // initialize the linker. note: CU_JIT_TARGET must match compute mode
51 | // specified in test1.ptx and test2.ptx, and the --gpu-name argument
52 | // passed to ptxas above.
53 | //
54 | enum { num_options = 6 };
55 | CUjit_option options[num_options];
56 | void *option_values[num_options];
57 | char error_log[8192];
58 | char info_log[8192];
59 | options[0] = CU_JIT_INFO_LOG_BUFFER; option_values[0] = (void *) info_log;
60 | options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; option_values[1] = (void *) (long)sizeof(info_log);
61 | options[2] = CU_JIT_ERROR_LOG_BUFFER; option_values[2] = (void *) error_log;
62 | options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; option_values[3] = (void *) (long)sizeof(error_log);
63 | options[4] = CU_JIT_LOG_VERBOSE; option_values[4] = (void *) 1;
64 | options[5] = CU_JIT_TARGET; option_values[5] = (void *) CU_TARGET_COMPUTE_60;
65 | CUlinkState link_state;
66 | cudaCheckError(cuLinkCreate(num_options, options, option_values, &link_state));
67 |
68 | if (CUDA_SUCCESS != cuLinkAddData(link_state, CU_JIT_INPUT_CUBIN,
69 | (void *)cubin1, sizeof_cubin1, 0,0,0,0))
70 | fprintf(stderr, "nvlink error:\n%s\n", error_log);
71 |
72 | if (CUDA_SUCCESS != cuLinkAddData(link_state, CU_JIT_INPUT_CUBIN,
73 | (void *)cubin2, sizeof_cubin2, 0,0,0,0))
74 | fprintf(stderr, "nvlink error:\n%s\n", error_log);
75 |
76 | void *cubin;
77 | size_t cubin_size;
78 | cudaCheckError(cuLinkComplete(link_state, &cubin, &cubin_size));
79 |
80 | cudaCheckError(cuModuleLoadData(&module, cubin)); assert(module);
81 | cudaCheckError(cuLinkDestroy(link_state));
82 | cudaCheckError(cuModuleGetFunction(&kernel, module, entry_name)); assert(kernel);
83 |
84 | TIMING("linker");
85 | }
86 | assert(kernel);
87 |
88 | // Print the average linking time in milliseconds
89 | TIMING_SUMMARY();
90 |
91 | //
92 | // finally we run the thing to make sure that it actually works.
93 | //
94 | int N = 32;
95 | size_t sizeof_input = 4*N*sizeof(float);
96 | size_t sizeof_output = N*sizeof(float);
97 | float *input = (float*)malloc(sizeof_input);
98 | float *output = (float*)malloc(sizeof_output);
99 |
100 | for (int i = 0; i < 32; i++)
101 | {
102 | input[4*i + 0] = 1.1f;
103 | input[4*i + 1] = 0.0f;
104 | input[4*i + 2] = 0.0f;
105 | input[4*i + 3] = 0.0f;
106 | }
107 |
108 | int num_blocks = 8;
109 | int num_threads = 4;
110 | int shared_memory_bytes = 1024;
111 | CUdeviceptr dev_input;
112 | CUdeviceptr dev_output;
113 | cudaCheckError(cuMemAlloc(&dev_input, sizeof_input)); assert(dev_input);
114 | cudaCheckError(cuMemAlloc(&dev_output, sizeof_output)); assert(dev_output);
115 | cudaCheckError(cuMemcpyHtoD(dev_input, input, sizeof_input));
116 | uint64_t param0 = (uint64_t)(dev_input);
117 | uint64_t param1 = (uint64_t)(dev_output);
118 | void *kernel_params[] = { (void*)¶m0, (void*)¶m1 };
119 | cuLaunchKernel(kernel, num_blocks,1,1, num_threads,1,1, shared_memory_bytes, NULL, kernel_params, NULL);
120 | cudaCheckError(cuCtxSynchronize());
121 | cudaCheckError(cuMemcpyDtoH(output, dev_output, sizeof_output));
122 | cudaCheckError(cuMemFree(dev_output));
123 | cudaCheckError(cuMemFree(dev_input));
124 | cudaCheckError(cuModuleUnload(module));
125 |
126 | printf("output:\n");
127 | for (int i = 0; i < N; i++)
128 | printf("%f ", output[i]);
129 | }
130 |
--------------------------------------------------------------------------------
/test/test1.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lightbits/fast-csg/53b14f651e9544580ba31ba0f157221a77ba44fe/test/test1.cubin
--------------------------------------------------------------------------------
/test/test1.ptx:
--------------------------------------------------------------------------------
1 | .version 6.0
2 | .target sm_60
3 | .address_size 64
4 |
5 | .visible .func (.reg.f32 f1) tree(.reg.f32 x0, .reg.f32 y0, .reg.f32 z0) {
6 | .reg.f32 x;
7 | .reg.f32 y;
8 | .reg.f32 z;
9 | abs.f32.ftz x, x0;
10 | abs.f32.ftz y, y0;
11 | abs.f32.ftz z, z0;
12 | sub.f32.ftz x,x,1.0;
13 | sub.f32.ftz y,y,0.5;
14 | sub.f32.ftz z,z,0.25;
15 | max.f32.ftz f1,x,y;
16 | max.f32.ftz f1,f1,z;
17 | ret.uni;
18 | }
19 |
--------------------------------------------------------------------------------
/test/test2.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lightbits/fast-csg/53b14f651e9544580ba31ba0f157221a77ba44fe/test/test2.cubin
--------------------------------------------------------------------------------
/test/test2.ptx:
--------------------------------------------------------------------------------
1 | .version 6.0
2 | .target sm_60
3 | .address_size 64
4 |
5 | .extern .func (.reg.f32 f1) tree(.reg.f32 x0, .reg.f32 y0, .reg.f32 z0)
6 |
7 | .visible.entry main(.param.u64 param0, .param.u64 param1) {
8 | .reg.f32 x0;
9 | .reg.f32 y0;
10 | .reg.f32 z0;
11 | .reg.f32 w0;
12 | .reg.b32 r<5>;
13 | .reg.b64 rd<9>;
14 | .reg.f32 d;
15 | ld.param.u64 rd1, [param0];
16 | ld.param.u64 rd2, [param1];
17 | cvta.to.global.u64 rd3, rd2;
18 | cvta.to.global.u64 rd4, rd1;
19 | mov.u32 r1, %tid.x; // threadIdx.x
20 | mov.u32 r2, %ctaid.x; // blockIdx.x
21 | mov.u32 r3, %ntid.x; // blockDim.x
22 | mad.lo.s32 r4, r3, r2, r1; // blockDim.x*blockIdx.x + threadIdx.x
23 | mul.wide.s32 rd5, r4, 16; // sizeof(vec4)*(blockDim.x*blockIdx.x + threadIdx.x)
24 | add.s64 rd6, rd4, rd5; // param0 + sizeof(vec4)*(blockDim.x*blockIdx.x + threadIdx.x)
25 | ld.global.v4.f32 {x0, y0, z0, w0}, [rd6];
26 | mul.wide.s32 rd7, r4, 4; // sizeof(float)*(blockDim.x*blockIdx.x + threadIdx.x)
27 | add.s64 rd8, rd3, rd7; // param1 + sizeof(float)*(blockDim.x*blockIdx.x + threadIdx.x)
28 | call.uni (d), tree, (x0,y0,z0);
29 | st.global.f32 [rd8], d;
30 | ret;
31 | }
32 |
--------------------------------------------------------------------------------
/test/util/cuda_error.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include
3 |
4 | static const char *cudaErrorToString(CUresult error)
5 | {
6 | switch (error)
7 | {
8 | case CUDA_SUCCESS:
9 | return "CUDA_SUCCESS";
10 |
11 | case CUDA_ERROR_INVALID_VALUE:
12 | return "CUDA_ERROR_INVALID_VALUE";
13 |
14 | case CUDA_ERROR_OUT_OF_MEMORY:
15 | return "CUDA_ERROR_OUT_OF_MEMORY";
16 |
17 | case CUDA_ERROR_NOT_INITIALIZED:
18 | return "CUDA_ERROR_NOT_INITIALIZED";
19 |
20 | case CUDA_ERROR_DEINITIALIZED:
21 | return "CUDA_ERROR_DEINITIALIZED";
22 |
23 | case CUDA_ERROR_PROFILER_DISABLED:
24 | return "CUDA_ERROR_PROFILER_DISABLED";
25 |
26 | case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
27 | return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
28 |
29 | case CUDA_ERROR_PROFILER_ALREADY_STARTED:
30 | return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
31 |
32 | case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
33 | return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
34 |
35 | case CUDA_ERROR_NO_DEVICE:
36 | return "CUDA_ERROR_NO_DEVICE";
37 |
38 | case CUDA_ERROR_INVALID_DEVICE:
39 | return "CUDA_ERROR_INVALID_DEVICE";
40 |
41 | case CUDA_ERROR_INVALID_IMAGE:
42 | return "CUDA_ERROR_INVALID_IMAGE";
43 |
44 | case CUDA_ERROR_INVALID_CONTEXT:
45 | return "CUDA_ERROR_INVALID_CONTEXT";
46 |
47 | case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
48 | return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
49 |
50 | case CUDA_ERROR_MAP_FAILED:
51 | return "CUDA_ERROR_MAP_FAILED";
52 |
53 | case CUDA_ERROR_UNMAP_FAILED:
54 | return "CUDA_ERROR_UNMAP_FAILED";
55 |
56 | case CUDA_ERROR_ARRAY_IS_MAPPED:
57 | return "CUDA_ERROR_ARRAY_IS_MAPPED";
58 |
59 | case CUDA_ERROR_ALREADY_MAPPED:
60 | return "CUDA_ERROR_ALREADY_MAPPED";
61 |
62 | case CUDA_ERROR_NO_BINARY_FOR_GPU:
63 | return "CUDA_ERROR_NO_BINARY_FOR_GPU";
64 |
65 | case CUDA_ERROR_ALREADY_ACQUIRED:
66 | return "CUDA_ERROR_ALREADY_ACQUIRED";
67 |
68 | case CUDA_ERROR_NOT_MAPPED:
69 | return "CUDA_ERROR_NOT_MAPPED";
70 |
71 | case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
72 | return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
73 |
74 | case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
75 | return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
76 |
77 | case CUDA_ERROR_ECC_UNCORRECTABLE:
78 | return "CUDA_ERROR_ECC_UNCORRECTABLE";
79 |
80 | case CUDA_ERROR_UNSUPPORTED_LIMIT:
81 | return "CUDA_ERROR_UNSUPPORTED_LIMIT";
82 |
83 | case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
84 | return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
85 |
86 | case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED:
87 | return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED";
88 |
89 | case CUDA_ERROR_INVALID_PTX:
90 | return "CUDA_ERROR_INVALID_PTX";
91 |
92 | case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT:
93 | return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT";
94 |
95 | case CUDA_ERROR_NVLINK_UNCORRECTABLE:
96 | return "CUDA_ERROR_NVLINK_UNCORRECTABLE";
97 |
98 | case CUDA_ERROR_JIT_COMPILER_NOT_FOUND:
99 | return "CUDA_ERROR_JIT_COMPILER_NOT_FOUND";
100 |
101 | case CUDA_ERROR_INVALID_SOURCE:
102 | return "CUDA_ERROR_INVALID_SOURCE";
103 |
104 | case CUDA_ERROR_FILE_NOT_FOUND:
105 | return "CUDA_ERROR_FILE_NOT_FOUND";
106 |
107 | case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
108 | return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
109 |
110 | case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
111 | return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
112 |
113 | case CUDA_ERROR_OPERATING_SYSTEM:
114 | return "CUDA_ERROR_OPERATING_SYSTEM";
115 |
116 | case CUDA_ERROR_INVALID_HANDLE:
117 | return "CUDA_ERROR_INVALID_HANDLE";
118 |
119 | case CUDA_ERROR_NOT_FOUND:
120 | return "CUDA_ERROR_NOT_FOUND";
121 |
122 | case CUDA_ERROR_NOT_READY:
123 | return "CUDA_ERROR_NOT_READY";
124 |
125 | case CUDA_ERROR_ILLEGAL_ADDRESS:
126 | return "CUDA_ERROR_ILLEGAL_ADDRESS";
127 |
128 | case CUDA_ERROR_LAUNCH_FAILED:
129 | return "CUDA_ERROR_LAUNCH_FAILED";
130 |
131 | case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
132 | return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
133 |
134 | case CUDA_ERROR_LAUNCH_TIMEOUT:
135 | return "CUDA_ERROR_LAUNCH_TIMEOUT";
136 |
137 | case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
138 | return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
139 |
140 | case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
141 | return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
142 |
143 | case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
144 | return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
145 |
146 | case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
147 | return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
148 |
149 | case CUDA_ERROR_CONTEXT_IS_DESTROYED:
150 | return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
151 |
152 | case CUDA_ERROR_ASSERT:
153 | return "CUDA_ERROR_ASSERT";
154 |
155 | case CUDA_ERROR_TOO_MANY_PEERS:
156 | return "CUDA_ERROR_TOO_MANY_PEERS";
157 |
158 | case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
159 | return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
160 |
161 | case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
162 | return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
163 |
164 | case CUDA_ERROR_HARDWARE_STACK_ERROR:
165 | return "CUDA_ERROR_HARDWARE_STACK_ERROR";
166 |
167 | case CUDA_ERROR_ILLEGAL_INSTRUCTION:
168 | return "CUDA_ERROR_ILLEGAL_INSTRUCTION";
169 |
170 | case CUDA_ERROR_MISALIGNED_ADDRESS:
171 | return "CUDA_ERROR_MISALIGNED_ADDRESS";
172 |
173 | case CUDA_ERROR_INVALID_ADDRESS_SPACE:
174 | return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
175 |
176 | case CUDA_ERROR_INVALID_PC:
177 | return "CUDA_ERROR_INVALID_PC";
178 |
179 | case CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE:
180 | return "CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE";
181 |
182 | case CUDA_ERROR_NOT_PERMITTED:
183 | return "CUDA_ERROR_NOT_PERMITTED";
184 |
185 | case CUDA_ERROR_NOT_SUPPORTED:
186 | return "CUDA_ERROR_NOT_SUPPORTED";
187 |
188 | case CUDA_ERROR_UNKNOWN:
189 | return "CUDA_ERROR_UNKNOWN";
190 | }
191 |
192 | return "";
193 | }
194 |
195 | static const char *cudaErrorToString(cudaError_t error)
196 | {
197 | switch (error)
198 | {
199 | case cudaSuccess:
200 | return "cudaSuccess";
201 |
202 | case cudaErrorMissingConfiguration:
203 | return "cudaErrorMissingConfiguration";
204 |
205 | case cudaErrorMemoryAllocation:
206 | return "cudaErrorMemoryAllocation";
207 |
208 | case cudaErrorInitializationError:
209 | return "cudaErrorInitializationError";
210 |
211 | case cudaErrorLaunchFailure:
212 | return "cudaErrorLaunchFailure";
213 |
214 | case cudaErrorPriorLaunchFailure:
215 | return "cudaErrorPriorLaunchFailure";
216 |
217 | case cudaErrorLaunchTimeout:
218 | return "cudaErrorLaunchTimeout";
219 |
220 | case cudaErrorLaunchOutOfResources:
221 | return "cudaErrorLaunchOutOfResources";
222 |
223 | case cudaErrorInvalidDeviceFunction:
224 | return "cudaErrorInvalidDeviceFunction";
225 |
226 | case cudaErrorInvalidConfiguration:
227 | return "cudaErrorInvalidConfiguration";
228 |
229 | case cudaErrorInvalidDevice:
230 | return "cudaErrorInvalidDevice";
231 |
232 | case cudaErrorInvalidValue:
233 | return "cudaErrorInvalidValue";
234 |
235 | case cudaErrorInvalidPitchValue:
236 | return "cudaErrorInvalidPitchValue";
237 |
238 | case cudaErrorInvalidSymbol:
239 | return "cudaErrorInvalidSymbol";
240 |
241 | case cudaErrorMapBufferObjectFailed:
242 | return "cudaErrorMapBufferObjectFailed";
243 |
244 | case cudaErrorUnmapBufferObjectFailed:
245 | return "cudaErrorUnmapBufferObjectFailed";
246 |
247 | case cudaErrorInvalidHostPointer:
248 | return "cudaErrorInvalidHostPointer";
249 |
250 | case cudaErrorInvalidDevicePointer:
251 | return "cudaErrorInvalidDevicePointer";
252 |
253 | case cudaErrorInvalidTexture:
254 | return "cudaErrorInvalidTexture";
255 |
256 | case cudaErrorInvalidTextureBinding:
257 | return "cudaErrorInvalidTextureBinding";
258 |
259 | case cudaErrorInvalidChannelDescriptor:
260 | return "cudaErrorInvalidChannelDescriptor";
261 |
262 | case cudaErrorInvalidMemcpyDirection:
263 | return "cudaErrorInvalidMemcpyDirection";
264 |
265 | case cudaErrorAddressOfConstant:
266 | return "cudaErrorAddressOfConstant";
267 |
268 | case cudaErrorTextureFetchFailed:
269 | return "cudaErrorTextureFetchFailed";
270 |
271 | case cudaErrorTextureNotBound:
272 | return "cudaErrorTextureNotBound";
273 |
274 | case cudaErrorSynchronizationError:
275 | return "cudaErrorSynchronizationError";
276 |
277 | case cudaErrorInvalidFilterSetting:
278 | return "cudaErrorInvalidFilterSetting";
279 |
280 | case cudaErrorInvalidNormSetting:
281 | return "cudaErrorInvalidNormSetting";
282 |
283 | case cudaErrorMixedDeviceExecution:
284 | return "cudaErrorMixedDeviceExecution";
285 |
286 | case cudaErrorCudartUnloading:
287 | return "cudaErrorCudartUnloading";
288 |
289 | case cudaErrorUnknown:
290 | return "cudaErrorUnknown";
291 |
292 | case cudaErrorNotYetImplemented:
293 | return "cudaErrorNotYetImplemented";
294 |
295 | case cudaErrorMemoryValueTooLarge:
296 | return "cudaErrorMemoryValueTooLarge";
297 |
298 | case cudaErrorInvalidResourceHandle:
299 | return "cudaErrorInvalidResourceHandle";
300 |
301 | case cudaErrorNotReady:
302 | return "cudaErrorNotReady";
303 |
304 | case cudaErrorInsufficientDriver:
305 | return "cudaErrorInsufficientDriver";
306 |
307 | case cudaErrorSetOnActiveProcess:
308 | return "cudaErrorSetOnActiveProcess";
309 |
310 | case cudaErrorInvalidSurface:
311 | return "cudaErrorInvalidSurface";
312 |
313 | case cudaErrorNoDevice:
314 | return "cudaErrorNoDevice";
315 |
316 | case cudaErrorECCUncorrectable:
317 | return "cudaErrorECCUncorrectable";
318 |
319 | case cudaErrorSharedObjectSymbolNotFound:
320 | return "cudaErrorSharedObjectSymbolNotFound";
321 |
322 | case cudaErrorSharedObjectInitFailed:
323 | return "cudaErrorSharedObjectInitFailed";
324 |
325 | case cudaErrorUnsupportedLimit:
326 | return "cudaErrorUnsupportedLimit";
327 |
328 | case cudaErrorDuplicateVariableName:
329 | return "cudaErrorDuplicateVariableName";
330 |
331 | case cudaErrorDuplicateTextureName:
332 | return "cudaErrorDuplicateTextureName";
333 |
334 | case cudaErrorDuplicateSurfaceName:
335 | return "cudaErrorDuplicateSurfaceName";
336 |
337 | case cudaErrorDevicesUnavailable:
338 | return "cudaErrorDevicesUnavailable";
339 |
340 | case cudaErrorInvalidKernelImage:
341 | return "cudaErrorInvalidKernelImage";
342 |
343 | case cudaErrorNoKernelImageForDevice:
344 | return "cudaErrorNoKernelImageForDevice";
345 |
346 | case cudaErrorIncompatibleDriverContext:
347 | return "cudaErrorIncompatibleDriverContext";
348 |
349 | case cudaErrorPeerAccessAlreadyEnabled:
350 | return "cudaErrorPeerAccessAlreadyEnabled";
351 |
352 | case cudaErrorPeerAccessNotEnabled:
353 | return "cudaErrorPeerAccessNotEnabled";
354 |
355 | case cudaErrorDeviceAlreadyInUse:
356 | return "cudaErrorDeviceAlreadyInUse";
357 |
358 | case cudaErrorProfilerDisabled:
359 | return "cudaErrorProfilerDisabled";
360 |
361 | case cudaErrorProfilerNotInitialized:
362 | return "cudaErrorProfilerNotInitialized";
363 |
364 | case cudaErrorProfilerAlreadyStarted:
365 | return "cudaErrorProfilerAlreadyStarted";
366 |
367 | case cudaErrorProfilerAlreadyStopped:
368 | return "cudaErrorProfilerAlreadyStopped";
369 |
370 | /* Since CUDA 4.0*/
371 | case cudaErrorAssert:
372 | return "cudaErrorAssert";
373 |
374 | case cudaErrorTooManyPeers:
375 | return "cudaErrorTooManyPeers";
376 |
377 | case cudaErrorHostMemoryAlreadyRegistered:
378 | return "cudaErrorHostMemoryAlreadyRegistered";
379 |
380 | case cudaErrorHostMemoryNotRegistered:
381 | return "cudaErrorHostMemoryNotRegistered";
382 |
383 | /* Since CUDA 5.0 */
384 | case cudaErrorOperatingSystem:
385 | return "cudaErrorOperatingSystem";
386 |
387 | case cudaErrorPeerAccessUnsupported:
388 | return "cudaErrorPeerAccessUnsupported";
389 |
390 | case cudaErrorLaunchMaxDepthExceeded:
391 | return "cudaErrorLaunchMaxDepthExceeded";
392 |
393 | case cudaErrorLaunchFileScopedTex:
394 | return "cudaErrorLaunchFileScopedTex";
395 |
396 | case cudaErrorLaunchFileScopedSurf:
397 | return "cudaErrorLaunchFileScopedSurf";
398 |
399 | case cudaErrorSyncDepthExceeded:
400 | return "cudaErrorSyncDepthExceeded";
401 |
402 | case cudaErrorLaunchPendingCountExceeded:
403 | return "cudaErrorLaunchPendingCountExceeded";
404 |
405 | case cudaErrorNotPermitted:
406 | return "cudaErrorNotPermitted";
407 |
408 | case cudaErrorNotSupported:
409 | return "cudaErrorNotSupported";
410 |
411 | /* Since CUDA 6.0 */
412 | case cudaErrorHardwareStackError:
413 | return "cudaErrorHardwareStackError";
414 |
415 | case cudaErrorIllegalInstruction:
416 | return "cudaErrorIllegalInstruction";
417 |
418 | case cudaErrorMisalignedAddress:
419 | return "cudaErrorMisalignedAddress";
420 |
421 | case cudaErrorInvalidAddressSpace:
422 | return "cudaErrorInvalidAddressSpace";
423 |
424 | case cudaErrorInvalidPc:
425 | return "cudaErrorInvalidPc";
426 |
427 | case cudaErrorIllegalAddress:
428 | return "cudaErrorIllegalAddress";
429 |
430 | /* Since CUDA 6.5*/
431 | case cudaErrorInvalidPtx:
432 | return "cudaErrorInvalidPtx";
433 |
434 | case cudaErrorInvalidGraphicsContext:
435 | return "cudaErrorInvalidGraphicsContext";
436 |
437 | case cudaErrorStartupFailure:
438 | return "cudaErrorStartupFailure";
439 |
440 | case cudaErrorApiFailureBase:
441 | return "cudaErrorApiFailureBase";
442 |
443 | /* Since CUDA 8.0*/
444 | case cudaErrorNvlinkUncorrectable :
445 | return "cudaErrorNvlinkUncorrectable";
446 |
447 | /* Since CUDA 8.5*/
448 | case cudaErrorJitCompilerNotFound :
449 | return "cudaErrorJitCompilerNotFound";
450 |
451 | /* Since CUDA 9.0*/
452 | case cudaErrorCooperativeLaunchTooLarge :
453 | return "cudaErrorCooperativeLaunchTooLarge";
454 |
455 | }
456 |
457 | return "";
458 | }
459 |
460 | template< typename T >
461 | void _cudaCheckError(T result, char const *const func, const char *const file, int const line)
462 | {
463 | if (result)
464 | {
465 | fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
466 | file, line, static_cast(result), cudaErrorToString(result), func);
467 | CUcontext ctx;
468 | cuCtxGetCurrent(&ctx);
469 | cuCtxDestroy(ctx);
470 | exit(EXIT_FAILURE);
471 | }
472 | }
473 | #define cudaCheckError(val) _cudaCheckError ( (val), #val, __FILE__, __LINE__ )
474 |
--------------------------------------------------------------------------------
/test/util/init_cuda.h:
--------------------------------------------------------------------------------
1 | #pragma once
#include <cuda.h>
#include <stdio.h>
#include <assert.h>
5 | #include "cuda_error.h"
6 | void init_cuda()
7 | {
8 | // disable CUDA from caching SASS programs
9 | setenv("CUDA_CACHE_DISABLE", "1", 1);
10 |
11 | CUcontext context;
12 | CUdevice device;
13 | cudaCheckError(cuInit(0));
14 | cudaCheckError(cuDeviceGet(&device, 0));
15 | cudaCheckError(cuCtxCreate(&context, 0, device));
16 |
17 | char name[256];
18 | int major = 0, minor = 0;
19 | int compute_mode = -1;
20 | cudaCheckError(cuDeviceGetName(name, 100, device));
21 | cudaCheckError(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
22 | cudaCheckError(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
23 | cudaCheckError(cuDeviceGetAttribute(&compute_mode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, device));
24 | assert(compute_mode != CU_COMPUTEMODE_PROHIBITED && "Device is running in Compute Mode Prohibited");
25 | printf("Using CUDA device %s: Compute SM %d.%d\n", name, major, minor);
26 | }
27 |
--------------------------------------------------------------------------------
/test/util/profiler.h:
--------------------------------------------------------------------------------
1 | #pragma once
#include <stdio.h>
#include <string.h>
4 |
5 | #ifdef _WIN32
6 | #ifndef WIN32_LEAN_AND_MEAN
7 | #define WIN32_LEAN_AND_MEAN
8 | #endif
#include <windows.h>
10 |
// High-resolution tick source (Windows): QueryPerformanceCounter.
LARGE_INTEGER perf_get_tick()
{
    LARGE_INTEGER result;
    QueryPerformanceCounter(&result);
    return result;
}

// Seconds between two ticks (end - begin), scaled by the counter frequency.
// Queries the frequency on every call; it is constant after boot.
float perf_seconds_elapsed(LARGE_INTEGER begin, LARGE_INTEGER end)
{
    LARGE_INTEGER frequency;
    QueryPerformanceFrequency(&frequency);
    return (float)(end.QuadPart - begin.QuadPart) /
           (float)frequency.QuadPart;
}

// Per-label timing record (Windows variant; layout mirrors the POSIX one).
struct perf_TimingInfo
{
    const char *label;    // identity; compared by strcmp in TIMING()
    LARGE_INTEGER begin;  // tick at the opening TIMING() call
    LARGE_INTEGER end;    // tick at the closing TIMING() call
    bool counting;        // true while between an open and a close
    float t_sum;          // accumulated seconds over all completed intervals
    float t_last;         // seconds of the most recent interval
    int hits;             // number of completed intervals
};
36 |
37 | #else // ifdef _WIN32
#include <time.h>
39 |
// High-resolution tick source (POSIX): CLOCK_REALTIME via clock_gettime.
timespec perf_get_tick()
{
    timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
    return now;
}
46 |
// Seconds between two ticks from perf_get_tick (end - begin).
// Accumulates in double to keep nanosecond resolution before
// narrowing to float.
float perf_seconds_elapsed(timespec begin, timespec end)
{
    double whole = (double)(end.tv_sec - begin.tv_sec);
    double frac  = (double)(end.tv_nsec - begin.tv_nsec) / 1000000000.0;
    return (float)(whole + frac);
}
54 |
// Per-label timing record (POSIX variant; layout mirrors the Windows one).
struct perf_TimingInfo
{
    const char *label;  // identity; compared by strcmp in TIMING()
    timespec begin;     // tick at the opening TIMING() call
    timespec end;       // tick at the closing TIMING() call
    bool counting;      // true while between an open and a close
    float t_sum;        // accumulated seconds over all completed intervals
    float t_last;       // seconds of the most recent interval
    int hits;           // number of completed intervals
};
65 |
66 | #endif
67 |
68 | #ifdef ENABLE_TIMING
69 | static perf_TimingInfo perf_timing_blocks[1024];
70 | static int perf_count = 0;
71 |
72 | void TIMING(const char *label)
73 | {
74 | perf_TimingInfo *block = 0;
75 | for (int i = 0; i < perf_count; i++)
76 | {
77 | if (strcmp(label, perf_timing_blocks[i].label) == 0)
78 | {
79 | block = &perf_timing_blocks[i];
80 | break;
81 | }
82 | }
83 | if (!block)
84 | {
85 | block = &perf_timing_blocks[perf_count];
86 | perf_count++;
87 | block->hits = 0;
88 | block->t_sum = 0.0f;
89 | block->t_last = 0.0f;
90 | block->label = label;
91 | }
92 | if (block->counting)
93 | {
94 | block->hits++;
95 | block->end = perf_get_tick();
96 | float elapsed = perf_seconds_elapsed(block->begin, block->end);
97 | block->t_sum += elapsed;
98 | block->t_last = elapsed;
99 | block->counting = false;
100 | }
101 | else
102 | {
103 | block->counting = true;
104 | block->begin = perf_get_tick();
105 | }
106 | }
107 |
108 | void TIMING_CLEAR() { perf_count = 0; }
109 |
110 | void TIMING_SUMMARY()
111 | {
112 | printf("AVG \tLAST \tHITS\tNAME\n");
113 | for (int i = 0; i < perf_count; i++)
114 | {
115 | perf_TimingInfo block = perf_timing_blocks[i];
116 | int hits = block.hits;
117 | float avg = 1000.0f * block.t_sum / block.hits;
118 | float last = 1000.0f * block.t_last;
119 | printf("%.2f\t%.2f\t%04d\t%s\n", avg, last, hits, block.label);
120 | }
121 | }
122 |
123 | float TIMING_GET_AVG(const char *label)
124 | {
125 | perf_TimingInfo *block = 0;
126 | for (int i = 0; i < perf_count; i++)
127 | {
128 | if (strcmp(label, perf_timing_blocks[i].label) == 0)
129 | {
130 | block = &perf_timing_blocks[i];
131 | break;
132 | }
133 | }
134 | if (!block)
135 | return -1.0f;
136 | return block->t_sum / block->hits;
137 | }
138 |
139 | #else
// No-op stand-ins used when ENABLE_TIMING is not defined.
void TIMING(const char *label) { (void)label; }
void TIMING_CLEAR() { }
void TIMING_SUMMARY() { }
// Fixed: must match the float return type of the real implementation;
// the previous 'void' stub broke any caller that uses the returned
// average. -1.0f mirrors the real version's "not found" result.
float TIMING_GET_AVG(const char *label) { (void)label; return -1.0f; }
144 | #endif
145 |
--------------------------------------------------------------------------------
/test/util/test_models.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "sdf_builder.h"
3 |
// Minimal test models: single primitives (01-04), a transformed primitive
// (05-08), and a blend of two primitives (09).
sdf_node_t *model_simple01() { return sdf_box(1.0f, 0.5f, 0.25f); }
sdf_node_t *model_simple02() { return sdf_cylinder(1.0f, 0.5f); }
sdf_node_t *model_simple03() { return sdf_sphere(0.98f); }
sdf_node_t *model_simple04() { return sdf_plane(0.98f); }
sdf_node_t *model_simple05() { return sdf_rotate(sdf_translate(sdf_box(0.98f, 0.63f, 0.33f), 0.1f,-0.2f,0.3f), 0.1f,0.2f,-0.3f); }
sdf_node_t *model_simple06() { return sdf_rotate(sdf_translate(sdf_sphere(0.98f), 0.1f,-0.2f,0.3f), 0.1f,0.2f,-0.3f); }
sdf_node_t *model_simple07() { return sdf_rotate(sdf_translate(sdf_cylinder(0.98f, 0.63f), 0.1f,-0.2f,0.3f), 0.1f,0.2f,-0.3f); }
sdf_node_t *model_simple08() { return sdf_rotate(sdf_translate(sdf_plane(0.98f), 0.1f,-0.2f,0.3f), 0.1f,0.2f,-0.3f); }
sdf_node_t *model_simple09() { return sdf_blend(0.4f, sdf_sphere(1.0f), sdf_cylinder(0.3f,1.0f)); }
// Union of a rotated/translated box and a rotated/translated sphere.
// NOTE(review): the return values of sdf_rotate/sdf_translate are
// discarded here, which only works if they transform their argument
// node in place — the other models use the returned node. Confirm
// against the builder header.
sdf_node_t *model_simple10() {
    sdf_node_t *d1 = sdf_box(0.98f, 0.63f, 0.33f);
    sdf_rotate(d1, -0.3f, 0.2f, -0.1f);
    sdf_translate(d1, 0.3f, -0.5f, 0.3f);
    sdf_node_t *d2 = sdf_sphere(0.63f);
    sdf_rotate(d2, 0.7f, 0.8f, -0.3f);
    sdf_translate(d2, -0.6f, +0.5f, 0.2f);
    sdf_node_t *d = sdf_union(d1, d2);
    return d;
}
// Simple binary CSG models: subtraction, union and intersection of
// two primitives.
sdf_node_t *model_simple11() { return sdf_subtract(sdf_box(1.0f,1.0f,1.0f), sdf_translate(sdf_sphere(0.5f), 0,1.0f,0)); }
sdf_node_t *model_simple12() { return sdf_subtract(sdf_rotate(sdf_box(1.0f,1.0f,1.0f), 0.77f,0.77f,0), sdf_sphere(0.5f)); }
sdf_node_t *model_simple13() { return sdf_subtract(sdf_box(1.0f,1.0f,1.0f), sdf_cylinder(0.5f,2.0f)); }
sdf_node_t *model_simple14() { return sdf_union(sdf_box(0.5f,0.5f,0.5f), sdf_translate(sdf_sphere(0.25f),0.5f,0,0)); }
sdf_node_t *model_simple15() { return sdf_intersect(sdf_box(0.5f,0.5f,0.5f), sdf_translate(sdf_sphere(0.25f),0.5f,0,0)); }
sdf_node_t *model_simple16() { return sdf_subtract(sdf_box(0.5f,0.5f,0.5f), sdf_translate(sdf_sphere(0.25f),0.5f,0,0)); }
29 |
30 | sdf_node_t *model_complex_2d_1()
31 | {
32 | auto *d1 = sdf_translate(sdf_box(0.9f, 0.1f, 0.5f), 0.0f, 0.5f, 0.0f);
33 | auto *d2 = sdf_translate(sdf_box(0.8f, 0.05f, 0.5f), 0.0f, -0.5f, 0.0f);
34 | auto *d3 = sdf_sphere(0.5f);
35 | auto *d4 = sdf_box(1.0f, 0.2f, 0.5f);
36 | return sdf_rotate(sdf_translate(sdf_union(sdf_union(d1, d2), sdf_subtract(d3, d4)), 0.1f, -0.2f, 0.0f), 0.0f, 0.0f, 0.2f);
37 | }
38 |
39 | sdf_node_t *model_complex02()
40 | {
41 | sdf_node_t *a1 = sdf_plane(0.3f);
42 | sdf_node_t *a2 = sdf_cylinder(0.2f, 0.3f);
43 | sdf_node_t *a3 = sdf_box(0.3f,0.3f,0.3f);
44 | sdf_node_t *a4 = sdf_sphere(0.5f);
45 | sdf_node_t *a5 = sdf_union(a1,a2);
46 | sdf_node_t *a6 = sdf_subtract(a3,a4);
47 | sdf_node_t *a7 = sdf_union(a5,a6);
48 | sdf_node_t *b1 = sdf_plane(0.3f);
49 | sdf_node_t *b2 = sdf_cylinder(0.2f, 0.3f);
50 | sdf_node_t *b3 = sdf_box(0.3f,0.3f,0.3f);
51 | sdf_node_t *b4 = sdf_sphere(0.5f);
52 | sdf_node_t *b5 = sdf_union(b1,b2);
53 | sdf_node_t *b6 = sdf_subtract(b3,b4);
54 | sdf_node_t *b7 = sdf_union(b5,b6);
55 | sdf_node_t *d = sdf_union(a7,b7);
56 | return d;
57 | }
58 |
// A classic CSG demo part at scale s: a sphere with three orthogonal
// cylindrical bores, clipped to a cube, plus a small sphere/cylinder
// cluster and a tilted cube attachment.
// NOTE(review): sdf_rotate/sdf_translate return values are discarded
// throughout — assumes in-place transformation; confirm against the
// builder header.
sdf_node_t *model_complex03()
{
    float s = 0.3f;
    sdf_node_t *d1 = sdf_sphere(1.0f*s);
    // Three bores along the three axes.
    sdf_node_t *c1 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(c1, 0,0,0);
    sdf_node_t *c2 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(c2, 1.54f,0,0);
    sdf_node_t *c3 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(c3, 0,0,1.54f);
    sdf_node_t *c12 = sdf_union(c1,c2);
    sdf_node_t *c123 = sdf_union(c12,c3);
    sdf_node_t *d2 = sdf_subtract(d1,c123);

    // Clip the bored sphere to a cube.
    sdf_node_t *b1 = sdf_box(0.74f*s,0.74f*s,0.74f*s);
    sdf_node_t *d3 = sdf_intersect(d2,b1);

    // Small sphere plus a two-cylinder cluster, tilted and shifted out.
    sdf_node_t *s2 = sdf_sphere(0.3f*s);
    sdf_node_t *c5 = sdf_cylinder(0.1f*s, 0.8f*s); sdf_rotate(c5, 1.54f,0,0);
    sdf_node_t *c6 = sdf_cylinder(0.1f*s, 0.8f*s); sdf_rotate(c6, 0,0,0);
    sdf_node_t *c56 = sdf_union(c5,c6); sdf_rotate(c56, 0.7f, 0.0f, 0.0f); sdf_translate(c56, 1.0f*s, 0.0f, 0.0f);
    sdf_node_t *s2c56 = sdf_union(s2,c56);
    sdf_node_t *d4 = sdf_union(d3, s2c56);

    // Tilted cube attachment on the opposite side.
    sdf_node_t *b2 = sdf_box(0.2f*s,0.2f*s,0.2f*s); sdf_translate(b2,-1.0f*s,0,0); sdf_rotate(b2,0.77f,0.77f,0);
    sdf_node_t *d5 = sdf_union(d4,b2);

    sdf_node_t *d = d5;
    return d;
}
86 |
// Larger variant of the bored-sphere part at scale s: three cylinders
// subtracted one at a time, clipped by six rotated planes (instead of
// a cube), with a small sphere and two thin cylinders unioned on.
// NOTE(review): sdf_rotate return values are discarded — assumes
// in-place transformation; confirm against the builder header.
sdf_node_t *model_complex04()
{
    float s = 1.5f;
    sdf_node_t *d1 = sdf_sphere(1.0f*s);
    // Three bores along the three axes.
    sdf_node_t *d2 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(d2, 0,0,0);
    sdf_node_t *d3 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(d3, 1.54f,0,0);
    sdf_node_t *d4 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(d4, 0,0,1.54f);
    sdf_node_t *d5 = sdf_subtract(d1,d2);
    sdf_node_t *d6 = sdf_subtract(d5,d3);
    sdf_node_t *d7 = sdf_subtract(d6,d4);
    // Six planes forming an (approximate) cube clip.
    sdf_node_t *d8 = sdf_plane(0.74f*s); sdf_rotate(d8, 0,0,1.54f);
    sdf_node_t *d9 = sdf_plane(0.74f*s); sdf_rotate(d9, 0,0,-1.54f);
    sdf_node_t *d10 = sdf_plane(0.74f*s); sdf_rotate(d10, 0,1.54f,0);
    sdf_node_t *d11 = sdf_plane(0.74f*s); sdf_rotate(d11, 0,-1.54f,0);
    sdf_node_t *d12 = sdf_plane(0.74f*s); sdf_rotate(d12, 0,0,0);
    sdf_node_t *d13 = sdf_plane(0.74f*s); sdf_rotate(d13, 0,0,3.14f);
    sdf_node_t *d14 = sdf_intersect(d7,d8);
    sdf_node_t *d15 = sdf_intersect(d14,d9);
    sdf_node_t *d16 = sdf_intersect(d15,d10);
    sdf_node_t *d17 = sdf_intersect(d16,d11);
    sdf_node_t *d18 = sdf_intersect(d17,d12);
    sdf_node_t *d19 = sdf_intersect(d18,d13);
    // Small sphere and two thin cylinders unioned onto the result.
    sdf_node_t *d20 = sdf_sphere(0.3f*s);
    sdf_node_t *d21 = sdf_union(d19, d20);
    sdf_node_t *d22 = sdf_cylinder(0.1f*s, 0.8f*s); sdf_rotate(d22, 1.54f,0,0);
    sdf_node_t *d23 = sdf_cylinder(0.1f*s, 0.8f*s); sdf_rotate(d23, 0,0,0);
    sdf_node_t *d24 = sdf_union(d21, d22);
    sdf_node_t *d25 = sdf_union(d24, d23);
    return d25;
}
117 |
118 | sdf_node_t *model_complex05()
119 | {
120 | float s = 1.5f;
121 | sdf_node_t *d1 = sdf_sphere(1.0f*s);
122 | sdf_node_t *d2 = sdf_cylinder(0.54f*s,1.2f*s); sdf_rotate(d2, 0,0,0);
123 | sdf_node_t *d3 = sdf_subtract(d1,d2);
124 | sdf_node_t *d4 = sdf_plane(0.44f*s); sdf_rotate(d4, 0,0,1.54f);
125 | sdf_node_t *d5 = sdf_plane(0.44f*s); sdf_rotate(d5, 0,0,-1.54f);
126 | sdf_node_t *d6 = sdf_intersect(d3,d4);
127 | sdf_node_t *d7 = sdf_intersect(d6,d5);
128 | return d7;
129 | }
130 |
// 2D chair silhouette: seat slab, two slightly splayed legs and a back
// post, assembled with unions. k scales the whole model.
sdf_node_t *model_chair1_2d()
{
    float k = 0.5f;
    sdf_node_t *seat = sdf_box(1.0f*k, 0.1f*k, 1.0f);
    sdf_node_t *leg1 = sdf_rotate(sdf_translate(sdf_box(0.1f*k, 1.0f*k, 1.0f), -1.0f*k,0,0), 0,0,-0.2f);
    sdf_node_t *leg2 = sdf_rotate(sdf_translate(sdf_box(0.1f*k, 1.0f*k, 1.0f), +1.0f*k,0,0), 0,0,+0.1f);
    sdf_node_t *legs = sdf_translate(sdf_union(leg1, leg2), 0,-1.0f*k,0);
    sdf_node_t *back = sdf_rotate(sdf_translate(sdf_box(0.1f*k, 1.0f*k, 1.0f), 1.0f*k,1.0f*k,0), 0,0,-0.1f);
    sdf_node_t *seat_and_legs = sdf_union(seat, legs);
    sdf_node_t *chair = sdf_union(seat_and_legs, back);
    return chair;
}
143 |
// Second 2D chair variant: tilted seat, two legs clipped against a
// rotated plane (flat floor cut), and a cross-bar between the legs.
sdf_node_t *model_chair2_2d()
{
    float k = 0.5f;
    sdf_node_t *seat = sdf_rotate(sdf_box(0.8f*k, 0.15f*k, 1.0f), 0,0,0.2f);
    sdf_node_t *leg1 = sdf_rotate(sdf_translate(sdf_box(0.1f*k, 1.0f*k, 1.0f), -0.75f*k,0,0), 0,0,-0.05f);
    sdf_node_t *leg2 = sdf_rotate(sdf_translate(sdf_box(0.1f*k, 1.0f*k, 1.0f), +0.8f*k,0.05f*k,0), 0,0,0.1f);
    sdf_node_t *mid = sdf_translate(sdf_box(0.8f*k, 0.05f*k, 1.0f), 0,-1.0f*k,0);
    // Intersect legs with a half-space so both end on the same floor line.
    sdf_node_t *legs = sdf_intersect(sdf_translate(sdf_union(leg1, leg2), 0,-1.0f*k,0),
                                     sdf_rotate(sdf_plane(1.9f*k), 0,0,-3.14f/2.0f));
    sdf_node_t *seat_and_legs = sdf_union(seat, legs);
    sdf_node_t *chair = sdf_union(seat_and_legs, mid);
    return chair;
}
157 |
158 | sdf_node_t *model_translated_sphere()
159 | {
160 | return
161 | sdf_translate(sdf_sphere(1.0f), -0.5f,0.0f,0.0f);
162 | }
163 |
164 | sdf_node_t *model_intersection()
165 | {
166 | return
167 | sdf_intersect(sdf_translate(sdf_sphere(0.5f), -0.2f,0.0f,0.0f),
168 | sdf_translate(sdf_sphere(0.5f), +0.2f,0.0f,0.0f));
169 | }
170 |
171 | sdf_node_t *model_two_spheres()
172 | {
173 | return
174 | sdf_union(sdf_translate(sdf_sphere(0.1f), -0.5f,0.0f,0.0f),
175 | sdf_translate(sdf_sphere(0.5f), +0.3f,0.0f,0.0f));
176 | }
177 |
178 | sdf_node_t *model_two_spheres_equal()
179 | {
180 | return
181 | sdf_union(sdf_translate(sdf_sphere(0.3f), -0.4f,0.0f,0.0f),
182 | sdf_translate(sdf_sphere(0.3f), +0.4f,0.0f,0.0f));
183 | }
184 |
185 | sdf_node_t *model_four_spheres()
186 | {
187 | return
188 | sdf_union(
189 | sdf_union(
190 | sdf_translate(sdf_sphere(0.2f), 0.0f,0.7f,0.0f),
191 | sdf_translate(sdf_sphere(0.2f), 0.0f,-0.7f,0.0f)),
192 | sdf_union(
193 | sdf_translate(sdf_sphere(0.4f), -0.5f,0.0f,0.0f),
194 | sdf_translate(sdf_sphere(0.4f), +0.5f,0.0f,0.0f)));
195 | }
196 |
197 | sdf_node_t *model_scissor()
198 | {
199 | return
200 | sdf_union(
201 | sdf_translate(sdf_sphere(0.4f), 0.0f,0.6f,0.0f),
202 | sdf_intersect(
203 | sdf_translate(sdf_sphere(0.8f), -0.5f,0.0f,0.0f),
204 | sdf_translate(sdf_sphere(0.8f), +0.5f,0.0f,0.0f)));
205 | }
206 |
207 | sdf_node_t *model_fillet()
208 | {
209 | return
210 | sdf_union
211 | (
212 | sdf_translate(sdf_sphere(0.25f), 0.25f,0.25f,0.0f),
213 | sdf_intersect
214 | (
215 | sdf_rotate(sdf_plane(0.53f), 0.0f,0.0f,3.1415f/4.0f),
216 | sdf_box(0.5f, 0.5f, 0.5f)
217 | )
218 | );
219 | }
220 |
221 | sdf_node_t *model_two_box()
222 | {
223 | return
224 | sdf_union
225 | (
226 | sdf_translate(sdf_box(0.55f,0.05f,1.0f), 0.25f,0.5f,0.0f),
227 | sdf_translate(sdf_box(0.05f,0.55f,1.0f), -0.25f,0.0f,0.0f)
228 | );
229 | }
230 |
231 | sdf_node_t *model_two_box_unequal()
232 | {
233 | return
234 | sdf_union
235 | (
236 | sdf_translate(sdf_box(0.35f,0.05f,1.0f), 0.15f,0.5f,0.0f),
237 | sdf_translate(sdf_box(0.05f,0.55f,1.0f), -0.25f,0.0f,0.0f)
238 | );
239 | }
240 |
241 | sdf_node_t *model_offset_box()
242 | {
243 | return sdf_rotate(sdf_translate(sdf_box(0.5f,0.5f,0.5f), 0.2f, -0.2f, 0.0f), 0.0f, 0.0f, -0.5f);
244 | }
245 |
// Two poses of the same scene (a two-box cluster plus a sphere) for
// motion tests; 'which' selects the pose. The branches differ only in
// the cluster's final placement: pose 0 translates it, pose 1 also
// applies an extra rotation.
sdf_node_t *model_motion0(int which)
{
    if (which == 0) {
        auto *d1 = sdf_box(0.3f, 0.3f, 0.3f);
        auto *d2 = sdf_box(0.2f, 0.2f, 0.2f);
        d2 = sdf_rotate(sdf_translate(d2, +0.3f, +0.2f, 0.0f), 0.0f, 0.0f, 0.3f);
        auto *d5 = sdf_union(d1, d2);
        d5 = sdf_rotate(d5, 0.0f, 0.0f, 0.2f);
        d5 = sdf_translate(d5, 0.45f, -0.5f, 0.0f);
        auto *d6 = sdf_sphere(0.3f);
        d6 = sdf_translate(d6, -0.4f, +0.2f, 0.0);
        return sdf_union(d5, d6);
    } else {
        auto *d1 = sdf_box(0.3f, 0.3f, 0.3f);
        auto *d2 = sdf_box(0.2f, 0.2f, 0.2f);
        d2 = sdf_rotate(sdf_translate(d2, +0.3f, +0.2f, 0.0f), 0.0f, 0.0f, 0.3f);
        auto *d5 = sdf_union(d1, d2);
        d5 = sdf_rotate(d5, 0.0f, 0.0f, 0.2f);
        d5 = sdf_rotate(sdf_translate(d5, 0.45f, -0.1f, 0.0f), 0.0f, 0.0f, -0.3f);
        auto *d6 = sdf_sphere(0.3f);
        d6 = sdf_translate(d6, -0.4f, +0.2f, 0.0);
        return sdf_union(d5, d6);
    }
}
270 |
// Two poses of a three-box cluster plus a sphere for motion tests;
// 'which' selects the pose. The branches differ only in the cluster's
// final translation.
sdf_node_t *model_motion1(int which)
{
    if (which == 0) {
        auto *d1 = sdf_box(0.3f, 0.3f, 0.3f);
        auto *d2 = sdf_box(0.2f, 0.2f, 0.2f);
        auto *d3 = sdf_box(0.2f, 0.2f, 0.2f);
        d2 = sdf_rotate(sdf_translate(d2, +0.3f, +0.2f, 0.0f), 0.0f, 0.0f, 0.3f);
        d3 = sdf_rotate(sdf_translate(d3, -0.3f, -0.2f, 0.0f), 0.0f, 0.0f, 0.7f);
        auto *d4 = sdf_union(d2, d3);
        auto *d5 = sdf_union(d1, d4);
        d5 = sdf_rotate(d5, 0.0f, 0.0f, 0.2f);
        d5 = sdf_translate(d5, 0.4f, -0.2f, 0.0f);
        auto *d6 = sdf_sphere(0.3f);
        d6 = sdf_translate(d6, -0.3f, +0.2f, 0.0);
        return sdf_union(d5, d6);
    } else {
        auto *d1 = sdf_box(0.3f, 0.3f, 0.3f);
        auto *d2 = sdf_box(0.2f, 0.2f, 0.2f);
        auto *d3 = sdf_box(0.2f, 0.2f, 0.2f);
        d2 = sdf_rotate(sdf_translate(d2, +0.3f, +0.2f, 0.0f), 0.0f, 0.0f, 0.3f);
        d3 = sdf_rotate(sdf_translate(d3, -0.3f, -0.2f, 0.0f), 0.0f, 0.0f, 0.7f);
        auto *d4 = sdf_union(d2, d3);
        auto *d5 = sdf_union(d1, d4);
        d5 = sdf_rotate(d5, 0.0f, 0.0f, 0.2f);
        d5 = sdf_translate(d5, 0.45f, -0.1f, 0.0f);
        auto *d6 = sdf_sphere(0.3f);
        d6 = sdf_translate(d6, -0.3f, +0.2f, 0.0);
        return sdf_union(d5, d6);
    }
}
301 |
--------------------------------------------------------------------------------