├── CODE_OF_CONDUCT.md ├── Obsidian-Mozilla ├── API.webidl └── README.md └── README.md /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | A reminder that this issue tracker is managed by the Khronos Group. Interactions here should follow the Khronos Code of Conduct (https://www.khronos.org/developers/code-of-conduct), which prohibits aggressive or derogatory language. Please keep the discussion friendly and civil. 2 | -------------------------------------------------------------------------------- /Obsidian-Mozilla/API.webidl: -------------------------------------------------------------------------------- 1 | // main section 2 | /* Entry point: lists extensions and physical devices, and creates a device together with its queues (see ObCreatedDevice). NOTE(review): every bare `sequence` in this dump lacks an element type; presumably the type parameters were stripped during extraction - confirm against the original file. */ 3 | interface ObContext { 4 | sequence getSupportedExtensions(); 5 | any? enableExtension(DOMString name); 6 | sequence getPhysicalDevices(); 7 | ObCreatedDevice createDevice(ObPhysicalDevice device, sequence queueInfos); 8 | }; 9 | /* A queue family; its flags are expressed with ObQueueFlags constants. */ 10 | interface ObQueueFamily { 11 | readonly attribute ObQueueFlagsEnum queueFlags; 12 | }; 13 | /* A memory type; its flags are expressed with ObMemoryFlags constants (host visible and/or device local). */ 14 | interface ObMemoryType { 15 | readonly attribute ObMemoryFlagsEnum memoryFlags; 16 | }; 17 | /* Enumerates the queue families and memory types of a physical device. */ 18 | interface ObPhysicalDevice { 19 | sequence getQueueFamilies(); 20 | sequence getMemoryTypes(); 21 | }; 22 | /* Creates and destroys all device-owned objects. Destruction is explicit; per the README, the GPU process delays the actual release until the GPU is done using the resource. */ 23 | interface ObDevice { 24 | ObSwapchain createSwapchain(HTMLCanvasElement canvas, ObCanvasCreateInfo info); 25 | ObImage? acquireNextImage(ObSwapchain swapchain, unsigned long timeout, ObSemaphore semaphore, optional ObFence? fence = null); 26 | ObDeviceError? 
getError(); /* Provisional error query: the README states that the exact error mechanism is not finalized. */ 27 | ObFence createFence(boolean signalled); 28 | void resetFences(sequence fences); 29 | boolean waitForFences(sequence fences, boolean waitAll, unsigned long timeout); 30 | void destroyFence(ObFence fence); 31 | ObSemaphore createSemaphore(); 32 | void destroySemaphore(ObSemaphore semaphore); 33 | ObEvent createEvent(); 34 | void setEvent(ObEvent event); 35 | void resetEvent(ObEvent event); 36 | ObEventStatus getEventStatus(ObEvent event); 37 | void destroyEvent(ObEvent event); 38 | ObQueryPool createQueryPool(ObQueryPoolCreateInfo info); 39 | sequence getQueryPoolResults(ObQueryPool pool, unsigned short first, unsigned short count); 40 | void destroyQueryPool(ObQueryPool pool); 41 | void waitIdle(); 42 | ObPrimaryCommandBuffer createPrimaryCommandBuffer(ObQueueFamily family); 43 | ObSecondaryCommandBuffer createSecondaryCommandBuffer(ObQueueFamily family, ObSecondaryCommandBufferCreateInfo info); /* NOTE(review): ObSecondaryCommandBufferCreateInfo is not declared in the visible part of this file - confirm where it is defined. */ 44 | void destroyCommandBuffer(ObCommandBuffer buffer); 45 | ObRenderPass createRenderPass(ObRenderPassCreateInfo info); 46 | void destroyRenderPass(ObRenderPass pass); 47 | ObFramebuffer createFramebuffer(ObFramebufferCreateInfo info); 48 | void destroyFramebuffer(ObFramebuffer buffer); 49 | ObShaderModule createShaderModule((DOMString or Blob) glslTextOrSpirvBinary); 50 | void destroyShaderModule(ObShaderModule shaderModule); 51 | ObComputePipeline createComputePipeline(ObComputePipelineCreateInfo info); 52 | ObGraphicsPipeline createGraphicsPipeline(ObGraphicsPipelineCreateInfo info); 53 | void destroyPipeline(ObPipeline pipeline); 54 | ObSampler createSampler(ObSamplerCreateInfo info); 55 | void destroySampler(ObSampler sampler); 56 | ObDescriptorSetLayout createDescriptorSetLayout(sequence bindings); 57 | void destroyDescriptorSetLayout(ObDescriptorSetLayout layout); 58 | ObPipelineLayout createPipelineLayout(ObPipelineLayoutCreateInfo info); 59 | void destroyPipelineLayout(ObPipelineLayout layout); 60 | ObDescriptorPool 
createDescriptorPool(ObDescriptorPoolCreateInfo info); 61 | void destroyDescriptorPool(ObDescriptorPool pool); 62 | sequence allocateDescriptorSets(ObDescriptorPool pool, sequence layouts); 63 | void updateDescriptorSets(sequence writes, sequence copies); 64 | void freeDescriptorSets(ObDescriptorPool pool, sequence sets); 65 | ObMemory allocateMemory(ObMemoryType type, ObMemoryRequirements requirements); 66 | void freeMemory(ObMemory memory); 67 | ObBuffer createBuffer(ObBufferCreateInfo info); 68 | void uploadBuffer(ObBuffer buffer, unsigned long offset, ArrayBuffer data); 69 | ArrayBuffer downloadBuffer(ObBuffer buffer, unsigned long offset, unsigned long size); 70 | void destroyBuffer(ObBuffer buffer); 71 | void bindBufferMemory(ObBuffer buffer, ObMemory memory, unsigned long offset); 72 | void bindImageMemory(ObImage image, ObMemory memory, unsigned long offset); 73 | }; 74 | /* Swapchain created from a canvas via ObDevice.createSwapchain; exposes its dimensions and format. */ 75 | interface ObSwapchain { 76 | readonly attribute unsigned long width; 77 | readonly attribute unsigned long height; 78 | readonly attribute ObFormatEnum format; 79 | }; 80 | /* Command queue: accepts submissions (with an optional fence) and presents a swapchain. */ 81 | interface ObQueue { 82 | void submit(ObSubmitInfo info, optional ObFence? fence = null); //TODO 83 | void present(ObSwapchain swapchain, sequence semaphores); 84 | void waitIdle(); 85 | ObQueueError? 
getError(); 86 | }; 87 | /* Recording interface inherited by ObPrimaryCommandBuffer and ObSecondaryCommandBuffer. */ 88 | interface ObCommandBuffer { 89 | void begin(); 90 | void end(); 91 | void cmdSetEvent(ObEvent event, ObStageMask stages); 92 | void cmdResetEvent(ObEvent event, ObStageMask stages); 93 | void cmdWaitEvents(sequence events, ObStageMask sourceStages, ObStageMask destStages, ObMemoryBarriers barriers); 94 | void cmdBindPipeline(ObPipelineBindPointEnum bind, ObPipeline pipeline); 95 | void cmdBindDescriptorSets(ObPipelineBindPointEnum bind, ObPipelineLayout layout, sequence descriptors, sequence dynamicOffsets); 96 | void cmdPushConstants(ObPipelineLayout layout, ObShaderStage shaderStages, unsigned long offset, Blob data); 97 | void cmdBindIndexBuffer(ObBuffer buffer, unsigned long offset, ObIndexType type); 98 | void cmdDraw(unsigned long vertexCount, unsigned long instanceCount, unsigned long firstVertex, unsigned long firstInstance); 99 | void cmdDrawIndexed(unsigned long indexCount, unsigned long instanceCount, unsigned long firstIndex, unsigned long vertexOffset, unsigned long firstInstance); 100 | void cmdDrawIndirect(ObBuffer buffer, unsigned long offset, unsigned long drawCount, unsigned long stride); 101 | ObCommandBufferError? 
getError(); 102 | }; 103 | /* Adds the commands that the README lists as forbidden in secondary command buffers: command buffer execution, barriers, render pass control, queries, clears, copies and resolves. */ 104 | interface ObPrimaryCommandBuffer: ObCommandBuffer { 105 | void cmdExecuteCommands(sequence combufs); 106 | void cmdPipelineBarrier(ObStageMask sourceStages, ObStageMask destStages, ObMemoryBarriers barriers); 107 | void cmdBeginRenderPass(ObRenderPassBeginInfo info); 108 | void cmdNextSubpass(); 109 | void cmdEndRenderPass(); 110 | void cmdResetQueryPool(ObQueryPool pool, unsigned short first, unsigned short count); 111 | void cmdBeginQuery(ObQueryPool pool, unsigned short index); 112 | void cmdEndQuery(ObQueryPool pool, unsigned short index); 113 | void cmdClearColorImage(ObImage image, ObImageLayoutEnum layout, ObClearColorInfo info, sequence ranges); 114 | void cmdClearDepthStencil(ObImage image, ObImageLayoutEnum layout, ObClearDepthStencilInfo info); 115 | void cmdClearAttachments(sequence attachments, sequence rectangles); 116 | void cmdFillBuffer(ObBuffer dest, unsigned long offset, unsigned long size, unsigned long value); 117 | void cmdUpdateBuffer(ObBuffer dest, unsigned long offset, unsigned long size, ArrayBuffer data); 118 | void cmdCopyBuffer(ObBuffer src, ObBuffer dst, sequence regions); 119 | void cmdCopyImage(ObImage src, ObImageLayoutEnum srcLayout, ObImage dst, ObImageLayoutEnum dstLayout, sequence regions); 120 | void cmdCopyBufferToImage(ObBuffer src, ObImage dst, ObImageLayoutEnum dstLayout, sequence regions); 121 | void cmdCopyImageToBuffer(ObImage src, ObImageLayoutEnum srcLayout, ObBuffer dst, sequence regions); 122 | void cmdResolveImage(ObImage src, ObImageLayoutEnum srcLayout, ObImage dst, ObImageLayoutEnum dstLayout, sequence regions); 123 | }; 124 | /* Declares no members beyond those inherited from ObCommandBuffer. */ 125 | interface ObSecondaryCommandBuffer: ObCommandBuffer { 126 | }; 127 | 128 | interface ObBuffer { 129 | ObMemoryRequirements getMemoryRequirements(); 130 | ObBufferError? 
getError(); 131 | }; 132 | 133 | // stubs 134 | 135 | dictionary ObRenderPassCreateInfo { 136 | required byte stub; 137 | }; 138 | dictionary ObFramebufferCreateInfo { 139 | required byte stub; 140 | }; 141 | dictionary ObComputePipelineCreateInfo { 142 | required byte stub; 143 | }; 144 | dictionary ObGraphicsPipelineCreateInfo { 145 | required byte stub; 146 | }; 147 | dictionary ObSamplerCreateInfo { 148 | required byte stub; 149 | }; 150 | dictionary ObDescriptorSetBinding { 151 | required byte stub; 152 | }; 153 | dictionary ObPipelineLayoutCreateInfo { 154 | required byte stub; 155 | }; 156 | dictionary ObDescriptorPoolCreateInfo { 157 | required byte stub; 158 | }; 159 | dictionary ObQueryPoolCreateInfo { 160 | required byte stub; 161 | }; 162 | dictionary ObBufferCreateInfo { 163 | required byte stub; 164 | }; 165 | dictionary ObRenderPassBeginInfo { 166 | required byte stub; 167 | }; 168 | dictionary ObClearDepthStencilInfo { 169 | required byte stub; 170 | }; 171 | dictionary ObWriteDecriptorSet {}; /* NOTE(review): the name looks like a typo for ObWriteDescriptorSet - confirm how it is referenced before renaming. */ 172 | dictionary ObCopyDescriptorSet {}; 173 | dictionary ObClearAttachment {}; 174 | dictionary ObClearRect {}; 175 | dictionary ObSubresourceRange {}; 176 | dictionary ObImageResolve {}; 177 | dictionary ObBufferCopy {}; 178 | dictionary ObImageCopy {}; 179 | dictionary ObBufferImageCopy {}; 180 | typedef byte ObDeviceError; 181 | typedef byte ObQueueError; 182 | typedef byte ObCommandBufferError; 183 | typedef byte ObBufferError; 184 | typedef sequence ObClearColorInfo; 185 | typedef byte ObEventStatus; 186 | typedef byte ObQueryResult; 187 | typedef byte ObStageMask; 188 | typedef byte ObShaderStage; 189 | typedef byte ObIndexType; 190 | interface ObFence {}; 191 | interface ObSemaphore {}; 192 | interface ObEvent {}; 193 | interface ObRenderPass {}; 194 | interface ObFramebuffer {}; 195 | interface ObShaderModule {}; 196 | interface ObPipeline {}; 197 | interface ObComputePipeline: ObPipeline {}; 198 | interface ObGraphicsPipeline: ObPipeline {}; 199 | 
interface ObSampler {}; 200 | interface ObPipelineLayout {}; 201 | interface ObDescriptorSetLayout {}; 202 | interface ObDescriptorPool {}; 203 | interface ObDescriptorSet {}; 204 | interface ObQueryPool {}; 205 | interface ObMemory {}; 206 | interface ObImage {}; 207 | 208 | // structures 209 | 210 | dictionary ObCreatedDevice { 211 | ObDevice device; 212 | sequence queues; 213 | }; 214 | 215 | dictionary ObQueueCreateInfo { 216 | ObQueueFamily queueFamily; 217 | sequence queuePriorities; 218 | }; 219 | 220 | dictionary ObCanvasCreateInfo { 221 | required unsigned short minImageCount; 222 | }; 223 | 224 | dictionary ObSubmitInfo { 225 | required sequence waitSemaphores; 226 | required sequence waitDstStageMasks; 227 | required sequence commandBuffers; 228 | required sequence signalSemaphores; 229 | }; 230 | 231 | dictionary ObMemoryRequirements { 232 | required unsigned long size; 233 | required unsigned long alignment; 234 | required ObMemoryFlagsEnum memoryFlags; 235 | }; 236 | 237 | dictionary ObMemoryBarriers { 238 | required ObAccessEnum srcAccessMask; 239 | required ObAccessEnum dstAccessMask; 240 | required ObImageLayoutEnum oldLayout; 241 | required ObImageLayoutEnum newLayout; 242 | required ObSubresourceRange subresourceRange; 243 | }; 244 | 245 | // constants 246 | //Note - WebIDL validator complains: 247 | //> the Web platform is moving away from using named integer codes in the style of an enumeration, in favor of the use of strings 248 | /* The README gives the reason for integer codes: strings are avoided for enumeration types so that the rendering loop performs no GC allocations. */ 249 | typedef byte ObQueueFlagsEnum; 250 | interface ObQueueFlags { 251 | const byte GRAPHICS_BIT = 1; 252 | const byte PRESENT_BIT = 2; 253 | }; 254 | 255 | typedef byte ObMemoryFlagsEnum; 256 | interface ObMemoryFlags { 257 | const byte HOST_VISIBLE_BIT = 1; 258 | const byte DEVICE_LOCAL_BIT = 2; 259 | }; 260 | 261 | typedef byte ObImageAspectEnum; 262 | interface ObImageAspect { 263 | const byte COLOR_BIT = 1; 264 | const byte DEPTH_BIT = 2; 265 | const byte STENCIL_BIT = 4; 266 | }; 267 | 268 | typedef byte 
ObImageLayoutEnum; 269 | interface ObImageLayout { 270 | const byte UNDEFINED = 0; 271 | const byte GENERAL = 1; 272 | const byte COLOR_ATTACHMENT_OPTIMAL = 2; 273 | const byte DEPTH_STENCIL_ATTACHMENT_OPTIMAL = 3; 274 | const byte DEPTH_STENCIL_READ_ONLY_OPTIMAL = 4; 275 | const byte SHADER_READ_ONLY_OPTIMAL = 5; 276 | const byte TRANSFER_SRC_OPTIMAL = 6; 277 | const byte TRANSFER_DST_OPTIMAL = 7; 278 | const byte PREINITIALIZED = 8; 279 | const byte PRESENT_SRC = 9; 280 | }; 281 | /* NOTE(review): the stage arguments of the cmd* methods are typed ObStageMask, a byte typedef; if they are meant to carry these bits, TRANSFER_BIT (0x1000) does not fit in a byte - confirm the intended type. */ 282 | typedef long ObPipelineStageEnum; 283 | interface ObPipelineStage { 284 | //TODO 285 | const long TRANSFER_BIT = 0x1000; 286 | }; 287 | 288 | typedef byte ObPipelineBindPointEnum; 289 | interface ObPipelineBindPoint { 290 | const byte GRAPHICS = 0; 291 | const byte COMPUTE = 1; 292 | }; 293 | 294 | typedef long ObAccessEnum; 295 | interface ObAccess { 296 | //TODO 297 | const long TRANSFER_WRITE_BIT = 0x00001000; 298 | const long MEMORY_READ_BIT = 0x00008000; 299 | }; 300 | 301 | typedef long ObSampleCountEnum; 302 | interface ObSampleCount { 303 | const long S1_BIT = 1; 304 | //TODO 305 | }; 306 | 307 | typedef byte ObAttachmentLoadOpEnum; 308 | interface ObAttachmentLoadOp { 309 | const byte LOAD = 0; 310 | const byte CLEAR = 1; 311 | const byte DONT_CARE = 2; 312 | }; 313 | 314 | typedef byte ObAttachmentStoreOpEnum; 315 | interface ObAttachmentStoreOp { 316 | const byte STORE = 0; 317 | const byte DONT_CARE = 1; 318 | }; 319 | 320 | typedef byte ObPrimitiveTopologyEnum; 321 | interface ObPrimitiveTopology { 322 | //TODO 323 | const byte TRIANGLE_LIST = 3; 324 | }; 325 | 326 | typedef byte ObFormatEnum; 327 | interface ObFormat { 328 | const byte R32G32B32A32_SFLOAT = 109; 329 | }; 330 | 331 | typedef long ObBufferUsageEnum; 332 | interface ObBufferUsage { 333 | const long TRANSFER_SRC_BIT = 0x00000001; 334 | const long TRANSFER_DST_BIT = 0x00000002; 335 | const long VERTEX_BUFFER_BIT = 0x00000080; 336 | //TODO 337 | }; 338 | 339 | typedef byte ObVertexInputRateEnum; 340 | 
interface ObVertexInputRate { 341 | const byte VERTEX = 0; 342 | const byte INSTANCE = 1; 343 | }; 344 | 345 | typedef byte ObPolygonModeEnum; 346 | interface ObPolygonMode { 347 | const byte FILL = 0; 348 | const byte LINE = 1; 349 | const byte POINT = 2; 350 | }; 351 | 352 | typedef byte ObCullModeEnum; 353 | interface ObCullMode { 354 | const byte NONE = 0; 355 | const byte FRONT_BIT = 1; 356 | const byte BACK_BIT = 2; 357 | const byte FRONT_AND_BACK = 3; 358 | }; 359 | 360 | typedef byte ObFrontFaceEnum; 361 | interface ObFrontFace { 362 | const byte COUNTER_CLOCKWISE = 0; 363 | const byte CLOCKWISE = 1; 364 | }; 365 | -------------------------------------------------------------------------------- /Obsidian-Mozilla/README.md: -------------------------------------------------------------------------------- 1 | # Obsidian API Proposal 2 | 3 | 21st March 2017 - [Dzmitry Malyshau](mailto:dmalyshau@mozilla.com), [Jeff Gilbert](mailto:jgilbert@mozilla.com) 4 | 5 | 6 | ## Introduction 7 | 8 | This is Mozilla's draft proposal for the GPU API for the Web, called _Obsidian_. It is a low-level API that provides the maximum feature set of the GPU to web applications. The API is designed with WebAssembly, modern GPUs, and multi-threaded environments in mind. 9 | 10 | _Obsidian_ is a temporary code name, signifying the Vulkan roots of the API: 11 | > Obsidian is a naturally occurring volcanic glass formed as an extrusive igneous rock. 12 | 13 | This proposal is not a specification. It includes reasoning for the design decisions, draft WebIDL and a bit of example code. We don't aim to provide a complete specification; instead, we want this proposal to represent our vision of the future API in the working group discussions, a vision of rich graphics on the Web powered by a low-level explicit API. 14 | 15 | The contents are split into the following sections: 16 | 1. introduction and philosophy 17 | 2. design details, differences from Vulkan/D3D12 18 | 3. 
synchronization and memory model 19 | 4. API and examples 20 | 21 | ### History 22 | 23 | The need for a more efficient graphics API for the Web is clear. What is not clear is the look and design of such an API. Previous mailing list discussions were split into two groups: one that considered Metal as a good baseline for the new API, due to its simpler and higher-level abstraction, which is easier to provide safely on the Web. Another group saw Vulkan as an ultimately portable API that just needs to be re-defined on the Web with some feature cuts. 24 | 25 | We built a prototype into Servo, providing a Metal-like API in WebIDL (for JavaScript), backed by Vulkan. It exposed the problems of communicating with the graphics backend via the process barrier, transcoding the commands between different APIs, and briefly touched on the security aspect. We found this approach to be feasible, but we also realized that this is a unique opportunity for the Web to get so much more. 26 | 27 | Providing render passes, for example, would open the door to efficient graphics on mobile. Having secondary command buffers can vastly increase the fidelity of the content being rendered. Even exposing the resource layout can make execution on AMD hardware (in particular) more predictable. And even if it imposes a bigger CPU overhead for sanitizing the commands, it gets faster when it reaches the GPU. Thus, we decided to focus on the capability of the API instead of the simplicity. 28 | 29 | Within the next-gen APIs, Vulkan is the most feature-rich, and Metal is the simplest. The advanced shader syntax (e.g. pointer semantics) of Metal doesn't map well to other APIs. Thus, we started with Vulkan and SPIR-V, and for each feature (that it has over other APIs), we evaluated the cost of emulating it on the other APIs versus the benefit of exposing it. 30 | 31 | ### Main concepts 32 | 33 | - _Command Buffer_ - an opaque object that stores hardware commands, such as binding resources and drawing. 
Recording the command buffers simultaneously on multiple threads and executing them on different command queues is essential for efficient parallelization of work on both CPU and GPU. 34 | - _Pipeline State Object_ - an object encapsulating the shader program as well as all the state that depends on it. Keeping all that state together allows early validation by the browser as well as the native API, which can make sure all the internally used shader code is ready for the state to be used for rendering. 35 | - _Descriptor Set_ - an object storing a set of descriptors to be used by shaders. Reduces the overhead of binding and validation of multiple resource descriptors. 36 | - _Render Pass_ - a definition of a rendering pass over a particular set of surfaces. It may consist of multiple sub-passes that are very efficient to render on tile-based hardware architectures. 37 | 38 | ### Platform 39 | 40 | We believe in the power of the Web platform. The API's main goal is providing enhanced GPU capabilities to the Web. If some of the features, like tessellation, are not available on a particular platform, we reserve the right to either suggest an emulation path, or gate the feature by a run-time flag available to applications. 41 | 42 | Focusing on the platform means that we are less concerned about the difficulty of using the API directly. We expect 3rd parties will build higher-level interfaces on top of the new API, as has happened for WebGL with Three.js, among others. 43 | 44 | ### Open standards 45 | 46 | Among the next-gen desktop APIs we see Vulkan as the most capable, portable, and open. Thus, our work can be seen as a reduction of Vulkan that would make sense for the Web. We believe that basing the API on Vulkan would let us focus on the missing parts instead of bike-shedding the function names. 47 | 48 | ### Design constraints 49 | 50 | The API has to be efficiently implementable in at least Vulkan and Metal, and likely D3D12 as well. 
Implementability on D3D11 and OpenGL is not a constraint. 51 | 52 | The API has to execute efficiently on WebAssembly and in a multi-threaded environment. That means no GC allocations during the rendering loop in order to avoid garbage collection pauses. Thus, we don't use strings for the enumeration types, and rendering routines don't create new objects. 53 | 54 | 55 | ## Details 56 | 57 | The work flow we followed involved taking the Vulkan specification and evaluating features that can be ported to the Web. The main points of divergence are: 58 | 59 | - undefined behavior is made secure, and minimized when possible 60 | - restricted pipeline caches 61 | - restricted secondary command buffers 62 | - everything is internally synchronized 63 | 64 | ### Object creation, destruction, and error handling 65 | 66 | Creating an object (e.g. buffer, descriptor set, etc.) returns immediately with an opaque identifier. The client can use this result right away for any dependent operations. The GPU process will ensure that the corresponding native object is created before it's used. If an error happens during the object creation, any operations involving this object will result in an error. We haven't finalized the exact error query mechanism, and we don't consider it vital to our core idea. For the purpose of this proposal, we added `getError` functions to some of the interfaces, allowing the client to check for a failure and react accordingly. 67 | 68 | The destruction of the objects is explicit from the client's point of view. The GPU process tracks all the real usage of a resource, and will delay the actual release of the resource until the GPU is done using it. 69 | 70 | ### Descriptor sets 71 | 72 | Vulkan and D3D12 expose descriptor sets, which group resources ahead of time, allowing the application to bind them efficiently. 
These native APIs impose different restrictions on the contents of the descriptor sets, and we want to provide the intersection of their features: 73 | 74 | - samplers can not be mixed with any other resources in the same descriptor set 75 | - no descriptor arrays 76 | 77 | On Metal, binding a descriptor set would translate to one or more calls of binding the resources (e.g. `setVertexBuffers`). 78 | 79 | ### Multiple render passes 80 | 81 | Declaring multiple sub-passes inside a render pass is a form of providing a part of your rendering dependency graph to the driver and the hardware. It allows the results of one pixel shader to be consumed by a different one without taking a trip to the graphics memory. This is especially important on tiled architectures. For example, during the "Vulkan Game Development on Mobile" session at GDC, Hans-Kristian (of ARM) showed 30% FPS improvement and 80% bandwidth reduction from using sub-passes in a deferred renderer, when testing on Galaxy S7. 82 | 83 | We expose the sub-passes similarly to Vulkan. On D3D12 and Metal, the pass dependencies would have to be emulated as texture parameters to the shaders, and the pass inputs replaced by regular texture fetching. The shader patching would be done before SPIR-V is converted to HLSL and MSL. Metal also has [storageModeMemoryless](https://developer.apple.com/reference/metal/mtlresourceoptions/1649224-storagemodememoryless) that can be used to store temporary sub-pass results. 84 | 85 | ### Resource layouts 86 | 87 | Vulkan and D3D12 encapsulate the interior mutability of resources in their layouts/states. Whenever a resource is used on GPU, the user is expected to provide the current layout for it, which affects the way GPU treats the resource. If we were to track the layouts automatically, we'd face the following performance issues: 88 | 89 | 1. In many cases, there are multiple compatible layouts (e.g. `ObImageLayout.GENERAL` can be used anywhere). 
We'd need some sort of heuristic to guess the best layout for a particular resource usage, which can not be as precise as the user specifying one directly. 90 | 2. Tracking the current layout of a resource is non-trivial, especially if you consider a resource used simultaneously by multiple command buffers that are submitted to different command queues. This is the CPU overhead we'd like to avoid when recording a command buffer. 91 | 3. The user may know in advance what layout a resource is going to be used with, so they will be able to insert a memory barrier (for the layout transition) ahead of time, reducing the possible GPU stalls on waiting for that barrier. 92 | 93 | Q: since we are going to keep track of the objects for the purpose of their destruction, maybe it's not that much overhead to track the current resource layouts at the same time? 94 | - it is still an overhead, which becomes especially complicated when tracking comes from multiple queues and command buffers 95 | - this is not a problem for resource lifetime tracking, since reference counting is additive 96 | 97 | Given no straightforward way to hide the layout mutability without compromising performance, we decided to expose this feature directly. In most cases, we are just going to pass the requested layouts down to the underlying API and guarantee that any reads/writes to this resource would, at worst, have undefined values, and would only touch the memory allocated for this resource. 98 | 99 | Controlling the layout transitions directly makes the GPU performance more predictable. There is also evidence that a higher-level abstraction on the application side can handle the transitions efficiently: see Frostbite's [frame graph](http://www.frostbite.com/2017/03/framegraph-extensible-rendering-architecture-in-frostbite/). 100 | 101 | Note: Metal, OpenGL, and Direct3D11 do not expose the resource layouts, thus relevant browser implementations are free to just ignore the user-provided layout transitions. 
Tracking and validating resource layouts would still be needed at the browser level, however. 102 | 103 | ### Undefined behavior 104 | 105 | Undefined behavior is important to allow the graphics hardware and driver implementations to perform optimizations that rely on the input data/parameters being valid. However, we can't allow truly undefined behavior due to potential exploitability. Instead, we classify the types of such behavior and only allow restricted forms of it (with localized side effects) in places where enforcing a particular behavior would compromise the performance. For example, when a particular operation on a resource can lead to undefined behavior, our goal is to ensure that no memory outside of the allocated region (for this resource) is accessed, and the contents of the allocated memory are initialized. 106 | 107 | Note that we do not propose undefined behavior in any of the valid use case scenarios. What we want to constrain are the consequences of an application using the API incorrectly and running under a non-debugging session, thus with limited validation enabled. 108 | 109 | We could classify the undefined behavior into the following classes: 110 | 111 | 1. unrestricted. Example: loading a value outside of the resource memory. This could mean a driver crash, HW restart, access to non-owned memory, or anything else. This class poses a security risk that we can't allow for the Web API. 112 | 2. possible GPU hangs/crashes. Example: infinite loop in the shader. This is inconvenient, and we would prefer limiting the cases to a minimum, but essentially we allow this, since it doesn't pose a security risk. 113 | 3. undefined value. Example: accessing a resource with mismatched layout. We allow this behavior with one caveat: all resources imply a defined initial state (contrary to native APIs), so the value would never come from random GPU memory. 
We also recommend a validation layer in the browsers to detect and handle all occurrences of this behavior. 114 | 115 | For the resource layouts, for example, we are working with hardware vendors, Microsoft, and Khronos to clarify the behavior (of handling mismatched layouts) in the native APIs. 116 | 117 | ### Shading language 118 | 119 | We believe taking SPIR-V for the shading language is a straightforward and efficient solution for the Web, because: 120 | 121 | - it's an open, portable standard, already used for Vulkan and OpenCL 122 | - it maps well to the latest hardware features and has debugging meta-data 123 | - it can be transformed to/from LLVM, allowing the source to be written in a large set of programming languages 124 | - it comes with a wide range of tools for reflection, optimization, and conversion to/from other shading languages, such as HLSL and GLSL 125 | 126 | In order to guarantee secure execution on GPU, we'll need to validate the provided SPIR-V binaries and add extra checks or return an error if the code attempts to use unsafe features that we are unable to port. Here is an incomplete list of spots that we would need to patch in SPIR-V: 127 | 128 | - bounds checks/clamps for buffer/image access, if the underlying context does not provide robustness guarantees (part of Vulkan [device features](https://www.khronos.org/registry/vulkan/specs/1.0-wsi_extensions/html/vkspec.html#features), default in D3D11) 129 | - sub-pass inputs conversion to texture fetches, when the native backend doesn't support them natively 130 | - TODO 131 | 132 | Concerns about SPIR-V: 133 | 1. Non-human-readable: 134 | - similarly to WebAssembly, we believe that a machine-readable binary representation is the most efficient. It can be converted to a human-readable format by a tool in order to assist debugging. 135 | - the API can also accept GLSL specified inside the document, to be converted to SPIR-V using `glslangValidator` 136 | 2. 
Large binary size: 137 | - it is already on Khronos' radar to address, plus there are community projects like SMOL-V 138 | - transferring the shader code is not expected to be a major network bottleneck, compared to buffer and texture data 139 | 140 | ### Pipeline caches 141 | 142 | Since we inspect and patch the shader code before passing it to GPU, we can't allow unknown pre-compiled binaries to be provided in the pipeline cache. In the worst case, a rogue shader can be used to exploit a driver bug and hack the system, posing a security risk. Thus, we are not providing any functions to pre-initialize a pipeline cache with data, or to get the data from cache. We do provide the pipeline caches for temporary use by an application, since they can drastically reduce the time spent on pipeline creation. 143 | 144 | ### Secondary command buffers 145 | 146 | Secondary command buffers (SCB) in Vulkan are useful for recording multiple command buffers simultaneously to be executed within sub-passes of a single primary command buffer pass. D3D12 has a similar concept, called "bundle", allowing the application to pre-record multiple draw calls for later re-use during the frame rendering. Early [tests](https://www.slideshare.net/DevCentralAMD/introduction-to-dx12-by-ivan-nevraev) by Microsoft showed around 30% CPU time reduction on Xbox from using bundles. 147 | 148 | These features have slightly different goals and restrictions, but we propose to expose them via a single interface. 
To achieve that, we restrict SCB in the following way: 149 | 150 | - forbidden operations: clears, copies, command buffer executions, barriers, resolves, queries, render pass/target setups 151 | - a render pass, sub-pass index, and a framebuffer are provided for the SCB creation 152 | - you can only execute SCB inside a render pass 153 | - all the state that SCB can not change is inherited, no state is leaked after execution 154 | - it can be executed multiple times 155 | 156 | The backend implementations would have to be smart about preserving the behavior: 157 | 158 | - Vulkan: use the native SCB and insert the calls to set up the framebuffer and render pass for the state initialization at the beginning of SCB. 159 | - D3D12: use the Bundle, reset the state back at the end of recording. 160 | - Metal: emulate the recording. Some sort of emulated command buffer is already required by the implementation in order to support command buffer re-submission (which native Metal doesn't have), so this logic just needs to be re-used for SCB. 161 | 162 | 163 | ## Synchronization 164 | 165 | ### CPU-CPU 166 | 167 | The Vulkan API considers most of the objects to be externally synchronized (`VkCommandBuffer`, `VkQueue`, etc.). This means that the user is responsible for synchronizing the access to these objects from multiple threads. 168 | 169 | The story of multi-threading on the client side of the Web is not finished. There are WebWorkers, and WebAssembly will eventually get its own threads. For now, we designate all objects as internally synchronized. 170 | 171 | ### CPU-GPU 172 | 173 | The `ObFence` is the main primitive for synchronizing the CPU with the GPU. The API allows signalling a fence upon finishing execution of a command buffer, which the client can then wait on. Similarly, `ObDevice` and `ObQueue` objects have `waitIdle` methods. 174 | 175 | The `ObEvent` can be used to stop a command buffer execution on the GPU until an event is fired by the client. 
It can also be polled by the client (but not waited upon) in case a GPU operation is expected to fire it. 176 | 177 | ### GPU-GPU 178 | 179 | There are many levels of communication between GPU objects: 180 | 181 | - `ObSemaphore` allows synchronizing the resource access across multiple queues 182 | - `ObEvent` is a fine-grained primitive that can also be used to synchronize different command buffers 183 | - _Pipeline Barrier_ - provides synchronization between different parts of the GPU pipeline within a command buffer 184 | - _Render Passes_ - can encode dependency information between sub-passes and transition the resource layouts 185 | 186 | 187 | ## Memory model 188 | 189 | Explicit memory allocation and re-use between resources is one of the key features of next-gen APIs (Vulkan and D3D12) for achieving a performance advantage over the older APIs. 190 | 191 | We expose different memory types available to a device and require the memory to be allocated and bound to a resource before it can be used. A memory type can be visible to the host, be device local, or both. Any GPU operation on a resource (such as including it in a descriptor set, or in a framebuffer) requires its memory to be device local. Failure to satisfy this requirement results in an error, and the rendering operations are discarded. 192 | 193 | ### Aliasing 194 | 195 | Memory aliasing is useful to reduce the memory footprint of the application. For example, the Frostbite engine achieved an almost 50% reduction in render target memory usage by introducing the [transient resource system](http://www.frostbite.com/2017/03/framegraph-extensible-rendering-architecture-in-frostbite/). Obsidian does not allow using multiple resources pointing to aliased memory ranges at the same time. Doing so would result in undefined contents of the resources (see type 3 in the undefined behavior section). In other words, aliasing can be either temporal or spatial, but not both.
Thus, we can safely emulate the aliasing on higher-level APIs like Metal and D3D11 by allocating the memory for each resource independently. 196 | 197 | ### CPU-GPU interaction 198 | 199 | In order to copy the resource data between the host memory and device local memory, the following transfer operations can be recorded to a command buffer: `cmdCopyBuffer`, `cmdCopyBufferToImage`, and `cmdCopyImageToBuffer`. The temporary resource placed on the host memory is called a "staging" resource. The client can exchange the data with a staging buffer using `uploadBuffer` and `downloadBuffer` functions of the `ObDevice`. 200 | 201 | ### Initial data 202 | 203 | The contents of all resources are initialized to 0 upon allocation. The implementation is free to skip the pre-initialization if it can prove that the resource memory is going to be completely overwritten by a transfer or draw command. For example, an allocation can be followed by an `uploadBuffer` call specifying the full range, which would override the zeroed contents completely. It makes no difference from the application point of view. 204 | 205 | ### Data races 206 | 207 | Users are required to synchronize the access to memory (with relevant primitives described in the Synchronization section). Failure to accomplish this may lead to data races for read-vs-write and write-vs-write scenarios. We consider those data races secure, since the memory is guaranteed to be initialized, and the resulting data always comes from the user provided/generated content. This scenario falls into type 3 undefined behavior. 208 | 209 | ### Out of bounds 210 | 211 | Any read of a resource outside of the designed bounds will return an undefined value within the resource boundaries. Any write out of bounds will be discarded. This is achieved by patching SPIR-V code before uploading it to GPU and/or enabling the robustness features of the backend API. 
212 | 213 | ### Security 214 | 215 | When a resource is bound to an allocated memory region, the implementation can check that the resource is completely contained within the memory bounds and trigger an error otherwise. Any operations with a resource that is not bound to memory are ignored, and errors are produced. Allocated memory is considered initialized, and simultaneous aliased access falls into "undefined content" category, which guarantees that no data from other applications can be leaked in. 216 | 217 | The browser implementation tracks all resources bound to a particular memory region. The `freeMemory` operation on this region will only succeed if there are no resources that are bound to it. 218 | 219 | 220 | ## API 221 | 222 | ### Initialization 223 | 224 | The context obtained from [Document](https://developer.mozilla.org/en/docs/Web/API/Document) is similar to [vkInstance](https://www.khronos.org/registry/vulkan/specs/1.0/man/html/VkInstance.html): it doesn't provide a single GPU device, but instead allows the user to query available GPUs and work with them manually. The browser can filter out the physically available devices, based on settings, available hardware, and the active power modes. Creating a GPU device also produces the command queues that can execute command buffers. 225 | 226 | The user creates a swap chain manually as well by providing the [Canvas](https://developer.mozilla.org/en/docs/Web/API/Canvas) object. This is different from WebGL, where the context is obtained from the `Canvas`. Disconnecting the context from Canvas allows managing multiple canvases, potentially using different devices and command queues. 227 | 228 | ### WebIDL 229 | 230 | _Obsidian_ is an object-oriented API [defined in WebIDL](API.webidl) for WebAssembly and JavaScript code. All related types have an `Ob` prefix (for the lack of proper namespaces in WebIDL). This convention is to be revised when we get closer to formalizing the specification. 
231 | 232 | Enumeration values as well as flags are defined as integer constants inside the corresponding interfaces, e.g. `ObPipelineStage.TRANSFER_BIT`. Using integers instead of strings (which are recommended for JavaScript-aimed WebIDL interfaces) would allow for more efficient WebAssembly interaction. 233 | 234 | We presently use WebIDL dictionaries in place of structs in the function parameters. While the current WebAssembly implementations de-route to JavaScript for handling these, we've been assured by the WebAssembly team that the improvements are planned to map the dictionaries directly to structures, thus making them efficient for frequent in-frame function calls. Direct bindings are especially important for the native WASM threads, given that they will not have access to JavaScript. 235 | 236 | 237 | ## Examples 238 | 239 | ### Screen clear 240 | 241 | ```js 242 | var context = document.getContext("obsidian-0.1"); 243 | 244 | // init device, queue, swapchain, and command buffer 245 | var deviceProto = context.getPhysicalDevices()[0]; 246 | var queueFamily = deviceProto.getQueueFamilies().find(function(family) { 247 | return (family.queueFlags & ObQueueFlags.GRAPHICS_BIT) && 248 | (family.queueFlags & ObQueueFlags.PRESENT_BIT); 249 | }); 250 | var {device, queues: [queue]} = context.createDevice(deviceProto, [ 251 | { 252 | queueFamily: queueFamily, 253 | queuePriorities: [1.0], 254 | } 255 | ]); 256 | 257 | var canvas = document.getElementById("canvas"); 258 | var swapchain = device.createSwapchain(canvas, { 259 | minImageCount: 2, 260 | }); 261 | 262 | var frameBeginSemaphore = device.createSemaphore(); 263 | var frameEndSemaphore = device.createSemaphore(); 264 | var commandBuffer = device.createPrimaryCommandBuffer(queueFamily); 265 | 266 | // render a frame 267 | var frameImage = device.acquireNextImage(swapchain, frameBeginSemaphore); 268 | commandBuffer.begin(); 269 | 270 | var subresourceRange = { 271 | aspectMask: ObImageAspect.COLOR_BIT, 272 | 
baseMipLevel: 0, 273 | levelCount: 1, 274 | baseArrayLayer: 0, 275 | layerCount: 1, 276 | }; 277 | commandBuffer.cmdPipelineBarrier(ObPipelineStage.TRANSFER_BIT, ObPipelineStage.TRANSFER_BIT, [ 278 | { 279 | srcAccessMask: ObAccess.MEMORY_READ_BIT, 280 | dstAccessMask: ObAccess.TRANSFER_WRITE_BIT, 281 | oldLayout: ObImageLayout.UNDEFINED, 282 | newLayout: ObImageLayout.TRANSFER_DST_OPTIMAL, 283 | subresourceRange: subresourceRange, 284 | } 285 | ]); 286 | commandBuffer.cmdClearColorImage(frameImage, ObImageLayout.TRANSFER_DST_OPTIMAL, [0.1, 0.2, 0.3, 1.0], [subresourceRange]); 287 | commandBuffer.cmdPipelineBarrier(ObPipelineStage.TRANSFER_BIT, ObPipelineStage.TRANSFER_BIT, [ 288 | { 289 | srcAccessMask: ObAccess.TRANSFER_WRITE_BIT, 290 | dstAccessMask: ObAccess.MEMORY_READ_BIT, 291 | oldLayout: ObImageLayout.TRANSFER_DST_OPTIMAL, 292 | newLayout: ObImageLayout.PRESENT_SRC, 293 | subresourceRange: subresourceRange, 294 | } 295 | ]); 296 | 297 | commandBuffer.end(); 298 | queue.submit({ 299 | waitSemaphores: [frameBeginSemaphore], 300 | waitDstStageMasks: [ObPipelineStage.TRANSFER_BIT], 301 | commandBuffers: [commandBuffer], 302 | signalSemaphores: [frameEndSemaphore], 303 | }); 304 | queue.present(swapchain, [frameEndSemaphore]); 305 | ``` 306 | 307 | ### Specifying shaders in GLSL 308 | 309 | While SPIR-V is the shader format of this API, the user can provide GLSL code in the document to be automatically converted to SPIR-V using [glslangValidator](https://github.com/KhronosGroup/glslang): 310 | 311 | ```html 312 | 325 | 335 | ``` 336 | 337 | ### Triangle state preparation 338 | 339 | ```js 340 | var renderPass = device.createRenderPass({ 341 | attachments: [ 342 | { 343 | format: swapchain.format, 344 | samples: ObSampleCount.S1_BIT, 345 | loadOp: ObAttachmentLoadOp.CLEAR, 346 | storeOp: ObAttachmentStoreOp.STORE, 347 | initialLayout: ObImageLayout.UNDEFINED, 348 | finalLayout: ObImageLayout.PRESENT_SRC, 349 | }, 350 | ], 351 | subpasses: [ 352 | { 353 |
pipelineBindPoint: ObPipelineBindPoint.GRAPHICS, 354 | inputAttachments: [], 355 | colorAttachments: { 356 | 0 => ObImageLayout.COLOR_ATTACHMENT_OPTIMAL, 357 | }, 358 | preserveAttachments: [], 359 | } 360 | ], 361 | dependencies: [], 362 | }); 363 | //Note: for rendering more than one frame, we'll need a framebuffer per swap image 364 | var framebuffer = device.createFramebuffer({ 365 | renderPass: renderPass, 366 | attachments: [frameImage], 367 | width: swapchain.width, 368 | height: swapchain.height, 369 | layers: 1, 370 | }); 371 | //Note: providing GLSL text arguments to `createShaderModule` 372 | var vsModule = device.createShaderModule(document.getElementById("shader-vs").textContent); 373 | var fsModule = device.createShaderModule(document.getElementById("shader-fs").textContent); 374 | var pipelineLayout = device.createPipelineLayout({ 375 | descriptorSetLayouts: [], 376 | pushConstantRanges: [], 377 | }); 378 | var [pipeline] = device.createGraphicsPipelines([ 379 | { 380 | stages: { 381 | ObShaderStage.VERTEX_BIT => { 382 | module: vsModule, 383 | name: "main", 384 | }, 385 | ObShaderStage.FRAGMENT_BIT => { 386 | module: fsModule, 387 | name: "main", 388 | }, 389 | }, 390 | vertexInputState: { 391 | vertexBindingDescriptions: [], 392 | vertexAttrbuteDescriptions: [], 393 | }, 394 | inputAssemblyState: { 395 | topology: ObPrimitiveTopology.TRIANGLE_LIST, 396 | }, 397 | viewportState: { 398 | viewports: [ 399 | { 400 | x: 0, 401 | y: 0, 402 | width: swapchain.width, 403 | height: swapchain.height, 404 | } 405 | ], 406 | scissors: [ 407 | { 408 | offset: [0, 0], 409 | extent: [swapchain.width, swapchain.height], 410 | } 411 | ], 412 | }, 413 | rasterizationState: { 414 | polygonMode: ObPolygonMode.FILL, 415 | cullMode: ObCullMode.BACK_BIT, 416 | frontFace: ObFrontFace.COUNTER_CLOCKWISE, 417 | }, 418 | multisampleState: { 419 | rasterizationSamples: ObSampleCount.S1_BIT, 420 | }, 421 | colorBlendState: { 422 | logicOpEnable: false, 423 | attachments: [ 424 
| { 425 | blendEnable: false, 426 | } 427 | ], 428 | blendConstants: [0.0, 0.0, 0.0, 0.0], 429 | }, 430 | layout: pipelineLayout, 431 | renderPass: renderPass, 432 | subpass: 0, 433 | } 434 | ]); 435 | ``` 436 | 437 | ### Triangle rendering 438 | 439 | ```js 440 | commandBuffer.cmdBeginRenderPass({ 441 | renderPass: renderPass, 442 | framebuffer: framebuffer, 443 | renderArea: { 444 | offset: [0, 0], 445 | extent: [swapchain.width, swapchain.height], 446 | }, 447 | clearValues: [ 448 | [0.1, 0.2, 0.3, 1.0] 449 | ], 450 | }); 451 | commandBuffer.cmdBindPipeline(ObPipelineBindPoint.GRAPHICS, pipeline); 452 | commandBuffer.cmdDraw(3, 1, 0, 0); 453 | commandBuffer.cmdEndRenderPass(); 454 | ``` 455 | 456 | ### Vertex data setup 457 | 458 | ```js 459 | var hostMemoryType = deviceProto.getMemoryTypes().find(function(type) { 460 | return type.memoryFlags & ObMemoryFlags.HOST_VISIBLE_BIT; 461 | }); 462 | var deviceMemoryType = deviceProto.getMemoryTypes().find(function(type) { 463 | return type.memoryFlags & ObMemoryFlags.DEVICE_LOCAL_BIT; 464 | }); 465 | 466 | // allocate and fill the staging buffer 467 | var bufferSize = 4 * 4 * (4 + 4); 468 | var stagingBuffer = device.createBuffer({ 469 | size: bufferSize, 470 | usage: ObBufferUsage.TRANSFER_SRC_BIT, 471 | }); 472 | var stagingMemory = device.allocateMemory(hostMemoryType, stagingBuffer.getMemoryRequirements()); 473 | device.mapBufferMemory(stagingBuffer, stagingMemory, 0); 474 | device.uploadBuffer(stagingBuffer, 0, new Float32Array([ 475 | // X Y Z W R G B A 476 | -0.7, -0.7, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 477 | -0.7, 0.7, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 478 | 0.7, -0.7, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 479 | 0.7, 0.7, 0.0, 1.0, 0.3, 0.3, 0.3, 0.0, 480 | ])); 481 | 482 | var vertexBuffer = device.createBuffer({ 483 | size: bufferSize, 484 | usage: ObBufferUsage.TRANSFER_DST_BIT | ObBufferUsage.VERTEX_BUFFER_BIT, 485 | }); 486 | var deviceBufferMemory = device.allocateMemory(deviceMemoryType, 
vertexBuffer.getMemoryRequirements()); 487 | device.mapBufferMemory(vertexBuffer, deviceBufferMemory, 0); 488 | 489 | // the PSO initialization part 490 | var vertexBinding = 0; 491 | var vertexInputState = { 492 | vertexBindingDescriptions: { 493 | vertexBinding => { 494 | stride: (4 + 4) * 4, 495 | inputRate: ObVertexInputRate.VERTEX, 496 | } 497 | }, 498 | vertexAttrbuteDescriptions: { 499 | 0 => { 500 | binding: vertexBinding, 501 | format: ObFormat.R32G32B32A32_SFLOAT, 502 | offset: 0, 503 | }, 504 | 1 => { 505 | binding: vertexBinding, 506 | format: ObFormat.R32G32B32A32_SFLOAT, 507 | offset: 4 * 4, 508 | }, 509 | }, 510 | }; 511 | // one-time initialization, assuming the `commandBuffer.begin()` was called 512 | commandBuffer.cmdCopyBuffer(stagingBuffer, vertexBuffer, [ 513 | { 514 | srcOffset: 0, 515 | dstOffset: 0, 516 | size: bufferSize, 517 | } 518 | ]); 519 | ``` 520 | 521 | ## Acknowledgements 522 | 523 | Mozillians: 524 | - Glenn Watson 525 | - Jeff Muizelaar 526 | - Nicolas Silva 527 | - Olli Pettay 528 | 529 | External reviewers: 530 | - Baldur Karlsson 531 | - Markus Siglreithmaier 532 | - Martin Krastev 533 | - Pierre Krieger 534 | - Wolfgang Engel 535 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WebGLNext-Proposals 2 | Proposals for the design of the WebGL Next API. 3 | --------------------------------------------------------------------------------