├── .gitignore ├── Core.meta ├── Core ├── AlexMalyutin.CMAA2.asmdef ├── AlexMalyutin.CMAA2.asmdef.meta ├── AtomicTextureHandle.cs ├── AtomicTextureHandle.cs.meta ├── CMAA2.Compute.cs ├── CMAA2.Compute.cs.meta ├── CMAA2.compute ├── CMAA2.compute.meta ├── CMAA2.hlsl ├── CMAA2.hlsl.meta ├── CMAA2RenderFeature.cs ├── CMAA2RenderFeature.cs.meta ├── CMAA2RenderPass.cs └── CMAA2RenderPass.cs.meta ├── LICENSE ├── LICENSE.meta ├── README.md ├── README.md.meta ├── THIRD_PARTY_LICENSES.meta ├── THIRD_PARTY_LICENSES ├── CMAA2-LICENSE └── CMAA2-LICENSE.meta ├── package.json └── package.json.meta /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /Core.meta: -------------------------------------------------------------------------------- 1 | fileFormatVersion: 2 2 | guid: 6c085ef84026a41fbbe4d7a371d986bd 3 | folderAsset: yes 4 | DefaultImporter: 5 | externalObjects: {} 6 | userData: 7 | assetBundleName: 8 | assetBundleVariant: 9 | -------------------------------------------------------------------------------- /Core/AlexMalyutin.CMAA2.asmdef: -------------------------------------------------------------------------------- 1 | { 2 | "name": "AlexMalyutin.CMAA2", 3 | "rootNamespace": "CMAA2", 4 | "references": [ 5 | "Unity.RenderPipelines.Core.Runtime", 6 | "Unity.RenderPipelines.Universal.Runtime" 7 | ], 8 | "includePlatforms": [], 9 | "excludePlatforms": [], 10 | "allowUnsafeCode": false, 11 | "overrideReferences": false, 12 | "precompiledReferences": [], 13 | "autoReferenced": true, 14 | "defineConstraints": [], 15 | "versionDefines": [], 16 | "noEngineReferences": false 17 | } -------------------------------------------------------------------------------- /Core/AlexMalyutin.CMAA2.asmdef.meta: -------------------------------------------------------------------------------- 1 | fileFormatVersion: 2 2 | guid: 
d7288b684d76f4d00bd8f2e0236e5496 3 | AssemblyDefinitionImporter: 4 | externalObjects: {} 5 | userData: 6 | assetBundleName: 7 | assetBundleVariant: 8 |
--------------------------------------------------------------------------------
/Core/AtomicTextureHandle.cs:
--------------------------------------------------------------------------------
// DONT_USE_TEXTURE_ATOMICS is defined unconditionally, so TEXTURE_ATOMICS_NOT_SUPPORTED is
// always set and the structured-buffer path below is the one that gets compiled.
// Remove the first define to compile the texture path on platforms other than standalone macOS.
#define DONT_USE_TEXTURE_ATOMICS
#if PLATFORM_STANDALONE_OSX || DONT_USE_TEXTURE_ATOMICS
#define TEXTURE_ATOMICS_NOT_SUPPORTED
#endif

using UnityEngine;
using UnityEngine.Experimental.Rendering;
using UnityEngine.Rendering;
using UnityEngine.Rendering.RenderGraphModule;

namespace CMAA2.Core
{
    /// <summary>
    /// Render-graph handle to a 2D grid of <c>uint</c> values that a compute shader updates atomically.
    /// When <c>TEXTURE_ATOMICS_NOT_SUPPORTED</c> is defined the grid is a structured buffer of
    /// <c>Width * Height</c> elements; otherwise it is a random-write texture.
    /// </summary>
    public struct AtomicTextureHandle
    {
        /// <summary>Width and height in the x and y components; z and w are left at zero.</summary>
        public Vector4 Size => new Vector4(Width, Height);

        /// <summary>Number of columns in the grid.</summary>
        public int Width;

        /// <summary>Number of rows in the grid.</summary>
        public int Height;

#if TEXTURE_ATOMICS_NOT_SUPPORTED
        /// <summary>Structured buffer that stores the grid row by row.</summary>
        public BufferHandle Handle;
#else
        /// <summary>Random-write texture that stores the grid.</summary>
        public TextureHandle Handle;
#endif

        /// <summary>
        /// Creates the grid as a transient render-graph resource and declares read/write access to it.
        /// </summary>
        /// <param name="builder">Builder of the pass that owns the transient resource.</param>
        /// <param name="width">Number of columns.</param>
        /// <param name="height">Number of rows.</param>
        /// <returns>A handle carrying the resource together with its dimensions.</returns>
        public static AtomicTextureHandle CreateTransientUint(IBaseRenderGraphBuilder builder, int width, int height)
        {
            var result = new AtomicTextureHandle
            {
                Width = width,
                Height = height,
            };

#if TEXTURE_ATOMICS_NOT_SUPPORTED
            // One uint per grid cell.
            var bufferDesc = new BufferDesc(width * height, sizeof(uint), GraphicsBuffer.Target.Structured);
            result.Handle = builder.CreateTransientBuffer(bufferDesc);
            builder.UseBuffer(result.Handle, AccessFlags.ReadWrite);
#else
            var textureDesc = new TextureDesc(width, height)
            {
                // The shader exchanges full 32-bit values into this texture (an item index plus header
                // bits 26..31), so each texel has to be 32 bits wide; an 8-bit format would truncate them.
                format = GraphicsFormat.R32_UInt,
                enableRandomWrite = true,
            };
            result.Handle = builder.CreateTransientTexture(textureDesc);
            builder.UseTexture(result.Handle, AccessFlags.ReadWrite);
#endif
            return result;
        }

        /// <summary>
        /// Binds the underlying buffer or texture to <paramref name="name"/> on the given kernel.
        /// </summary>
        /// <param name="cmd">Command buffer that records the binding.</param>
        /// <param name="compute">Compute shader that declares the resource.</param>
        /// <param name="kernelIndex">Index of the kernel to bind the resource to.</param>
        /// <param name="name">Shader-side name of the resource.</param>
        public void Bind(IComputeCommandBuffer cmd, ComputeShader compute, int kernelIndex, string name)
        {
#if TEXTURE_ATOMICS_NOT_SUPPORTED
            cmd.SetComputeBufferParam(compute, kernelIndex, name, Handle);
#else
            cmd.SetComputeTextureParam(compute, kernelIndex, name, Handle);
#endif
        }
    }
}
--------------------------------------------------------------------------------
/Core/AtomicTextureHandle.cs.meta:
--------------------------------------------------------------------------------
1 | fileFormatVersion: 2 2 | guid: afadf6a2eb36480a9f9c5a7c1e2c99fd 3 | timeCreated: 1748460785
--------------------------------------------------------------------------------
/Core/CMAA2.Compute.cs:
--------------------------------------------------------------------------------
using UnityEngine;
using UnityEngine.Rendering;
using UnityEngine.Rendering.RenderGraphModule;

namespace CMAA2.Core
{
    /// <summary>
    /// Wrapper around the CMAA2 compute shader. Kernel indices are resolved once in the constructor;
    /// each public method binds the resources of one kernel and records its dispatch, wrapped in a
    /// profiler sample named after the kernel.
    /// </summary>
    public class CMAA2Compute
    {
        // Shader-side names. Kept in one place so that every kernel binds the same spelling.
        private const string ColorReadonlyName = "g_inoutColorReadonly";
        private const string ColorWriteonlyName = "g_inoutColorWriteonly";
        private const string WorkingEdgesName = "g_workingEdges";
        private const string ShapeCandidatesName = "g_workingShapeCandidates";
        private const string ShapeCandidatesDimName = "g_workingShapeCandidates_Dim";
        private const string BlendLocationListName = "g_workingDeferredBlendLocationList";
        private const string BlendLocationListDimName = "g_workingDeferredBlendLocationList_Dim";
        private const string BlendItemListName = "g_workingDeferredBlendItemList";
        private const string BlendItemListHeadsName = "g_workingDeferredBlendItemListHeads";
        private const string BlendItemListHeadsWidthName = "g_workingDeferredBlendItemListHeads_Width";
        private const string ControlBufferName = "g_workingControlBuffer";
        private const string ExecuteIndirectBufferName = "g_workingExecuteIndirectBuffer";

        private readonly ComputeShader _compute;
        private readonly int _edgesColor2x2CS;
        private readonly int _computeDispatchArgsCS;
        private readonly int _processCandidatesCS;
        private readonly int _deferredColorApply2x2CS;

        // Thread-group size declared by the EdgesColor2x2CS kernel, queried once.
        private readonly ThreadGroupSizes _edgesColor2x2ThreadGroupSize;

        /// <summary>
        /// Resolves the four CMAA2 kernels in <paramref name="compute"/>.
        /// </summary>
        /// <param name="compute">Compute shader that contains the CMAA2 kernels.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="compute"/> is null.</exception>
        public CMAA2Compute(ComputeShader compute)
        {
            if (compute == null)
            {
                throw new System.ArgumentNullException(nameof(compute));
            }

            _compute = compute;

            _edgesColor2x2CS = _compute.FindKernel("EdgesColor2x2CS");
            _compute.GetKernelThreadGroupSizes(_edgesColor2x2CS, out var sizeX, out var sizeY, out var sizeZ);
            _edgesColor2x2ThreadGroupSize = new ThreadGroupSizes(sizeX, sizeY, sizeZ);

            _computeDispatchArgsCS = _compute.FindKernel("ComputeDispatchArgsCS");
            _processCandidatesCS = _compute.FindKernel("ProcessCandidatesCS");
            _deferredColorApply2x2CS = _compute.FindKernel("DeferredColorApply2x2CS");
        }

        /// <summary>
        /// Records the <c>EdgesColor2x2CS</c> kernel with a thread-group count derived from
        /// <paramref name="textureResolution"/>.
        /// </summary>
        /// <param name="cmd">Command buffer to record into.</param>
        /// <param name="inColorTexture">Bound as <c>g_inoutColorReadonly</c>.</param>
        /// <param name="textureResolution">Resolution used to compute the number of thread groups.</param>
        /// <param name="workingEdges">Bound as <c>g_workingEdges</c>.</param>
        /// <param name="workingShapeCandidates">Bound as <c>g_workingShapeCandidates</c> (plus its <c>_Dim</c> vector).</param>
        /// <param name="workingDeferredBlendItemListHeads">Bound as <c>g_workingDeferredBlendItemListHeads</c> (plus its width).</param>
        /// <param name="workingControlBuffer">Bound as <c>g_workingControlBuffer</c>.</param>
        public void EdgesColor2x2CS(
            IComputeCommandBuffer cmd,
            TextureHandle inColorTexture,
            Vector2Int textureResolution,
            TextureHandle workingEdges,
            SizedBufferHandle workingShapeCandidates,
            AtomicTextureHandle workingDeferredBlendItemListHeads,
            BufferHandle workingControlBuffer
        )
        {
            const string sampleName = nameof(EdgesColor2x2CS);
            int kernelId = _edgesColor2x2CS;

            cmd.BeginSample(sampleName);

            Bind(cmd, kernelId, ColorReadonlyName, inColorTexture);
            Bind(cmd, kernelId, WorkingEdgesName, workingEdges);
            BindSized(cmd, kernelId, ShapeCandidatesDimName, ShapeCandidatesName, workingShapeCandidates);
            Bind(cmd, kernelId, ControlBufferName, workingControlBuffer);
            BindListHeads(cmd, kernelId, workingDeferredBlendItemListHeads);

            // The output tile is the kernel's thread-group size minus two in each dimension, and one
            // group advances by twice that many pixels.
            // NOTE(review): presumably a one-thread border plus a 2x2 pixel quad per thread - confirm
            // against the kernel.
            int outputTileX = (int)(_edgesColor2x2ThreadGroupSize.X - 2);
            int outputTileY = (int)(_edgesColor2x2ThreadGroupSize.Y - 2);
            int groupCountX = DivideRoundingUp(textureResolution.x, outputTileX * 2);
            int groupCountY = DivideRoundingUp(textureResolution.y, outputTileY * 2);
            cmd.DispatchCompute(_compute, kernelId, groupCountX, groupCountY, 1);

            cmd.EndSample(sampleName);
        }

        /// <summary>
        /// Records the <c>ComputeDispatchArgsCS</c> kernel with an explicit thread-group count.
        /// </summary>
        /// <param name="cmd">Command buffer to record into.</param>
        /// <param name="threadGroupsX">Thread groups to dispatch along X.</param>
        /// <param name="threadGroupsY">Thread groups to dispatch along Y.</param>
        /// <param name="workingControlBuffer">Bound as <c>g_workingControlBuffer</c>.</param>
        /// <param name="workingDeferredBlendLocationList">Bound as <c>g_workingDeferredBlendLocationList</c> (plus its <c>_Dim</c> vector).</param>
        /// <param name="workingShapeCandidates">Bound as <c>g_workingShapeCandidates</c> (plus its <c>_Dim</c> vector).</param>
        /// <param name="workingExecuteIndirectBuffer">Bound as <c>g_workingExecuteIndirectBuffer</c>.</param>
        public void ComputeDispatchArgsCS(
            IComputeCommandBuffer cmd,
            int threadGroupsX,
            int threadGroupsY,
            BufferHandle workingControlBuffer,
            SizedBufferHandle workingDeferredBlendLocationList,
            SizedBufferHandle workingShapeCandidates,
            BufferHandle workingExecuteIndirectBuffer
        )
        {
            const string sampleName = nameof(ComputeDispatchArgsCS);
            int kernelId = _computeDispatchArgsCS;

            cmd.BeginSample(sampleName);

            Bind(cmd, kernelId, ControlBufferName, workingControlBuffer);

            // TODO: Remove passing unnecessary vectors!
            BindSized(cmd, kernelId, BlendLocationListDimName, BlendLocationListName, workingDeferredBlendLocationList);
            BindSized(cmd, kernelId, ShapeCandidatesDimName, ShapeCandidatesName, workingShapeCandidates);

            Bind(cmd, kernelId, ExecuteIndirectBufferName, workingExecuteIndirectBuffer);

            cmd.DispatchCompute(_compute, kernelId, threadGroupsX, threadGroupsY, 1);

            cmd.EndSample(sampleName);
        }

        /// <summary>
        /// Records the <c>ProcessCandidatesCS</c> kernel as an indirect dispatch.
        /// </summary>
        /// <param name="cmd">Command buffer to record into.</param>
        /// <param name="workingExecuteDirectBuffer">Buffer holding the indirect dispatch arguments, read at offset 0.</param>
        /// <param name="inColor">Bound as <c>g_inoutColorReadonly</c>.</param>
        /// <param name="workingEdges">Bound as <c>g_workingEdges</c>.</param>
        /// <param name="workingDeferredBlendItemListHeads">Bound as <c>g_workingDeferredBlendItemListHeads</c> (plus its width).</param>
        /// <param name="workingControlBuffer">Bound as <c>g_workingControlBuffer</c>.</param>
        /// <param name="workingDeferredBlendItemList">Bound as <c>g_workingDeferredBlendItemList</c>.</param>
        /// <param name="workingShapeCandidates">Bound as <c>g_workingShapeCandidates</c> (plus its <c>_Dim</c> vector).</param>
        /// <param name="workingDeferredBlendLocationList">Bound as <c>g_workingDeferredBlendLocationList</c> (plus its <c>_Dim</c> vector).</param>
        public void ProcessCandidatesCS(
            IComputeCommandBuffer cmd,
            BufferHandle workingExecuteDirectBuffer,
            TextureHandle inColor,
            TextureHandle workingEdges,
            AtomicTextureHandle workingDeferredBlendItemListHeads,
            BufferHandle workingControlBuffer,
            BufferHandle workingDeferredBlendItemList,
            SizedBufferHandle workingShapeCandidates,
            SizedBufferHandle workingDeferredBlendLocationList
        )
        {
            const string sampleName = nameof(ProcessCandidatesCS);
            int kernelId = _processCandidatesCS;

            cmd.BeginSample(sampleName);

            Bind(cmd, kernelId, ColorReadonlyName, inColor);
            Bind(cmd, kernelId, WorkingEdgesName, workingEdges);
            BindListHeads(cmd, kernelId, workingDeferredBlendItemListHeads);
            Bind(cmd, kernelId, ControlBufferName, workingControlBuffer);
            Bind(cmd, kernelId, BlendItemListName, workingDeferredBlendItemList);
            BindSized(cmd, kernelId, BlendLocationListDimName, BlendLocationListName, workingDeferredBlendLocationList);
            BindSized(cmd, kernelId, ShapeCandidatesDimName, ShapeCandidatesName, workingShapeCandidates);

            // Group counts come from the argument buffer rather than from the CPU.
            cmd.DispatchCompute(_compute, kernelId, workingExecuteDirectBuffer, 0);

            cmd.EndSample(sampleName);
        }

        /// <summary>
        /// Records the <c>DeferredColorApply2x2CS</c> kernel as an indirect dispatch.
        /// </summary>
        /// <param name="cmd">Command buffer to record into.</param>
        /// <param name="workingExecuteIndirectBuffer">Buffer holding the indirect dispatch arguments, read at offset 0.</param>
        /// <param name="outColor">Bound as <c>g_inoutColorWriteonly</c>.</param>
        /// <param name="workingControlBuffer">Bound as <c>g_workingControlBuffer</c>.</param>
        /// <param name="workingDeferredBlendItemList">Bound as <c>g_workingDeferredBlendItemList</c>.</param>
        /// <param name="workingDeferredBlendItemListHeads">Bound as <c>g_workingDeferredBlendItemListHeads</c> (plus its width).</param>
        /// <param name="workingDeferredBlendLocationList">Bound as <c>g_workingDeferredBlendLocationList</c> (plus its <c>_Dim</c> vector).</param>
        public void DeferredColorApply2x2CS(
            IComputeCommandBuffer cmd,
            BufferHandle workingExecuteIndirectBuffer,
            TextureHandle outColor,
            BufferHandle workingControlBuffer,
            BufferHandle workingDeferredBlendItemList,
            AtomicTextureHandle workingDeferredBlendItemListHeads,
            SizedBufferHandle workingDeferredBlendLocationList
        )
        {
            const string sampleName = nameof(DeferredColorApply2x2CS);
            int kernelId = _deferredColorApply2x2CS;

            cmd.BeginSample(sampleName);

            Bind(cmd, kernelId, ColorWriteonlyName, outColor);
            Bind(cmd, kernelId, ControlBufferName, workingControlBuffer);
            Bind(cmd, kernelId, BlendItemListName, workingDeferredBlendItemList);
            BindListHeads(cmd, kernelId, workingDeferredBlendItemListHeads);
            BindSized(cmd, kernelId, BlendLocationListDimName, BlendLocationListName, workingDeferredBlendLocationList);

            cmd.DispatchCompute(_compute, kernelId, workingExecuteIndirectBuffer, 0);

            cmd.EndSample(sampleName);
        }

        /// <summary>Integer division of <paramref name="value"/> by <paramref name="divisor"/>, rounded up.</summary>
        private static int DivideRoundingUp(int value, int divisor)
        {
            return (value + divisor - 1) / divisor;
        }

        /// <summary>Binds a sized buffer: first its dimensions vector, then the buffer itself.</summary>
        private void BindSized(IComputeCommandBuffer cmd, int kernelId, string dimensionsName, string bufferName, SizedBufferHandle sizedBuffer)
        {
            Bind(cmd, dimensionsName, sizedBuffer.Dimensions);
            Bind(cmd, kernelId, bufferName, sizedBuffer.Buffer);
        }

        /// <summary>
        /// Binds the list-head grid together with its width. The shader declares the width only when it
        /// stores the grid in a structured buffer and has to compute a flat index.
        /// </summary>
        private void BindListHeads(IComputeCommandBuffer cmd, int kernelId, AtomicTextureHandle listHeads)
        {
            Bind(cmd, BlendItemListHeadsWidthName, listHeads.Width);
            listHeads.Bind(cmd, _compute, kernelId, BlendItemListHeadsName);
        }

        /// <summary>Sets an integer shader parameter.</summary>
        private void Bind(IComputeCommandBuffer cmd, string name, int value)
        {
            cmd.SetComputeIntParam(_compute, name, value);
        }

        /// <summary>Sets a vector shader parameter.</summary>
        private void Bind(IComputeCommandBuffer cmd, string name, Vector4 vector)
        {
            cmd.SetComputeVectorParam(_compute, name, vector);
        }

        /// <summary>Binds a texture to the given kernel.</summary>
        private void Bind(IComputeCommandBuffer cmd, int kernelId, string name, TextureHandle textureHandle)
        {
            cmd.SetComputeTextureParam(_compute, kernelId, name, textureHandle);
        }

        /// <summary>Binds a buffer to the given kernel.</summary>
        private void Bind(IComputeCommandBuffer cmd, int kernelId, string name, BufferHandle bufferHandle)
        {
            cmd.SetComputeBufferParam(_compute, kernelId, name, bufferHandle);
        }
    }

    /// <summary>Thread-group dimensions of a compute kernel.</summary>
    struct ThreadGroupSizes
    {
        public uint X, Y, Z;

        public ThreadGroupSizes(uint x, uint y, uint z)
        {
            X = x;
            Y = y;
            Z = z;
        }
    }
}
--------------------------------------------------------------------------------
/Core/CMAA2.Compute.cs.meta:
--------------------------------------------------------------------------------
1 | fileFormatVersion: 2 2 | guid: f389d29ec0554aa7880ce3989dbc3f66 3 | timeCreated: 1748371512
--------------------------------------------------------------------------------
/Core/CMAA2.compute:
--------------------------------------------------------------------------------
1 |
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2 | // Copyright 2025, Alex Malyutin 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // 16 | // This file is a Unity-compatible port of Intel’s CMAA2 (Conservative Morphological Anti-Aliasing) 17 | // originally developed and distributed by Intel Corporation under the Apache 2.0 license. 18 | /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 19 | 20 | #if defined(SHADER_API_METAL) || defined(SHADER_API_GLES) 21 | #define PLATFORM_NO_TEXTURE_LOAD_OFFSET 22 | #define PLATFORM_NO_TEXTURE_ATOMICS 23 | #endif 24 | 25 | #if SHADER_API_D3D11 || SHADER_API_D3D11_9X 26 | #define PLATFORM_NO_TEXTURE_ATOMICS 27 | #endif 28 | 29 | #define CMAA2_UAV_STORE_TYPED 1 // use typed UAV store 30 | #define CMAA2_UAV_STORE_CONVERT_TO_SRGB 0 // no need to convert to SRGB - R11G11B10_FLOAT does not use SRGB encoding 31 | #define CMAA2_UAV_STORE_TYPED_UNORM_FLOAT 0 // not required for non-float semantics correctness (RWTexture2D) 32 | 33 | #include "CMAA2.hlsl" 34 | 35 | #pragma kernel EdgesColor2x2CS 36 | #pragma kernel ComputeDispatchArgsCS 37 | #pragma kernel ProcessCandidatesCS 38 | #pragma kernel DeferredColorApply2x2CS 39 | -------------------------------------------------------------------------------- /Core/CMAA2.compute.meta: 
-------------------------------------------------------------------------------- 1 | fileFormatVersion: 2 2 | guid: bdca705f137f466380668861f6afe032 3 | timeCreated: 1748370540 -------------------------------------------------------------------------------- /Core/CMAA2.hlsl: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2018, Intel Corporation 3 | // 4 | // Licensed under the Apache License, Version 2.0 ( the "License" ); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 16 | // 17 | // Conservative Morphological Anti-Aliasing, version: 2.3 18 | // 19 | // Author(s): Filip Strugar (filip.strugar@intel.com) 20 | // 21 | // More info: https://github.com/GameTechDev/CMAA2 22 | // 23 | // Please see https://github.com/GameTechDev/CMAA2/README.md for additional information and a basic integration guide. 
24 | // 25 | /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 26 | 27 | #ifndef __CMAA2_HLSL__ 28 | #define __CMAA2_HLSL__ 29 | 30 | // this line is VA framework specific (ignore/remove when using outside of VA) 31 | #ifdef VA_COMPILED_AS_SHADER_CODE 32 | #include "MagicMacrosMagicFile.h" 33 | #endif 34 | 35 | // Constants that C++/API side needs to know! 36 | #define CMAA_PACK_SINGLE_SAMPLE_EDGE_TO_HALF_WIDTH 1 // adds more ALU but reduces memory use for edges by half by packing two 4 bit edge info into one R8_UINT texel - helps on all HW except at really low res 37 | #define CMAA2_CS_INPUT_KERNEL_SIZE_X 16 38 | #define CMAA2_CS_INPUT_KERNEL_SIZE_Y 16 39 | 40 | // The rest below is shader only code 41 | #ifndef __cplusplus 42 | 43 | // If the color buffer range is bigger than [0, 1] then use this, otherwise don't (and gain some precision - see https://bartwronski.com/2017/04/02/small-float-formats-r11g11b10f-precision/) 44 | #ifndef CMAA2_SUPPORT_HDR_COLOR_RANGE 45 | #define CMAA2_SUPPORT_HDR_COLOR_RANGE 0 46 | #endif 47 | 48 | // 0 is full color-based edge detection, 1 and 2 are identical log luma based, with the difference being that 1 loads color and computes log luma in-place (less efficient) while 2 loads precomputed log luma from a separate R8_UNORM texture (more efficient). 49 | // Luma-based edge detection has a slightly lower quality but better performance so use it as a default; providing luma as a separate texture (or .a channel of the main one) will improve performance. 50 | // See RGBToLumaForEdges for luma conversions in non-HDR and HDR versions.
51 | #ifndef CMAA2_EDGE_DETECTION_LUMA_PATH 52 | #define CMAA2_EDGE_DETECTION_LUMA_PATH 1 53 | #endif 54 | 55 | // for CMAA2+MSAA support 56 | #ifndef CMAA_MSAA_SAMPLE_COUNT 57 | #define CMAA_MSAA_SAMPLE_COUNT 1 58 | #endif 59 | 60 | #define CMAA2_CS_OUTPUT_KERNEL_SIZE_X (CMAA2_CS_INPUT_KERNEL_SIZE_X-2) 61 | #define CMAA2_CS_OUTPUT_KERNEL_SIZE_Y (CMAA2_CS_INPUT_KERNEL_SIZE_Y-2) 62 | #define CMAA2_PROCESS_CANDIDATES_NUM_THREADS 128 63 | #define CMAA2_DEFERRED_APPLY_NUM_THREADS 32 64 | 65 | // Optimization paths 66 | #define CMAA2_DEFERRED_APPLY_THREADGROUP_SWAP 1 // 1 seems to be better or same on all HW 67 | #define CMAA2_COLLECT_EXPAND_BLEND_ITEMS 1 // this reschedules final part of work in the ProcessCandidatesCS (where the sampling and blending takes place) from few to all threads to increase hardware thread occupancy 68 | #ifndef CMAA2_USE_HALF_FLOAT_PRECISION 69 | #define CMAA2_USE_HALF_FLOAT_PRECISION 0 // use half precision by default? (not on by default due to driver issues on various different hardware, but let external code decide to define if needed) 70 | #endif 71 | 72 | #ifndef CMAA2_UAV_STORE_TYPED 73 | #error Warning - make sure correct value is set according to D3D11_FORMAT_SUPPORT_TYPED_UNORDERED_ACCESS_VIEW & D3D11_FORMAT_SUPPORT2_UAV_TYPED_STORE caps for the color UAV format used in g_inoutColorWriteonly 74 | #define CMAA2_UAV_STORE_TYPED 1 // use defaults that match the most common scenario: DXGI_FORMAT_R8G8B8A8_UNORM as UAV on a DXGI_FORMAT_R8G8B8A8_UNORM_SRGB resource (no typed stores for sRGB so we have to manually convert) 75 | #endif 76 | 77 | #ifndef CMAA2_UAV_STORE_CONVERT_TO_SRGB 78 | #error Warning - make sure correct value is set according to whether manual linear->sRGB color conversion is needed when writing color output to g_inoutColorWriteonly 79 | #define CMAA2_UAV_STORE_CONVERT_TO_SRGB 1 // use defaults that match the most common scenario: DXGI_FORMAT_R8G8B8A8_UNORM as UAV on a DXGI_FORMAT_R8G8B8A8_UNORM_SRGB resource (no typed 
stores for sRGB so we have to manually convert) 80 | #endif 81 | 82 | #ifndef CMAA2_UAV_STORE_TYPED_UNORM_FLOAT 83 | #error Warning - make sure correct value is set according to the color UAV format used in g_inoutColorWriteonly 84 | #define CMAA2_UAV_STORE_TYPED_UNORM_FLOAT 1 // for typed UAV stores: set to 1 for all _UNORM formats and to 0 for _FLOAT formats 85 | #endif 86 | 87 | #if CMAA2_UAV_STORE_TYPED 88 | #ifndef CMAA2_UAV_STORE_TYPED_UNORM_FLOAT 89 | #error When CMAA2_UAV_STORE_TYPED is set to 1, CMAA2_UAV_STORE_TYPED_UNORM_FLOAT must be set 1 if the color UAV is not a _FLOAT format or 0 if it is. 90 | #endif 91 | #else 92 | #ifndef CMAA2_UAV_STORE_UNTYPED_FORMAT 93 | #error Error - untyped format required (see FinalUAVStore function for the list) 94 | #endif 95 | #endif 96 | 97 | #if (CMAA2_USE_HALF_FLOAT_PRECISION != 0) 98 | #error this codepath needs testing - it's likely not valid anymore 99 | typedef min16float lpfloat; 100 | typedef min16float2 lpfloat2; 101 | typedef min16float3 lpfloat3; 102 | typedef min16float4 lpfloat4; 103 | #else 104 | typedef float lpfloat; 105 | typedef float2 lpfloat2; 106 | typedef float3 lpfloat3; 107 | typedef float4 lpfloat4; 108 | #endif 109 | 110 | /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 111 | // VARIOUS QUALITY SETTINGS 112 | // 113 | // Longest line search distance; must be even number; for high perf low quality start from ~32 - the bigger the number, 114 | // the nicer the gradients but more costly. Max supported is 128! 
115 | static const uint c_maxLineLength = 86; 116 | // 117 | #ifndef CMAA2_EXTRA_SHARPNESS 118 | #define CMAA2_EXTRA_SHARPNESS 0 // Set to 1 to preserve even more text and shape clarity at the expense of less AA 119 | #endif 120 | // 121 | // It makes sense to slightly drop edge detection thresholds with increase in MSAA sample count, as with the higher 122 | // MSAA level the overall impact of CMAA2 alone is reduced but the cost increases. 123 | #define CMAA2_SCALE_QUALITY_WITH_MSAA 0 124 | // 125 | // 126 | #ifndef CMAA2_STATIC_QUALITY_PRESET 127 | #define CMAA2_STATIC_QUALITY_PRESET 2 // 0 - LOW, 1 - MEDIUM, 2 - HIGH, 3 - ULTRA 128 | #endif 129 | // presets (for HDR color buffer maybe use higher values) 130 | #if CMAA2_STATIC_QUALITY_PRESET == 0 // LOW 131 | #define g_CMAA2_EdgeThreshold lpfloat(0.15) 132 | #elif CMAA2_STATIC_QUALITY_PRESET == 1 // MEDIUM 133 | #define g_CMAA2_EdgeThreshold lpfloat(0.10) 134 | #elif CMAA2_STATIC_QUALITY_PRESET == 2 // HIGH (default) 135 | #define g_CMAA2_EdgeThreshold lpfloat(0.07) 136 | #elif CMAA2_STATIC_QUALITY_PRESET == 3 // ULTRA 137 | #define g_CMAA2_EdgeThreshold lpfloat(0.05) 138 | #else 139 | #error CMAA2_STATIC_QUALITY_PRESET not set? 
#endif
//
#if CMAA2_EXTRA_SHARPNESS
#define g_CMAA2_LocalContrastAdaptationAmount   lpfloat(0.15)
#define g_CMAA2_SimpleShapeBlurinessAmount      lpfloat(0.07)
#else
#define g_CMAA2_LocalContrastAdaptationAmount   lpfloat(0.10)
#define g_CMAA2_SimpleShapeBlurinessAmount      lpfloat(0.10)
#endif
//
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////


#if CMAA_MSAA_SAMPLE_COUNT > 1
#define CMAA_MSAA_USE_COMPLEXITY_MASK 1
#endif

#if CMAA2_EDGE_DETECTION_LUMA_PATH == 2 || CMAA2_EDGE_DETECTION_LUMA_PATH == 3 || CMAA_MSAA_USE_COMPLEXITY_MASK
SamplerState                        g_gather_point_clamp_Sampler        : register( s0 );   // there's also a slightly less efficient codepath that avoids Gather for easier porting
#endif

// NOTE(review): the element types in angle brackets were missing from this copy of the file and
// have been restored from how each resource is read or written elsewhere in the file. The
// 'unorm' variants, the MSAA complexity mask and the luma texture are not exercised by visible
// code - confirm those against the upstream CMAA2.hlsl.

// Is the output UAV format R32_UINT for manual shader packing, or a supported UAV store format?
#if CMAA2_UAV_STORE_TYPED
    #if CMAA2_UAV_STORE_TYPED_UNORM_FLOAT
        RWTexture2D<unorm float4>   g_inoutColorWriteonly               : register( u0 );   // final output color
    #else
        RWTexture2D<lpfloat4>       g_inoutColorWriteonly               : register( u0 );   // final output color
    #endif
#else
    RWTexture2D<uint>               g_inoutColorWriteonly               : register( u0 );   // final output color (manually packed)
#endif

#if CMAA2_EDGE_UNORM
RWTexture2D<unorm float>            g_workingEdges                      : register( u1 );   // output edges (only used in the first pass)
#else
RWTexture2D<uint>                   g_workingEdges                      : register( u1 );   // output edges (only used in the first pass)
#endif

RWStructuredBuffer<uint>            g_workingShapeCandidates            : register( u2 );
RWStructuredBuffer<uint>            g_workingDeferredBlendLocationList  : register( u3 );   // one packed (x << 16) | y quad position per entry
RWStructuredBuffer<uint2>           g_workingDeferredBlendItemList      : register( u4 );   // .x = previous list head, .y = packed color
#if !defined(PLATFORM_NO_TEXTURE_ATOMICS)
RWTexture2D<uint>                   g_workingDeferredBlendItemListHeads : register( u5 );
#else // NOTE: Metal doesn't support texture atomics! Using StructuredBuffer instead.
RWStructuredBuffer<uint>            g_workingDeferredBlendItemListHeads : register( u5 );
uint                                g_workingDeferredBlendItemListHeads_Width;              // Width to calc flat index
#endif
RWByteAddressBuffer                 g_workingControlBuffer              : register( u6 );
RWByteAddressBuffer                 g_workingExecuteIndirectBuffer      : register( u7 );

#if CMAA_MSAA_SAMPLE_COUNT > 1
Texture2DArray<lpfloat4>            g_inColorMSReadonly                 : register( t2 );   // input MS color
Texture2D<lpfloat>                  g_inColorMSComplexityMaskReadonly   : register( t1 );   // input MS color control surface (element type: TODO confirm)
#else
Texture2D<lpfloat4>                 g_inoutColorReadonly                : register( t0 );   // input color
#endif

#if CMAA2_EDGE_DETECTION_LUMA_PATH == 2
Texture2D<float>                    g_inLumaReadonly                    : register( t3 );   // element type: TODO confirm
#endif

// Buffer sizes are supplied through a "<name>_Dim" constant (x = count, y = stride) when
// PLATFORM_NO_TEXTURE_ATOMICS is defined; otherwise they are queried with GetDimensions().
#if defined(PLATFORM_NO_TEXTURE_ATOMICS)
    #define BUFFER_DIMENSION(buffer) float2 buffer##_Dim
    #define GET_BUFFER_DIMENSIONS(buffer, count, stride) \
        count = buffer##_Dim.x; \
        stride = buffer##_Dim.y;
#else
    #define BUFFER_DIMENSION(buffer)
    #define GET_BUFFER_DIMENSIONS(buffer, count, stride) buffer.GetDimensions(count, stride)
#endif

BUFFER_DIMENSION(g_workingShapeCandidates);
BUFFER_DIMENSION(g_workingDeferredBlendLocationList);

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// encoding/decoding of various data such as edges
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// how .rgba channels from the edge texture maps to pixel edges:
//
//                   A - 0x08               (A - there's an edge between us and a pixel above us)
//              |---------|                 (R - there's an edge between us and a pixel to the right)
//              |         |                 (G - there's an edge between us and a pixel at the bottom)
//     0x04 - B |  pixel  | R - 0x01        (B - there's an edge between us and a pixel
//                                                        to the left)
//              |         |
//              |_________|
//                   G - 0x02

// Folds four binary (0 or 1) edge flags into one 4-bit mask: x -> 0x01, y -> 0x02, z -> 0x04, w -> 0x08.
uint PackEdges( lpfloat4 edges )   // input edges are binary 0 or 1
{
    const lpfloat4 bitWeights = lpfloat4( 1, 2, 4, 8 );
    return (uint)dot( edges, bitWeights );
}

// Expands the low four bits of 'value' into one 0/1 integer flag per component.
uint4 UnpackEdges( uint value )
{
    return uint4( ( value.xxxx & uint4( 0x01, 0x02, 0x04, 0x08 ) ) != 0 );
}

// Same as UnpackEdges, but the flags come back as 0.0 / 1.0.
lpfloat4 UnpackEdgesFlt( uint value )
{
    return lpfloat4( ( value.xxxx & uint4( 0x01, 0x02, 0x04, 0x08 ) ) != 0 );
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// source color & color conversion helpers
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////


// Reads the RGB source color at pixelPos + offset. sampleIndex is only used by the MSAA path.
// On platforms flagged PLATFORM_NO_TEXTURE_LOAD_OFFSET the offset is added to the coordinate
// instead of being passed to Load().
lpfloat3 LoadSourceColor( uint2 pixelPos, int2 offset, int sampleIndex )
{
#if CMAA_MSAA_SAMPLE_COUNT > 1
    return g_inColorMSReadonly.Load( int4( pixelPos, sampleIndex, 0 ), offset ).rgb;
#elif !defined(PLATFORM_NO_TEXTURE_LOAD_OFFSET)
    return g_inoutColorReadonly.Load( int3( pixelPos, 0 ), offset ).rgb;
#else
    return g_inoutColorReadonly.Load( int3( pixelPos + offset, 0 ) ).rgb;
#endif
}
//
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// (R11G11B10 conversion code below taken from Miniengine's PixelPacking_R11G11B10.hlsli,
// Copyright (c) Microsoft, MIT license, Developed by Minigraph, Author: James Stanard; original file link:
// https://github.com/Microsoft/DirectX-Graphics-Samples/blob/master/MiniEngine/Core/Shaders/PixelPacking_R11G11B10.hlsli )
//
// The standard 32-bit HDR color format. Each float has a 5-bit exponent and no sign bit.
// Layout of the result: red in bits 0..10, green in bits 11..21, blue in bits 22..31.
uint Pack_R11G11B10_FLOAT( float3 rgb )
{
    // Clamp upper bound so that it doesn't accidentally round up to INF
    // Exponent=15, Mantissa=1.11111
    const uint3 halfBits = f32tof16( min( rgb, asfloat( 0x477C0000 ) ) );

    // The added constant is half of the lowest bit that survives the mask, i.e. round to nearest.
    const uint red   = ( ( halfBits.x + 8  ) >> 4  ) & 0x000007FF;
    const uint green = ( ( halfBits.y + 8  ) << 7  ) & 0x003FF800;
    const uint blue  = ( ( halfBits.z + 16 ) << 17 ) & 0xFFC00000;
    return red | green | blue;
}

// Inverse of Pack_R11G11B10_FLOAT: moves each field back to its half-float bit position and widens it.
float3 Unpack_R11G11B10_FLOAT( uint rgb )
{
    return float3( f16tof32( ( rgb << 4  ) & 0x7FF0 ),
                   f16tof32( ( rgb >> 7  ) & 0x7FF0 ),
                   f16tof32( ( rgb >> 17 ) & 0x7FE0 ) );
}
//
// These next two encodings are great for LDR data. By knowing that our values are [0.0, 1.0]
// (or [0.0, 2.0), incidentally), we can reduce how many bits we need in the exponent. We can
// immediately eliminate all positive exponents. By giving more bits to the mantissa, we can
// improve precision at the expense of range. The 8E3 format goes one bit further, quadrupling
// mantissa precision but increasing smallest exponent from -14 to -6. The smallest value of 8E3
// is 2^-14, while the smallest value of 7E4 is 2^-21. Both are smaller than the smallest 8-bit
// sRGB value, which is close to 2^-12.
//
// This is like R11G11B10_FLOAT except that it moves one bit from each exponent to each mantissa.
uint Pack_R11G11B10_E4_FLOAT( float3 rgb )
{
    // Clamp to [0.0, 2.0). The magic number is 1.FFFFF x 2^0. (We can't represent hex floats in HLSL.)
    // This trick works because clamping your exponent to 0 reduces the number of bits needed by 1.
    const uint3 halfBits = f32tof16( clamp( rgb, 0.0, asfloat( 0x3FFFFFFF ) ) );

    const uint red   = ( ( halfBits.r + 4 ) >> 3  ) & 0x000007FF;
    const uint green = ( ( halfBits.g + 4 ) << 8  ) & 0x003FF800;
    const uint blue  = ( ( halfBits.b + 8 ) << 18 ) & 0xFFC00000;
    return red | green | blue;
}
//
// Inverse of Pack_R11G11B10_E4_FLOAT.
float3 Unpack_R11G11B10_E4_FLOAT( uint rgb )
{
    return float3( f16tof32( ( rgb << 3  ) & 0x3FF8 ),
                   f16tof32( ( rgb >> 8  ) & 0x3FF8 ),
                   f16tof32( ( rgb >> 18 ) & 0x3FF0 ) );
}
//
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// This is for temporary storage - R11G11B10_E4 covers 8bit per channel sRGB well enough;
// For HDR range (CMAA2_SUPPORT_HDR_COLOR_RANGE) use standard float packing - not using it by default because it's not precise
// enough to match sRGB 8bit, but in a HDR scenario we simply need the range.
// For even more precision in LDR try E3 version and there are other options for HDR range (see above
// PixelPacking_R11G11B10.hlsli link for a number of excellent options).
// It's worth noting that since CMAA2 works on high contrast edges, the lack of precision will not be nearly as
// noticeable as it would be on gradients (which always remain unaffected).
// Decodes a color from the 32-bit temporary storage format selected by CMAA2_SUPPORT_HDR_COLOR_RANGE.
lpfloat3 InternalUnpackColor( uint packedColor )
{
#if CMAA2_SUPPORT_HDR_COLOR_RANGE
    // ideally using 32bit packing is best for performance reasons but there might be precision issues: look into
    //
    return Unpack_R11G11B10_FLOAT( packedColor );
#else
    return Unpack_R11G11B10_E4_FLOAT( packedColor );
#endif
}
//
// Encodes a color into the 32-bit temporary storage format selected by CMAA2_SUPPORT_HDR_COLOR_RANGE.
uint InternalPackColor( lpfloat3 color )
{
#if CMAA2_SUPPORT_HDR_COLOR_RANGE
    return Pack_R11G11B10_FLOAT( color );
#else
    return Pack_R11G11B10_E4_FLOAT( color );
#endif
}
//
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Appends one deferred blend item for 'pixelPos' and makes it the new head of the linked list that
// belongs to the pixel's 2x2 quad. The first item added for a quad also records the quad position
// in g_workingDeferredBlendLocationList.
void StoreColorSample( uint2 pixelPos, lpfloat3 color, bool isComplexShape, uint msaaSampleIndex )
{
    // Reserve a slot in the item list (counter lives at byte offset 4*12 of the control buffer).
    uint itemIndex;
    g_workingControlBuffer.InterlockedAdd( 4*12, 1, itemIndex );

    // quad coordinates
    const uint2 quadPos = pixelPos >> 1;
    // position of the pixel inside its 2x2 quad: bit 1 = odd row, bit 0 = odd column
    const uint inQuadOffset = ( ( pixelPos.y & 1 ) << 1 ) | ( pixelPos.x & 1 );

    // item-specific info: {2 bits for 2x2 quad location}, {3 bits for MSAA sample index},
    // {1 bit for isComplexShape flag}, {26 bits left for address (index)}
    const uint headerBits = ( inQuadOffset << 30 ) | ( msaaSampleIndex << 27 ) | ( isComplexShape << 26 );
    const uint newHead    = itemIndex | headerBits;

    // Swap ourselves in as the list head; the previous head becomes our 'next' link.
    uint previousHead;
#if !defined(PLATFORM_NO_TEXTURE_ATOMICS)
    InterlockedExchange( g_workingDeferredBlendItemListHeads[ quadPos ], newHead, previousHead );
#else
    const uint flatQuadIndex = quadPos.x + quadPos.y * g_workingDeferredBlendItemListHeads_Width;
    InterlockedExchange( g_workingDeferredBlendItemListHeads[ flatQuadIndex ], newHead, previousHead );
#endif
    g_workingDeferredBlendItemList[ itemIndex ] = uint2( previousHead, InternalPackColor( color ) );

    // Only the first item of a quad (previous head is the 0xFFFFFFFF sentinel) registers the quad.
    if( previousHead != 0xFFFFFFFF )
        return;

    // Make a list of all edge pixels - these cover all potential pixels where AA is applied.
    uint locationIndex;
    g_workingControlBuffer.InterlockedAdd( 4*8, 1, locationIndex );
    g_workingDeferredBlendLocationList[ locationIndex ] = ( quadPos.x << 16 ) | quadPos.y;
}
//
#if CMAA2_COLLECT_EXPAND_BLEND_ITEMS
#define CMAA2_BLEND_ITEM_SLM_SIZE           768         // there's a fallback for extreme cases (observed with this value set to 256 or below) in which case image will remain correct but performance will suffer
groupshared uint        g_groupSharedBlendItemCount;
groupshared uint2       g_groupSharedBlendItems[ CMAA2_BLEND_ITEM_SLM_SIZE ];
#endif
//
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Untyped UAV store packing & sRGB conversion helpers
//
// Linear -> sRGB transfer function for one channel: linear segment below 0.0031308, power curve above.
lpfloat LINEAR_to_SRGB( lpfloat val )
{
    return ( val < 0.0031308 )
        ? val * lpfloat( 12.92 )
        : lpfloat( 1.055 ) * pow( abs( val ), lpfloat( 1.0 ) / lpfloat( 2.4 ) ) - lpfloat( 0.055 );
}
// Per-channel version of the scalar overload.
lpfloat3 LINEAR_to_SRGB( lpfloat3 val )
{
    lpfloat3 encoded;
    encoded.x = LINEAR_to_SRGB( val.x );
    encoded.y = LINEAR_to_SRGB( val.y );
    encoded.z = LINEAR_to_SRGB( val.z );
    return encoded;
}
//
// Quantizes each channel to 8 bits (clamped, rounded to nearest) and packs them x|y|z|w from the low byte up.
uint FLOAT4_to_R8G8B8A8_UNORM( lpfloat4 unpackedInput )
{
    const uint4 quantized = uint4( saturate( unpackedInput ) * 255 + 0.5 );
    return quantized.x | ( quantized.y << 8 ) | ( quantized.z << 16 ) | ( quantized.w << 24 );
}
//
uint FLOAT4_to_R10G10B10A2_UNORM( lpfloat4 unpackedInput )
{
    return (( uint( saturate( unpackedInput.x ) * 1023 + 0.5 ) ) |
            ( uint( saturate( unpackedInput.y ) * 1023 + 0.5 ) << 10 ) |
            ( uint( saturate( unpackedInput.z ) * 1023 +
0.5 ) << 20 ) | 418 | ( uint( saturate( unpackedInput.w ) * 3 + 0.5 ) << 30 ) ); 419 | } 420 | // 421 | // This handles various permutations for various formats with no/partial/full typed UAV store support 422 | void FinalUAVStore( uint2 pixelPos, lpfloat3 color ) 423 | { 424 | #if CMAA2_UAV_STORE_CONVERT_TO_SRGB 425 | color = LINEAR_to_SRGB( color ) ; 426 | #endif 427 | 428 | #if CMAA2_UAV_STORE_TYPED 429 | g_inoutColorWriteonly[ pixelPos ] = lpfloat4( color.rgb, 0 ); 430 | #else 431 | #if CMAA2_UAV_STORE_UNTYPED_FORMAT == 1 // R8G8B8A8_UNORM (or R8G8B8A8_UNORM_SRGB with CMAA2_UAV_STORE_CONVERT_TO_SRGB) 432 | g_inoutColorWriteonly[ pixelPos ] = FLOAT4_to_R8G8B8A8_UNORM( lpfloat4( color, 0 ) ); 433 | #elif CMAA2_UAV_STORE_UNTYPED_FORMAT == 2 // R10G10B10A2_UNORM (or R10G10B10A2_UNORM_SRGB with CMAA2_UAV_STORE_CONVERT_TO_SRGB) 434 | g_inoutColorWriteonly[ pixelPos ] = FLOAT4_to_R10G10B10A2_UNORM( lpfloat4( color, 0 ) ); 435 | #else 436 | #error CMAA color packing format not defined - add it here! 
437 | #endif 438 | #endif 439 | } 440 | // 441 | /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 442 | 443 | 444 | /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 445 | // Edge detection and local contrast adaptation helpers 446 | /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 447 | // 448 | lpfloat GetActualEdgeThreshold( ) 449 | { 450 | lpfloat retVal = g_CMAA2_EdgeThreshold; 451 | #if CMAA2_SCALE_QUALITY_WITH_MSAA 452 | retVal *= 1.0 + (CMAA_MSAA_SAMPLE_COUNT-1) * 0.06; 453 | #endif 454 | return retVal; 455 | } 456 | // 457 | lpfloat EdgeDetectColorCalcDiff( lpfloat3 colorA, lpfloat3 colorB ) 458 | { 459 | const lpfloat3 LumWeights = lpfloat3( 0.299, 0.587, 0.114 ); 460 | lpfloat3 diff = abs( (colorA.rgb - colorB.rgb) ); 461 | return dot( diff.rgb, LumWeights.rgb ); 462 | } 463 | // 464 | // apply custom curve / processing to put input color (linear) in the format required by ComputeEdge 465 | lpfloat3 ProcessColorForEdgeDetect( lpfloat3 color ) 466 | { 467 | //pixelColors[i] = LINEAR_to_SRGB( pixelColors[i] ); // correct reference 468 | //pixelColors[i] = pow( max( 0, pixelColors[i], 1.0 / 2.4 ) ); // approximate sRGB curve 469 | return sqrt( color ); // just very roughly approximate RGB curve 470 | } 471 | // 472 | lpfloat2 ComputeEdge( int x, int y, lpfloat3 pixelColors[3 * 3 - 1] ) 473 | { 474 | lpfloat2 temp; 475 | temp.x = EdgeDetectColorCalcDiff( pixelColors[x + y * 3].rgb, pixelColors[x + 1 + y * 3].rgb ); 476 | temp.y = EdgeDetectColorCalcDiff( pixelColors[x + y * 3].rgb, pixelColors[x + ( y + 1 ) * 3].rgb ); 477 | return temp; // for HDR edge detection it might be good to premultiply both of these by some factor - otherwise clamping to 1 might prevent some local contrast adaptation. 
It's a very minor nitpick though, unlikely to significantly affect things. 478 | } 479 | // color -> log luma-for-edges conversion 480 | float RGBToLumaForEdges( float3 linearRGB ) 481 | { 482 | #if 0 483 | // this matches Miniengine luma path 484 | float Luma = dot( linearRGB, float3(0.212671, 0.715160, 0.072169) ); 485 | return log2(1 + Luma * 15) / 4; 486 | #else 487 | // this is what original FXAA (and consequently CMAA2) use by default - these coefficients correspond to Rec. 601 and those should be 488 | // used on gamma-compressed components (see https://en.wikipedia.org/wiki/Luma_(video)#Rec._601_luma_versus_Rec._709_luma_coefficients), 489 | float luma = dot( sqrt( linearRGB.rgb ), float3( 0.299, 0.587, 0.114 ) ); // http://en.wikipedia.org/wiki/CCIR_601 490 | // using sqrt luma for now but log luma like in miniengine provides a nicer curve on the low-end 491 | return luma; 492 | #endif 493 | } 494 | lpfloat2 ComputeEdgeLuma( int x, int y, lpfloat pixelLumas[3 * 3 - 1] ) 495 | { 496 | lpfloat2 temp; 497 | temp.x = abs( pixelLumas[x + y * 3] - pixelLumas[x + 1 + y * 3] ); 498 | temp.y = abs( pixelLumas[x + y * 3] - pixelLumas[x + ( y + 1 ) * 3] ); 499 | return temp; // for HDR edge detection it might be good to premultiply both of these by some factor - otherwise clamping to 1 might prevent some local contrast adaptation. It's a very minor nitpick though, unlikely to significantly affect things. 
500 | } 501 | // 502 | lpfloat ComputeLocalContrastV( int x, int y, in lpfloat2 neighbourhood[4][4] ) 503 | { 504 | // new, small kernel 4-connecting-edges-only local contrast adaptation 505 | return max( max( neighbourhood[x + 1][y + 0].y, neighbourhood[x + 1][y + 1].y ), max( neighbourhood[x + 2][y + 0].y, neighbourhood[x + 2][y + 1].y ) ) * lpfloat( g_CMAA2_LocalContrastAdaptationAmount ); 506 | 507 | // // slightly bigger kernel that enhances edges in-line (not worth the cost) 508 | // return ( max( max( neighbourhood[x + 1][y + 0].y, neighbourhood[x + 1][y + 1].y ), max( neighbourhood[x + 2][y + 0].y, neighbourhood[x + 2][y + 1].y ) ) 509 | // - ( neighbourhood[x + 1][y + 0].x + neighbourhood[x + 1][y + 2].x ) * 0.3 ) * lpfloat( g_CMAA2_LocalContrastAdaptationAmount ); 510 | } 511 | // 512 | lpfloat ComputeLocalContrastH( int x, int y, in lpfloat2 neighbourhood[4][4] ) 513 | { 514 | // new, small kernel 4-connecting-edges-only local contrast adaptation 515 | return max( max( neighbourhood[x + 0][y + 1].x, neighbourhood[x + 1][y + 1].x ), max( neighbourhood[x + 0][y + 2].x, neighbourhood[x + 1][y + 2].x ) ) * lpfloat( g_CMAA2_LocalContrastAdaptationAmount ); 516 | 517 | // // slightly bigger kernel that enhances edges in-line (not worth the cost) 518 | // return ( max( max( neighbourhood[x + 0][y + 1].x, neighbourhood[x + 1][y + 1].x ), max( neighbourhood[x + 0][y + 2].x, neighbourhood[x + 1][y + 2].x ) ) 519 | // - ( neighbourhood[x + 0][y + 1].y + neighbourhood[x + 2][y + 1].y ) * 0.3 ) * lpfloat( g_CMAA2_LocalContrastAdaptationAmount ); 520 | } 521 | /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 522 | 523 | lpfloat4 ComputeSimpleShapeBlendValues( lpfloat4 edges, lpfloat4 edgesLeft, lpfloat4 edgesRight, lpfloat4 edgesTop, lpfloat4 edgesBottom, uniform bool dontTestShapeValidity ) 524 | { 525 | // a 3x3 kernel for higher quality handling of L-based shapes (still rather basic and 
conservative) 526 | 527 | lpfloat fromRight = edges.r; 528 | lpfloat fromBelow = edges.g; 529 | lpfloat fromLeft = edges.b; 530 | lpfloat fromAbove = edges.a; 531 | 532 | lpfloat blurCoeff = lpfloat( g_CMAA2_SimpleShapeBlurinessAmount ); 533 | 534 | lpfloat numberOfEdges = dot( edges, lpfloat4( 1, 1, 1, 1 ) ); 535 | 536 | lpfloat numberOfEdgesAllAround = dot(edgesLeft.bga + edgesRight.rga + edgesTop.rba + edgesBottom.rgb, lpfloat3( 1, 1, 1 ) ); 537 | 538 | // skip if already tested for before calling this function 539 | if( !dontTestShapeValidity ) 540 | { 541 | // No blur for straight edge 542 | if( numberOfEdges == 1 ) 543 | blurCoeff = 0; 544 | 545 | // L-like step shape ( only blur if it's a corner, not if it's two parallel edges) 546 | if( numberOfEdges == 2 ) 547 | blurCoeff *= ( ( lpfloat(1.0) - fromBelow * fromAbove ) * ( lpfloat(1.0) - fromRight * fromLeft ) ); 548 | } 549 | 550 | // L-like step shape 551 | //[branch] 552 | if( numberOfEdges == 2 ) 553 | { 554 | blurCoeff *= 0.75; 555 | 556 | #if 1 557 | float k = 0.9f; 558 | #if 0 559 | fromRight += k * (edges.g * edgesTop.r + edges.a * edgesBottom.r ); 560 | fromBelow += k * (edges.r * edgesLeft.g + edges.b * edgesRight.g ); 561 | fromLeft += k * (edges.g * edgesTop.b + edges.a * edgesBottom.b ); 562 | fromAbove += k * (edges.b * edgesRight.a + edges.r * edgesLeft.a ); 563 | #else 564 | fromRight += k * (edges.g * edgesTop.r * (1.0-edgesLeft.g) + edges.a * edgesBottom.r * (1.0-edgesLeft.a) ); 565 | fromBelow += k * (edges.b * edgesRight.g * (1.0-edgesTop.b) + edges.r * edgesLeft.g * (1.0-edgesTop.r) ); 566 | fromLeft += k * (edges.a * edgesBottom.b * (1.0-edgesRight.a) + edges.g * edgesTop.b * (1.0-edgesRight.g) ); 567 | fromAbove += k * (edges.r * edgesLeft.a * (1.0-edgesBottom.r) + edges.b * edgesRight.a * (1.0-edgesBottom.b) ); 568 | #endif 569 | #endif 570 | } 571 | 572 | // if( numberOfEdges == 3 ) 573 | // blurCoeff *= 0.95; 574 | 575 | // Dampen the blurring effect when lots of neighbouring edges 
- additionally preserves text and texture detail 576 | #if CMAA2_EXTRA_SHARPNESS 577 | blurCoeff *= saturate( 1.15 - numberOfEdgesAllAround / 8.0 ); 578 | #else 579 | blurCoeff *= saturate( 1.30 - numberOfEdgesAllAround / 10.0 ); 580 | #endif 581 | 582 | return lpfloat4( fromLeft, fromAbove, fromRight, fromBelow ) * blurCoeff; 583 | } 584 | 585 | uint LoadEdge( int2 pixelPos, int2 offset, uint msaaSampleIndex ) 586 | { 587 | #if CMAA_MSAA_SAMPLE_COUNT > 1 588 | uint edge = g_workingEdges.Load( pixelPos + offset ).x; 589 | edge = (edge >> (msaaSampleIndex*4)) & 0xF; 590 | #else 591 | #if CMAA_PACK_SINGLE_SAMPLE_EDGE_TO_HALF_WIDTH 592 | uint a = uint(pixelPos.x+offset.x) % 2; 593 | 594 | #if CMAA2_EDGE_UNORM 595 | uint edge = (uint)(g_workingEdges.Load( uint2( uint(pixelPos.x+offset.x)/2, pixelPos.y + offset.y ) ).x * 255.0 + 0.5); 596 | #else 597 | uint edge = g_workingEdges.Load( uint2( uint(pixelPos.x+offset.x)/2, pixelPos.y + offset.y ) ).x; 598 | #endif 599 | edge = (edge >> (a*4)) & 0xF; 600 | #else 601 | uint edge = g_workingEdges.Load( pixelPos + offset ).x; 602 | #endif 603 | #endif 604 | return edge; 605 | } 606 | 607 | groupshared lpfloat4 g_groupShared2x2FracEdgesH[CMAA2_CS_INPUT_KERNEL_SIZE_X * CMAA2_CS_INPUT_KERNEL_SIZE_Y]; 608 | groupshared lpfloat4 g_groupShared2x2FracEdgesV[CMAA2_CS_INPUT_KERNEL_SIZE_X * CMAA2_CS_INPUT_KERNEL_SIZE_Y]; 609 | // void GroupsharedLoadQuadH( uint addr, out lpfloat e00, out lpfloat e10, out lpfloat e01, out lpfloat e11 ) { lpfloat4 val = g_groupShared2x2FracEdgesH[addr]; e00 = val.x; e10 = val.y; e01 = val.z; e11 = val.w; } 610 | // void GroupsharedLoadQuadV( uint addr, out lpfloat e00, out lpfloat e10, out lpfloat e01, out lpfloat e11 ) { lpfloat4 val = g_groupShared2x2FracEdgesV[addr]; e00 = val.x; e10 = val.y; e01 = val.z; e11 = val.w; } 611 | void GroupsharedLoadQuadHV( uint addr, out lpfloat2 e00, out lpfloat2 e10, out lpfloat2 e01, out lpfloat2 e11 ) 612 | { 613 | lpfloat4 valH = g_groupShared2x2FracEdgesH[addr]; 
e00.y = valH.x; e10.y = valH.y; e01.y = valH.z; e11.y = valH.w; 614 | lpfloat4 valV = g_groupShared2x2FracEdgesV[addr]; e00.x = valV.x; e10.x = valV.y; e01.x = valV.z; e11.x = valV.w; 615 | } 616 | 617 | /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 618 | // Edge detection compute shader 619 | /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 620 | //groupshared uint g_groupShared2x2ProcColors[(CMAA2_CS_INPUT_KERNEL_SIZE_X * 2 + 1) * (CMAA2_CS_INPUT_KERNEL_SIZE_Y * 2 + 1)]; 621 | //groupshared float3 g_groupSharedResolvedMSColors[(CMAA2_CS_INPUT_KERNEL_SIZE_X * 2 + 1) * (CMAA2_CS_INPUT_KERNEL_SIZE_Y * 2 + 1)]; 622 | // 623 | [numthreads( CMAA2_CS_INPUT_KERNEL_SIZE_X, CMAA2_CS_INPUT_KERNEL_SIZE_Y, 1 )] 624 | void EdgesColor2x2CS( uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID ) 625 | { 626 | // screen position in the input (expanded) kernel (shifted one 2x2 block up/left) 627 | uint2 pixelPos = groupID.xy * int2( CMAA2_CS_OUTPUT_KERNEL_SIZE_X, CMAA2_CS_OUTPUT_KERNEL_SIZE_Y ) + groupThreadID.xy - int2( 1, 1 ); 628 | pixelPos *= int2( 2, 2 ); 629 | 630 | const uint2 qeOffsets[4] = { {0, 0}, {1, 0}, {0, 1}, {1, 1} }; 631 | const uint rowStride2x2 = CMAA2_CS_INPUT_KERNEL_SIZE_X; 632 | const uint centerAddr2x2 = groupThreadID.x + groupThreadID.y * rowStride2x2; 633 | // const uint msaaSliceStride2x2 = CMAA2_CS_INPUT_KERNEL_SIZE_X * CMAA2_CS_INPUT_KERNEL_SIZE_Y; 634 | const bool inOutputKernel = !any( bool4( groupThreadID.x == ( CMAA2_CS_INPUT_KERNEL_SIZE_X - 1 ), groupThreadID.x == 0, groupThreadID.y == ( CMAA2_CS_INPUT_KERNEL_SIZE_Y - 1 ), groupThreadID.y == 0 ) ); 635 | 636 | uint i; 637 | lpfloat2 qe0, qe1, qe2, qe3; 638 | uint4 outEdges = { 0, 0, 0, 0 }; 639 | 640 | #if CMAA_MSAA_SAMPLE_COUNT > 1 641 | bool firstLoopIsEnough = false; 642 | 643 | #if CMAA_MSAA_USE_COMPLEXITY_MASK 644 | { 645 | float2 
texSize; 646 | g_inColorMSComplexityMaskReadonly.GetDimensions( texSize.x, texSize.y ); 647 | float2 gatherUV = float2(pixelPos) / texSize; 648 | float4 TL = g_inColorMSComplexityMaskReadonly.GatherRed( g_gather_point_clamp_Sampler, gatherUV, int2( 0, 0 ) ); 649 | float4 TR = g_inColorMSComplexityMaskReadonly.GatherRed( g_gather_point_clamp_Sampler, gatherUV, int2( 2, 0 ) ); 650 | float4 BL = g_inColorMSComplexityMaskReadonly.GatherRed( g_gather_point_clamp_Sampler, gatherUV, int2( 0, 2 ) ); 651 | float4 BR = g_inColorMSComplexityMaskReadonly.GatherRed( g_gather_point_clamp_Sampler, gatherUV, int2( 2, 2 ) ); 652 | float4 sumAll = TL+TR+BL+BR; 653 | firstLoopIsEnough = !any(sumAll); 654 | } 655 | #endif 656 | #endif 657 | 658 | 659 | // not optimal - to be optimized 660 | #if CMAA_MSAA_SAMPLE_COUNT > 1 661 | // clear this here to reduce complexity below - turns out it's quicker as well this way 662 | g_workingDeferredBlendItemListHeads[ uint2( pixelPos ) / 2 ] = 0xFFFFFFFF; 663 | [loop] 664 | for( uint msaaSampleIndex = 0; msaaSampleIndex < CMAA_MSAA_SAMPLE_COUNT; msaaSampleIndex++ ) 665 | { 666 | bool msaaSampleIsRelevant = !firstLoopIsEnough || msaaSampleIndex == 0; 667 | [branch] 668 | if( msaaSampleIsRelevant ) 669 | { 670 | #else 671 | { 672 | uint msaaSampleIndex = 0; 673 | #endif 674 | 675 | 676 | // edge detection 677 | #if CMAA2_EDGE_DETECTION_LUMA_PATH == 0 678 | lpfloat3 pixelColors[3 * 3 - 1]; 679 | [unroll] 680 | for( i = 0; i < 3 * 3 - 1; i++ ) 681 | pixelColors[i] = LoadSourceColor( pixelPos, int2( i % 3, i / 3 ), msaaSampleIndex ).rgb; 682 | 683 | [unroll] 684 | for( i = 0; i < 3 * 3 - 1; i++ ) 685 | pixelColors[i] = ProcessColorForEdgeDetect( pixelColors[i] ); 686 | 687 | qe0 = ComputeEdge( 0, 0, pixelColors ); 688 | qe1 = ComputeEdge( 1, 0, pixelColors ); 689 | qe2 = ComputeEdge( 0, 1, pixelColors ); 690 | qe3 = ComputeEdge( 1, 1, pixelColors ); 691 | #else // CMAA2_EDGE_DETECTION_LUMA_PATH != 0 692 | lpfloat pixelLumas[3 * 3 - 1]; 693 | #if 
CMAA2_EDGE_DETECTION_LUMA_PATH == 1 // compute in-place 694 | [unroll] 695 | for( i = 0; i < 3 * 3 - 1; i++ ) 696 | { 697 | lpfloat3 color = LoadSourceColor( pixelPos, int2( i % 3, i / 3 ), msaaSampleIndex ).rgb; 698 | pixelLumas[i] = RGBToLumaForEdges( color ); 699 | } 700 | #elif CMAA2_EDGE_DETECTION_LUMA_PATH == 2 // source from outside 701 | #if 0 // same as below, just without Gather 702 | [unroll] 703 | for( i = 0; i < 3 * 3 - 1; i++ ) 704 | pixelLumas[i] = g_inLumaReadonly.Load( int3( pixelPos, 0 ), int2( i % 3, i / 3 ) ).r; 705 | #else 706 | float2 texSize; 707 | g_inLumaReadonly.GetDimensions( texSize.x, texSize.y ); 708 | float2 gatherUV = (float2(pixelPos) + float2( 0.5, 0.5 )) / texSize; 709 | float4 TL = g_inLumaReadonly.GatherRed( g_gather_point_clamp_Sampler, gatherUV ); 710 | float4 TR = g_inLumaReadonly.GatherRed( g_gather_point_clamp_Sampler, gatherUV, int2( 1, 0 ) ); 711 | float4 BL = g_inLumaReadonly.GatherRed( g_gather_point_clamp_Sampler, gatherUV, int2( 0, 1 ) ); 712 | pixelLumas[0] = TL.w; pixelLumas[1] = TL.z; pixelLumas[2] = TR.z; pixelLumas[3] = TL.x; 713 | pixelLumas[4] = TL.y; pixelLumas[5] = TR.y; pixelLumas[6] = BL.x; pixelLumas[7] = BL.y; 714 | #endif 715 | #elif CMAA2_EDGE_DETECTION_LUMA_PATH == 3 // source in alpha channel of input color 716 | float2 texSize; 717 | g_inoutColorReadonly.GetDimensions( texSize.x, texSize.y ); 718 | float2 gatherUV = (float2(pixelPos) + float2( 0.5, 0.5 )) / texSize; 719 | float4 TL = g_inoutColorReadonly.GatherAlpha( g_gather_point_clamp_Sampler, gatherUV ); 720 | float4 TR = g_inoutColorReadonly.GatherAlpha( g_gather_point_clamp_Sampler, gatherUV, int2( 1, 0 ) ); 721 | float4 BL = g_inoutColorReadonly.GatherAlpha( g_gather_point_clamp_Sampler, gatherUV, int2( 0, 1 ) ); 722 | pixelLumas[0] = (lpfloat)TL.w; pixelLumas[1] = (lpfloat)TL.z; pixelLumas[2] = (lpfloat)TR.z; pixelLumas[3] = (lpfloat)TL.x; 723 | pixelLumas[4] = (lpfloat)TL.y; pixelLumas[5] = (lpfloat)TR.y; pixelLumas[6] = (lpfloat)BL.x; 
pixelLumas[7] = (lpfloat)BL.y; 724 | #endif 725 | qe0 = ComputeEdgeLuma( 0, 0, pixelLumas ); 726 | qe1 = ComputeEdgeLuma( 1, 0, pixelLumas ); 727 | qe2 = ComputeEdgeLuma( 0, 1, pixelLumas ); 728 | qe3 = ComputeEdgeLuma( 1, 1, pixelLumas ); 729 | #endif 730 | 731 | g_groupShared2x2FracEdgesV[centerAddr2x2 + rowStride2x2 * 0] = lpfloat4( qe0.x, qe1.x, qe2.x, qe3.x ); 732 | g_groupShared2x2FracEdgesH[centerAddr2x2 + rowStride2x2 * 0] = lpfloat4( qe0.y, qe1.y, qe2.y, qe3.y ); 733 | 734 | #if CMAA_MSAA_SAMPLE_COUNT > 1 735 | } // if( msaaSampleIsRelevant ) 736 | #endif 737 | 738 | GroupMemoryBarrierWithGroupSync( ); 739 | 740 | [branch] 741 | if( inOutputKernel ) 742 | { 743 | lpfloat2 topRow = g_groupShared2x2FracEdgesH[ centerAddr2x2 - rowStride2x2 ].zw; // top row's bottom edge 744 | lpfloat2 leftColumn = g_groupShared2x2FracEdgesV[ centerAddr2x2 - 1 ].yw; // left column's right edge 745 | 746 | bool someNonZeroEdges = any( lpfloat4( qe0, qe1 ) + lpfloat4( qe2, qe3 ) + lpfloat4( topRow[0], topRow[1], leftColumn[0], leftColumn[1] ) ); 747 | //bool someNonZeroEdges = packedCenterEdges.x | packedCenterEdges.y | (packedQuadP0M1.y & 0xFFFF0000) | (packedQuadM1P0.x & 0xFF00FF00); 748 | 749 | [branch] 750 | if( someNonZeroEdges ) 751 | { 752 | #if CMAA_MSAA_SAMPLE_COUNT == 1 753 | // Clear deferred color list heads to empty (if potentially needed - even though some edges might get culled by local contrast adaptation 754 | // step below, it's still cheaper to just clear it without additional logic) 755 | #if !defined(PLATFORM_NO_TEXTURE_ATOMICS) 756 | g_workingDeferredBlendItemListHeads[ uint2( pixelPos ) / 2 ] = 0xFFFFFFFF; 757 | #else 758 | uint quadPosFlat = pixelPos.x / 2 + pixelPos.y / 2 * g_workingDeferredBlendItemListHeads_Width; 759 | g_workingDeferredBlendItemListHeads[ quadPosFlat ] = 0xFFFFFFFF; 760 | #endif 761 | #endif 762 | 763 | lpfloat4 ce[4]; 764 | 765 | #if 1 // local contrast adaptation 766 | lpfloat2 dummyd0, dummyd1, dummyd2; 767 | lpfloat2 
neighbourhood[4][4]; 768 | 769 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 770 | // load & unpack kernel data from SLM 771 | GroupsharedLoadQuadHV( centerAddr2x2 - rowStride2x2 - 1 , dummyd0, dummyd1, dummyd2, neighbourhood[0][0] ); 772 | GroupsharedLoadQuadHV( centerAddr2x2 - rowStride2x2 , dummyd0, dummyd1, neighbourhood[1][0], neighbourhood[2][0] ); 773 | GroupsharedLoadQuadHV( centerAddr2x2 - rowStride2x2 + 1 , dummyd0, dummyd1, neighbourhood[3][0], dummyd2 ); 774 | GroupsharedLoadQuadHV( centerAddr2x2 - 1 , dummyd0, neighbourhood[0][1], dummyd1, neighbourhood[0][2] ); 775 | GroupsharedLoadQuadHV( centerAddr2x2 + 1 , neighbourhood[3][1], dummyd0, neighbourhood[3][2], dummyd1 ); 776 | GroupsharedLoadQuadHV( centerAddr2x2 - 1 + rowStride2x2 , dummyd0, neighbourhood[0][3], dummyd1, dummyd2 ); 777 | GroupsharedLoadQuadHV( centerAddr2x2 + rowStride2x2 , neighbourhood[1][3], neighbourhood[2][3], dummyd0, dummyd1 ); 778 | neighbourhood[1][0].y = topRow[0]; // already in registers 779 | neighbourhood[2][0].y = topRow[1]; // already in registers 780 | neighbourhood[0][1].x = leftColumn[0]; // already in registers 781 | neighbourhood[0][2].x = leftColumn[1]; // already in registers 782 | neighbourhood[1][1] = qe0; // already in registers 783 | neighbourhood[2][1] = qe1; // already in registers 784 | neighbourhood[1][2] = qe2; // already in registers 785 | neighbourhood[2][2] = qe3; // already in registers 786 | //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 787 | 788 | topRow[0] = ( topRow[0] - ComputeLocalContrastH( 0, -1, neighbourhood ) ) > GetActualEdgeThreshold(); 789 | topRow[1] = ( topRow[1] - ComputeLocalContrastH( 1, -1, neighbourhood ) ) > GetActualEdgeThreshold(); 790 | leftColumn[0] = ( leftColumn[0] - ComputeLocalContrastV( -1, 0, neighbourhood ) ) > GetActualEdgeThreshold(); 791 | leftColumn[1] = ( 
leftColumn[1] - ComputeLocalContrastV( -1, 1, neighbourhood ) ) > GetActualEdgeThreshold(); 792 | 793 | ce[0].x = ( qe0.x - ComputeLocalContrastV( 0, 0, neighbourhood ) ) > GetActualEdgeThreshold(); 794 | ce[0].y = ( qe0.y - ComputeLocalContrastH( 0, 0, neighbourhood ) ) > GetActualEdgeThreshold(); 795 | ce[1].x = ( qe1.x - ComputeLocalContrastV( 1, 0, neighbourhood ) ) > GetActualEdgeThreshold(); 796 | ce[1].y = ( qe1.y - ComputeLocalContrastH( 1, 0, neighbourhood ) ) > GetActualEdgeThreshold(); 797 | ce[2].x = ( qe2.x - ComputeLocalContrastV( 0, 1, neighbourhood ) ) > GetActualEdgeThreshold(); 798 | ce[2].y = ( qe2.y - ComputeLocalContrastH( 0, 1, neighbourhood ) ) > GetActualEdgeThreshold(); 799 | ce[3].x = ( qe3.x - ComputeLocalContrastV( 1, 1, neighbourhood ) ) > GetActualEdgeThreshold(); 800 | ce[3].y = ( qe3.y - ComputeLocalContrastH( 1, 1, neighbourhood ) ) > GetActualEdgeThreshold(); 801 | #else 802 | topRow[0] = topRow[0] > GetActualEdgeThreshold(); 803 | topRow[1] = topRow[1] > GetActualEdgeThreshold(); 804 | leftColumn[0] = leftColumn[0]> GetActualEdgeThreshold(); 805 | leftColumn[1] = leftColumn[1]> GetActualEdgeThreshold(); 806 | ce[0].x = qe0.x > GetActualEdgeThreshold(); 807 | ce[0].y = qe0.y > GetActualEdgeThreshold(); 808 | ce[1].x = qe1.x > GetActualEdgeThreshold(); 809 | ce[1].y = qe1.y > GetActualEdgeThreshold(); 810 | ce[2].x = qe2.x > GetActualEdgeThreshold(); 811 | ce[2].y = qe2.y > GetActualEdgeThreshold(); 812 | ce[3].x = qe3.x > GetActualEdgeThreshold(); 813 | ce[3].y = qe3.y > GetActualEdgeThreshold(); 814 | #endif 815 | 816 | //left 817 | ce[0].z = leftColumn[0]; 818 | ce[1].z = ce[0].x; 819 | ce[2].z = leftColumn[1]; 820 | ce[3].z = ce[2].x; 821 | 822 | // top 823 | ce[0].w = topRow[0]; 824 | ce[1].w = topRow[1]; 825 | ce[2].w = ce[0].y; 826 | ce[3].w = ce[1].y; 827 | 828 | [unroll] 829 | for( i = 0; i < 4; i++ ) 830 | { 831 | const uint2 localPixelPos = pixelPos + qeOffsets[i]; 832 | 833 | const lpfloat4 edges = ce[i]; 834 | 835 | // 
if there's at least one two edge corner, this is a candidate for simple or complex shape processing... 836 | bool isCandidate = ( edges.x * edges.y + edges.y * edges.z + edges.z * edges.w + edges.w * edges.x ) != 0; 837 | if( isCandidate ) 838 | { 839 | uint counterIndex; g_workingControlBuffer.InterlockedAdd( 4*4, 1, counterIndex ); 840 | g_workingShapeCandidates[counterIndex] = (localPixelPos.x << 18) | (msaaSampleIndex << 14) | localPixelPos.y; 841 | } 842 | 843 | // Write out edges - we write out all, including empty pixels, to make sure shape detection edge tracing 844 | // doesn't continue on previous frame's edges that no longer exist. 845 | uint packedEdge = PackEdges( edges ); 846 | #if CMAA_MSAA_SAMPLE_COUNT > 1 847 | outEdges[i] |= packedEdge << (msaaSampleIndex * 4); 848 | #else 849 | outEdges[i] = packedEdge; 850 | #endif 851 | } 852 | } 853 | } 854 | } 855 | 856 | // finally, write the edges! 857 | [branch] 858 | if( inOutputKernel ) 859 | { 860 | #if CMAA_PACK_SINGLE_SAMPLE_EDGE_TO_HALF_WIDTH && CMAA_MSAA_SAMPLE_COUNT == 1 861 | #if CMAA2_EDGE_UNORM 862 | g_workingEdges[ int2(pixelPos.x/2, pixelPos.y+0) ] = ((outEdges[1] << 4) | outEdges[0]) / 255.0; 863 | g_workingEdges[ int2(pixelPos.x/2, pixelPos.y+1) ] = ((outEdges[3] << 4) | outEdges[2]) / 255.0; 864 | #else 865 | g_workingEdges[ int2(pixelPos.x/2, pixelPos.y+0) ] = (outEdges[1] << 4) | outEdges[0]; 866 | g_workingEdges[ int2(pixelPos.x/2, pixelPos.y+1) ] = (outEdges[3] << 4) | outEdges[2]; 867 | #endif 868 | #else 869 | { 870 | [unroll] for( uint i = 0; i < 4; i++ ) 871 | g_workingEdges[pixelPos + qeOffsets[i]] = outEdges[i]; 872 | } 873 | #endif 874 | } 875 | } 876 | /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 877 | 878 | /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 879 | // Compute shaders used to generate DispatchIndirec() control buffer 880 | 
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Compute dispatch arguments for the DispatchIndirect() that calls ProcessCandidatesCS and DeferredColorApply2x2CS
//
// g_workingControlBuffer byte offsets used here:
//   4*3  - number of items the next indirect pass should process (written here)
//   4*4  - shape candidate counter, 4*8 - blend location counter, 4*12 - blend item counter
[numthreads( 1, 1, 1 )]
void ComputeDispatchArgsCS( uint3 groupID : SV_GroupID )
{
    // activated once on Dispatch( 2, 1, 1 )
    if( groupID.x == 1 )
    {
        // get current count
        uint shapeCandidateCount = g_workingControlBuffer.Load(4*4);

        // check for overflow!
        uint appendBufferMaxCount; uint appendBufferStride;
        GET_BUFFER_DIMENSIONS(g_workingShapeCandidates, appendBufferMaxCount, appendBufferStride);
        shapeCandidateCount = min( shapeCandidateCount, appendBufferMaxCount );

        // write dispatch indirect arguments for ProcessCandidatesCS
        g_workingExecuteIndirectBuffer.Store( 4*0, ( shapeCandidateCount + CMAA2_PROCESS_CANDIDATES_NUM_THREADS - 1 ) / CMAA2_PROCESS_CANDIDATES_NUM_THREADS );
        g_workingExecuteIndirectBuffer.Store( 4*1, 1 );
        g_workingExecuteIndirectBuffer.Store( 4*2, 1 );

        // write actual number of items to process in ProcessCandidatesCS
        g_workingControlBuffer.Store( 4*3, shapeCandidateCount );
    }
    // activated once on Dispatch( 1, 2, 1 )
    else if( groupID.y == 1 )
    {
        // get current count
        uint blendLocationCount = g_workingControlBuffer.Load(4*8);

        // check for overflow!
        {
            uint appendBufferMaxCount; uint appendBufferStride;
            GET_BUFFER_DIMENSIONS(g_workingDeferredBlendLocationList, appendBufferMaxCount, appendBufferStride);
            blendLocationCount = min( blendLocationCount, appendBufferMaxCount );
        }

        // write dispatch indirect arguments for DeferredColorApply2x2CS
#if CMAA2_DEFERRED_APPLY_THREADGROUP_SWAP
        g_workingExecuteIndirectBuffer.Store( 4*0, 1 );
        g_workingExecuteIndirectBuffer.Store( 4*1, ( blendLocationCount + CMAA2_DEFERRED_APPLY_NUM_THREADS - 1 ) / CMAA2_DEFERRED_APPLY_NUM_THREADS );
#else
        g_workingExecuteIndirectBuffer.Store( 4*0, ( blendLocationCount + CMAA2_DEFERRED_APPLY_NUM_THREADS - 1 ) / CMAA2_DEFERRED_APPLY_NUM_THREADS );
        g_workingExecuteIndirectBuffer.Store( 4*1, 1 );
#endif
        g_workingExecuteIndirectBuffer.Store( 4*2, 1 );

        // write actual number of items to process in DeferredColorApply2x2CS
        g_workingControlBuffer.Store( 4*3, blendLocationCount);

        // clear counters for next frame
        g_workingControlBuffer.Store( 4*4 , 0 );
        g_workingControlBuffer.Store( 4*8 , 0 );
        g_workingControlBuffer.Store( 4*12, 0 );
    }
}
//
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////


// Traces the two arms of a Z shape starting at 'screenPos' along -stepRight / +stepRight and returns their
// lengths (both start at 1). Tracing of an arm continues while the loaded edge still has the traced bit set;
// the search ends when both arms stopped, when c_maxLineLength is reached, or when the longer arm is ahead of
// the shorter one by the factor encoded in the termination test below.
void FindZLineLengths( out lpfloat lineLengthLeft, out lpfloat lineLengthRight, uint2 screenPos, uniform bool horizontal, uniform bool invertedZShape, const float2 stepRight, uint msaaSampleIndex )
{
// this enables additional conservativeness test but is pretty detrimental to the final effect so left disabled by default even when CMAA2_EXTRA_SHARPNESS is enabled
#define CMAA2_EXTRA_CONSERVATIVENESS2 0
    /////////////////////////////////////////////////////////////////////////////////////////////////////////
    // TODO: a cleaner and faster way to get to these - a precalculated array indexing maybe?
    uint maskLeft, bitsContinueLeft, maskRight, bitsContinueRight;
    {
        // Horizontal (vertical is the same, just rotated 90 degrees counter-clockwise)
        // Inverted Z case:              // Normal Z case:
        //   __                          // __
        //  X|                           //  X|
        // --                            //   --
        uint maskTraceLeft, maskTraceRight;
#if CMAA2_EXTRA_CONSERVATIVENESS2
        uint maskStopLeft, maskStopRight;
#endif
        if( horizontal )
        {
            maskTraceLeft    = 0x08; // tracing top edge
            maskTraceRight   = 0x02; // tracing bottom edge
#if CMAA2_EXTRA_CONSERVATIVENESS2
            maskStopLeft   = 0x01; // stop on right edge
            maskStopRight  = 0x04; // stop on left edge
#endif
        }
        else
        {
            maskTraceLeft    = 0x04; // tracing left edge
            maskTraceRight   = 0x01; // tracing right edge
#if CMAA2_EXTRA_CONSERVATIVENESS2
            maskStopLeft   = 0x08; // stop on top edge
            maskStopRight  = 0x02; // stop on bottom edge
#endif
        }
        if( invertedZShape )
        {
            uint temp       = maskTraceLeft;
            maskTraceLeft   = maskTraceRight;
            maskTraceRight  = temp;
        }
        maskLeft = maskTraceLeft;
        bitsContinueLeft = maskTraceLeft;
        maskRight = maskTraceRight;
#if CMAA2_EXTRA_CONSERVATIVENESS2
        maskLeft |= maskStopLeft;
        maskRight |= maskStopRight;
#endif
        bitsContinueRight = maskTraceRight;
    }
    /////////////////////////////////////////////////////////////////////////////////////////////////////////

    bool continueLeft = true;
    bool continueRight = true;
    lineLengthLeft = 1;
    lineLengthRight = 1;
    [loop]
    for( ; ; )
    {
        uint edgeLeft =     LoadEdge( screenPos.xy - stepRight * float(lineLengthLeft)          , int2( 0, 0 ), msaaSampleIndex );
        uint edgeRight =    LoadEdge( screenPos.xy + stepRight * ( float(lineLengthRight) + 1 ) , int2( 0, 0 ), msaaSampleIndex );

        // stop on encountering 'stopping' edge (as defined by masks)
        continueLeft    = continueLeft  && ( ( edgeLeft & maskLeft ) == bitsContinueLeft );
        continueRight   = continueRight && ( ( edgeRight & maskRight ) == bitsContinueRight );

        lineLengthLeft += continueLeft;
        lineLengthRight += continueRight;

        lpfloat maxLR = max( lineLengthRight, lineLengthLeft );

        // both stopped? cause the search end by setting maxLR to max length.
        if( !continueLeft && !continueRight )
            maxLR = (lpfloat)c_maxLineLength;

        // either the longer one is ahead of the smaller (already stopped) one by more than a factor of x, or both
        // are stopped - end the search.
#if CMAA2_EXTRA_SHARPNESS
        if( maxLR >= min( (lpfloat)c_maxLineLength, (1.20 * min( lineLengthRight, lineLengthLeft ) - 0.20) ) )
#else
        if( maxLR >= min( (lpfloat)c_maxLineLength, (1.25 * min( lineLengthRight, lineLengthLeft ) - 0.25) ) )
#endif
            break;
    }
}

// these are blendZ settings, determined empirically :)
static const lpfloat c_symmetryCorrectionOffset = lpfloat( 0.22 );
#if CMAA2_EXTRA_SHARPNESS
static const lpfloat c_dampeningEffect          = lpfloat( 0.11 );
#else
static const lpfloat c_dampeningEffect          = lpfloat( 0.15 );
#endif

#if CMAA2_COLLECT_EXPAND_BLEND_ITEMS
// Same blend-factor computation as BlendZs(), but instead of loading/blending/storing colors right away it
// appends one encoded item per affected pixel to g_groupSharedBlendItems.
// Returns false (nothing written) when the items would not fit into CMAA2_BLEND_ITEM_SLM_SIZE; note that
// g_groupSharedBlendItemCount has already been advanced in that case.
// 'stepRight' is not used here; it is kept so the signature mirrors BlendZs().
bool CollectBlendZs( uint2 screenPos, bool horizontal, bool invertedZShape, lpfloat shapeQualityScore, lpfloat lineLengthLeft, lpfloat lineLengthRight, float2 stepRight, uint msaaSampleIndex )
{
    lpfloat leftOdd = c_symmetryCorrectionOffset * lpfloat( lineLengthLeft % 2 );
    lpfloat rightOdd = c_symmetryCorrectionOffset * lpfloat( lineLengthRight % 2 );

    lpfloat dampenEffect = saturate( lpfloat(lineLengthLeft + lineLengthRight - shapeQualityScore) * c_dampeningEffect ) ;

    lpfloat loopFrom = -floor( ( lineLengthLeft + 1 ) / 2 ) + 1.0;
    lpfloat loopTo = floor( ( lineLengthRight + 1 ) / 2 );

    // reserve a contiguous range of slots for all items of this shape
    uint itemIndex;
    const uint blendItemCount = loopTo-loopFrom+1;
    InterlockedAdd( g_groupSharedBlendItemCount, blendItemCount, itemIndex );
    // safety
    if( (itemIndex+blendItemCount) > CMAA2_BLEND_ITEM_SLM_SIZE )
        return false;

    lpfloat totalLength = lpfloat(loopTo - loopFrom) + 1 - leftOdd - rightOdd;
    lpfloat lerpStep = lpfloat(1.0) / totalLength;

    lpfloat lerpFromK = (0.5 - leftOdd - loopFrom) * lerpStep;

    uint itemHeader     = (screenPos.x << 18) | (msaaSampleIndex << 14) | screenPos.y;
    uint itemValStatic  = (horizontal << 31) | (invertedZShape << 30);

    for( lpfloat i = loopFrom; i <= loopTo; i++ )
    {
        lpfloat lerpVal = lerpStep * i + lerpFromK;

        lpfloat secondPart = (i>0);
        lpfloat srcOffset = 1.0 - secondPart * 2.0;

        lpfloat lerpK = lerpVal * srcOffset + secondPart;
        lerpK *= dampenEffect;

        // unsigned, matching itemHeader / itemValStatic and the uint2 element type of g_groupSharedBlendItems
        uint2 encodedItem;
        encodedItem.x = itemHeader;
        // i and srcOffset are biased by 256 so that their negative values can be stored as unsigned bit fields
        encodedItem.y = itemValStatic | ((uint(i+256) /*& 0x3FF*/) << 20) | ( (uint(srcOffset+256) /*& 0x3FF*/ ) << 10 ) | uint( saturate(lerpK) * 1023 + 0.5 );
        g_groupSharedBlendItems[itemIndex++] = encodedItem;
    }
    return true;
}
#endif

// Blends along a detected Z shape: for every pixel from 'loopFrom' to 'loopTo' steps along 'stepRight' it lerps
// the pixel's color towards its neighbour across the edge (direction given by horizontal / invertedZShape and
// by which half of the shape the pixel is in) and hands the result to StoreColorSample() as a complex-shape item.
void BlendZs( uint2 screenPos, bool horizontal, bool invertedZShape, lpfloat shapeQualityScore, lpfloat lineLengthLeft, lpfloat lineLengthRight, float2 stepRight, uint msaaSampleIndex )
{
    float2 blendDir = ( horizontal ) ? ( float2( 0, -1 ) ) : ( float2( -1, 0 ) );

    if( invertedZShape )
        blendDir = -blendDir;

    lpfloat leftOdd = c_symmetryCorrectionOffset * lpfloat( lineLengthLeft % 2 );
    lpfloat rightOdd = c_symmetryCorrectionOffset * lpfloat( lineLengthRight % 2 );

    lpfloat dampenEffect = saturate( lpfloat(lineLengthLeft + lineLengthRight - shapeQualityScore) * c_dampeningEffect ) ;

    lpfloat loopFrom = -floor( ( lineLengthLeft + 1 ) / 2 ) + 1.0;
    lpfloat loopTo = floor( ( lineLengthRight + 1 ) / 2 );

    lpfloat totalLength = lpfloat(loopTo - loopFrom) + 1 - leftOdd - rightOdd;
    lpfloat lerpStep = lpfloat(1.0) / totalLength;

    lpfloat lerpFromK = (0.5 - leftOdd - loopFrom) * lerpStep;

    for( lpfloat i = loopFrom; i <= loopTo; i++ )
    {
        lpfloat lerpVal = lerpStep * i + lerpFromK;

        // second half of the shape (i > 0) blends from the opposite side with a mirrored factor
        lpfloat secondPart = (i>0);
        lpfloat srcOffset = 1.0 - secondPart * 2.0;

        lpfloat lerpK = lerpVal * srcOffset + secondPart;
        lerpK *= dampenEffect;

        float2 pixelPos = screenPos + stepRight * float(i);

        lpfloat3 colorCenter    = LoadSourceColor( pixelPos, int2( 0, 0 ), msaaSampleIndex ).rgb;
        lpfloat3 colorFrom      = LoadSourceColor( pixelPos.xy + blendDir * float(srcOffset).xx, int2( 0, 0 ), msaaSampleIndex ).rgb;

        lpfloat3 output = lerp( colorCenter.rgb, colorFrom.rgb, lerpK );

        StoreColorSample( pixelPos.xy, output, true, msaaSampleIndex );
    }
}

// TODO:
// There were issues with moving this (including the calling code) to half-float on some hardware (broke in certain cases on RX 480).
// Further investigation is required.
// Scores how well the edges around the current pixel match the two horizontal Z-shape patterns.
//
//   edges      - unpacked edges of the current pixel
//   edgesM1P0  - unpacked edges of the pixel one step to the left
//   edgesP1P0  - unpacked edges of the pixel one step to the right
//   edgesP2P0  - unpacked edges of the pixel two steps to the right
//   invertedZScore / normalZScore - outputs, one score per pattern
//
// Each score is the product of the three edge components the pattern requires (so it is zero as
// soon as one of them is missing), multiplied by a heuristic factor: 2.0, plus the edges that
// continue the shape, minus the edges that contradict it. The factor is not clamped, so a score
// can come out <= 0; the caller in this file only acts on scores > 0.
// The same routine is reused for the vertical case by passing rotated (.argb) inputs.
void DetectZsHorizontal( in lpfloat4 edges, in lpfloat4 edgesM1P0, in lpfloat4 edgesP1P0, in lpfloat4 edgesP2P0, out lpfloat invertedZScore, out lpfloat normalZScore )
{
    // Inverted Z case:
    // __
    // X|
    // --
    {
        invertedZScore = edges.r * edges.g * edgesP1P0.a;
        invertedZScore *= 2.0 + ((edgesM1P0.g + edgesP2P0.a) ) - (edges.a + edgesP1P0.g) - 0.7 * (edgesP2P0.g + edgesM1P0.a + edges.b + edgesP1P0.r);
    }

    // Normal Z case:
    // __
    // X|
    // --
    {
        normalZScore = edges.r * edges.a * edgesP1P0.g;
        normalZScore *= 2.0 + ((edgesM1P0.a + edgesP2P0.g) ) - (edges.g + edgesP1P0.a) - 0.7 * (edgesP2P0.a + edgesM1P0.g + edges.b + edgesP1P0.r);
    }
}

[numthreads( CMAA2_PROCESS_CANDIDATES_NUM_THREADS, 1, 1 )]
void ProcessCandidatesCS( uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID )
{
#if CMAA2_COLLECT_EXPAND_BLEND_ITEMS
    if( groupThreadID.x == 0 )
        g_groupSharedBlendItemCount = 0;
    GroupMemoryBarrierWithGroupSync( );
#endif

    uint msaaSampleIndex = 0;
    const uint numCandidates = g_workingControlBuffer.Load(4*3); //g_workingControlBuffer[3];
    if( dispatchThreadID.x < numCandidates )
    {

        uint pixelID = g_workingShapeCandidates[dispatchThreadID.x];

#if 0 // debug display
        uint2 screenSize;
        g_inoutColorReadonly.GetDimensions( screenSize.x, screenSize.y );
        StoreColorSample( uint2(dispatchThreadID.x % screenSize.x, dispatchThreadID.x / screenSize.x), lpfloat3( 1, 1, 0 ), false, msaaSampleIndex );
        return;
#endif

        uint2 pixelPos = uint2( (pixelID >> 18) /*& 0x3FFF*/, pixelID & 0x3FFF );
#if CMAA_MSAA_SAMPLE_COUNT > 1
        msaaSampleIndex = (pixelID >> 14) & 0x07;
#endif

#if CMAA_MSAA_SAMPLE_COUNT > 1
        int4 loadPosCenter =
int4( pixelPos, msaaSampleIndex, 0 ); 1175 | #else 1176 | int3 loadPosCenter = int3( pixelPos, 0 ); 1177 | #endif 1178 | 1179 | uint edgesCenterPacked = LoadEdge( pixelPos, int2( 0, 0 ), msaaSampleIndex ); 1180 | lpfloat4 edges = UnpackEdgesFlt( edgesCenterPacked ); 1181 | lpfloat4 edgesLeft = UnpackEdgesFlt( LoadEdge( pixelPos, int2( -1, 0 ), msaaSampleIndex ) ); 1182 | lpfloat4 edgesRight = UnpackEdgesFlt( LoadEdge( pixelPos, int2( 1, 0 ), msaaSampleIndex ) ); 1183 | lpfloat4 edgesBottom= UnpackEdgesFlt( LoadEdge( pixelPos, int2( 0, 1 ), msaaSampleIndex ) ); 1184 | lpfloat4 edgesTop = UnpackEdgesFlt( LoadEdge( pixelPos, int2( 0, -1 ), msaaSampleIndex ) ); 1185 | 1186 | // simple shapes 1187 | { 1188 | lpfloat4 blendVal = ComputeSimpleShapeBlendValues( edges, edgesLeft, edgesRight, edgesTop, edgesBottom, true ); 1189 | 1190 | const lpfloat fourWeightSum = dot( blendVal, lpfloat4( 1, 1, 1, 1 ) ); 1191 | const lpfloat centerWeight = 1.0 - fourWeightSum; 1192 | 1193 | lpfloat3 outColor = LoadSourceColor( pixelPos, int2( 0, 0 ), msaaSampleIndex ).rgb * centerWeight; 1194 | [flatten] 1195 | if( blendVal.x > 0.0 ) // from left 1196 | { 1197 | lpfloat3 pixelL = LoadSourceColor( pixelPos, int2( -1, 0 ), msaaSampleIndex ).rgb; 1198 | outColor.rgb += blendVal.x * pixelL; 1199 | } 1200 | [flatten] 1201 | if( blendVal.y > 0.0 ) // from above 1202 | { 1203 | lpfloat3 pixelT = LoadSourceColor( pixelPos, int2( 0, -1 ), msaaSampleIndex ).rgb; 1204 | outColor.rgb += blendVal.y * pixelT; 1205 | } 1206 | [flatten] 1207 | if( blendVal.z > 0.0 ) // from right 1208 | { 1209 | lpfloat3 pixelR = LoadSourceColor( pixelPos, int2( 1, 0 ), msaaSampleIndex ).rgb; 1210 | outColor.rgb += blendVal.z * pixelR; 1211 | } 1212 | [flatten] 1213 | if( blendVal.w > 0.0 ) // from below 1214 | { 1215 | lpfloat3 pixelB = LoadSourceColor( pixelPos, int2( 0, 1 ), msaaSampleIndex ).rgb; 1216 | outColor.rgb += blendVal.w * pixelB; 1217 | } 1218 | 1219 | StoreColorSample( pixelPos.xy, outColor, false, 
msaaSampleIndex ); 1220 | } 1221 | 1222 | // complex shapes - detect 1223 | { 1224 | lpfloat invertedZScore; 1225 | lpfloat normalZScore; 1226 | lpfloat maxScore; 1227 | bool horizontal = true; 1228 | bool invertedZ = false; 1229 | // lpfloat shapeQualityScore; // 0 - best quality, 1 - some edges missing but ok, 2 & 3 - dubious but better than nothing 1230 | 1231 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 1232 | // horizontal 1233 | { 1234 | lpfloat4 edgesM1P0 = edgesLeft; 1235 | lpfloat4 edgesP1P0 = edgesRight; 1236 | lpfloat4 edgesP2P0 = UnpackEdgesFlt( LoadEdge( pixelPos, int2( 2, 0 ), msaaSampleIndex ) ); 1237 | 1238 | DetectZsHorizontal( edges, edgesM1P0, edgesP1P0, edgesP2P0, invertedZScore, normalZScore ); 1239 | maxScore = max( invertedZScore, normalZScore ); 1240 | 1241 | if( maxScore > 0 ) 1242 | { 1243 | invertedZ = invertedZScore > normalZScore; 1244 | } 1245 | } 1246 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 1247 | 1248 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 1249 | // vertical 1250 | { 1251 | // Reuse the same code for vertical (used for horizontal above), but rotate input data 90 degrees counter-clockwise, so that: 1252 | // left becomes bottom 1253 | // top becomes left 1254 | // right becomes top 1255 | // bottom becomes right 1256 | 1257 | // we also have to rotate edges, thus .argb 1258 | lpfloat4 edgesM1P0 = edgesBottom; 1259 | lpfloat4 edgesP1P0 = edgesTop; 1260 | lpfloat4 edgesP2P0 = UnpackEdgesFlt( LoadEdge( pixelPos, int2( 0, -2 ), msaaSampleIndex ) ); 1261 | 1262 | DetectZsHorizontal( edges.argb, edgesM1P0.argb, edgesP1P0.argb, edgesP2P0.argb, invertedZScore, normalZScore ); 1263 | lpfloat vertScore = max( invertedZScore, normalZScore ); 1264 | 1265 | if( vertScore > maxScore ) 1266 | { 1267 | maxScore = 
vertScore; 1268 | horizontal = false; 1269 | invertedZ = invertedZScore > normalZScore; 1270 | //shapeQualityScore = floor( clamp(4.0 - maxScore, 0.0, 3.0) ); 1271 | } 1272 | } 1273 | ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 1274 | 1275 | if( maxScore > 0 ) 1276 | { 1277 | #if CMAA2_EXTRA_SHARPNESS 1278 | lpfloat shapeQualityScore = round( clamp(4.0 - maxScore, 0.0, 3.0) ); // 0 - best quality, 1 - some edges missing but ok, 2 & 3 - dubious but better than nothing 1279 | #else 1280 | lpfloat shapeQualityScore = floor( clamp(4.0 - maxScore, 0.0, 3.0) ); // 0 - best quality, 1 - some edges missing but ok, 2 & 3 - dubious but better than nothing 1281 | #endif 1282 | 1283 | const float2 stepRight = ( horizontal ) ? ( float2( 1, 0 ) ) : ( float2( 0, -1 ) ); 1284 | lpfloat lineLengthLeft, lineLengthRight; 1285 | FindZLineLengths( lineLengthLeft, lineLengthRight, pixelPos, horizontal, invertedZ, stepRight, msaaSampleIndex ); 1286 | 1287 | lineLengthLeft -= shapeQualityScore; 1288 | lineLengthRight -= shapeQualityScore; 1289 | 1290 | if( ( lineLengthLeft + lineLengthRight ) >= (5.0) ) 1291 | { 1292 | #if CMAA2_COLLECT_EXPAND_BLEND_ITEMS 1293 | // try adding to SLM but fall back to in-place processing if full (which only really happens in synthetic test cases) 1294 | if( !CollectBlendZs( pixelPos, horizontal, invertedZ, shapeQualityScore, lineLengthLeft, lineLengthRight, stepRight, msaaSampleIndex ) ) 1295 | #endif 1296 | BlendZs( pixelPos, horizontal, invertedZ, shapeQualityScore, lineLengthLeft, lineLengthRight, stepRight, msaaSampleIndex ); 1297 | } 1298 | } 1299 | } 1300 | 1301 | } 1302 | 1303 | #if CMAA2_COLLECT_EXPAND_BLEND_ITEMS 1304 | GroupMemoryBarrierWithGroupSync( ); 1305 | 1306 | uint totalItemCount = min( CMAA2_BLEND_ITEM_SLM_SIZE, g_groupSharedBlendItemCount ); 1307 | 1308 | // spread items into waves 1309 | uint loops = 
(totalItemCount+(CMAA2_PROCESS_CANDIDATES_NUM_THREADS-1)-groupThreadID.x)/CMAA2_PROCESS_CANDIDATES_NUM_THREADS; 1310 | 1311 | for( uint loop = 0; loop < loops; loop++ ) 1312 | { 1313 | uint index = loop*CMAA2_PROCESS_CANDIDATES_NUM_THREADS + groupThreadID.x; 1314 | 1315 | uint2 itemVal = g_groupSharedBlendItems[index]; 1316 | 1317 | uint2 startingPos = uint2( (itemVal.x >> 18) /*& 0x3FFF*/, itemVal.x & 0x3FFF ); 1318 | uint itemMSAASampleIndex= 0; 1319 | #if CMAA_MSAA_SAMPLE_COUNT > 1 1320 | itemMSAASampleIndex = (itemVal.x >> 14) & 0x07; 1321 | #endif 1322 | 1323 | bool itemHorizontal = (itemVal.y >> 31) & 1; 1324 | bool itemInvertedZ = (itemVal.y >> 30) & 1; 1325 | lpfloat itemStepIndex = float((itemVal.y >> 20) & 0x3FF) - 256.0; 1326 | lpfloat itemSrcOffset = ((itemVal.y >> 10) & 0x3FF) - 256.0; 1327 | lpfloat itemLerpK = (itemVal.y & 0x3FF) / 1023.0; 1328 | 1329 | lpfloat2 itemStepRight = ( itemHorizontal ) ? ( lpfloat2( 1, 0 ) ) : ( lpfloat2( 0, -1 ) ); 1330 | lpfloat2 itemBlendDir = ( itemHorizontal ) ? 
( lpfloat2( 0, -1 ) ) : ( lpfloat2( -1, 0 ) ); 1331 | if( itemInvertedZ ) 1332 | itemBlendDir = -itemBlendDir; 1333 | 1334 | uint2 itemPixelPos = startingPos + itemStepRight * lpfloat(itemStepIndex); 1335 | 1336 | lpfloat3 colorCenter = LoadSourceColor( itemPixelPos, int2( 0, 0 ), itemMSAASampleIndex ).rgb; 1337 | lpfloat3 colorFrom = LoadSourceColor( itemPixelPos.xy + itemBlendDir * lpfloat(itemSrcOffset).xx, int2( 0, 0 ), itemMSAASampleIndex ).rgb; 1338 | 1339 | lpfloat3 outputColor = lerp( colorCenter.rgb, colorFrom.rgb, itemLerpK ); 1340 | 1341 | StoreColorSample( itemPixelPos.xy, outputColor, true, itemMSAASampleIndex ); 1342 | } 1343 | #endif 1344 | 1345 | } 1346 | 1347 | #if CMAA2_DEFERRED_APPLY_THREADGROUP_SWAP 1348 | [numthreads( 4, CMAA2_DEFERRED_APPLY_NUM_THREADS, 1 )] 1349 | #else 1350 | [numthreads( CMAA2_DEFERRED_APPLY_NUM_THREADS, 4, 1 )] 1351 | #endif 1352 | void DeferredColorApply2x2CS( uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID ) 1353 | { 1354 | const uint numCandidates = g_workingControlBuffer.Load(4*3); 1355 | #if CMAA2_DEFERRED_APPLY_THREADGROUP_SWAP 1356 | const uint currentCandidate = dispatchThreadID.y; 1357 | const uint currentQuadOffsetXY = groupThreadID.x; 1358 | #else 1359 | const uint currentCandidate = dispatchThreadID.x; 1360 | const uint currentQuadOffsetXY = groupThreadID.y; 1361 | #endif 1362 | 1363 | if( currentCandidate >= numCandidates ) 1364 | return; 1365 | 1366 | uint pixelID = g_workingDeferredBlendLocationList[currentCandidate]; 1367 | uint2 quadPos = uint2( (pixelID >> 16), pixelID & 0xFFFF ); 1368 | const int2 qeOffsets[4] = { {0, 0}, {1, 0}, {0, 1}, {1, 1} }; 1369 | uint2 pixelPos = quadPos*2+qeOffsets[currentQuadOffsetXY]; 1370 | 1371 | #if !defined(PLATFORM_NO_TEXTURE_ATOMICS) 1372 | uint counterIndexWithHeader = g_workingDeferredBlendItemListHeads[quadPos]; 1373 | #else 1374 | uint quadPosFlat = quadPos.x + quadPos.y * g_workingDeferredBlendItemListHeads_Width; 1375 | uint 
counterIndexWithHeader = g_workingDeferredBlendItemListHeads[quadPosFlat]; 1376 | #endif 1377 | 1378 | int counter = 0; 1379 | 1380 | #if CMAA_MSAA_SAMPLE_COUNT > 1 1381 | lpfloat4 outColors[CMAA_MSAA_SAMPLE_COUNT]; 1382 | [unroll] 1383 | for( uint msaaSampleIndex = 0; msaaSampleIndex < CMAA_MSAA_SAMPLE_COUNT; msaaSampleIndex++ ) 1384 | outColors[msaaSampleIndex] = lpfloat4( 0, 0, 0, 0 ); 1385 | bool hasValue = false; 1386 | #else 1387 | lpfloat4 outColors = lpfloat4( 0, 0, 0, 0 ); 1388 | #endif 1389 | 1390 | const uint maxLoops = 32*CMAA_MSAA_SAMPLE_COUNT; // do the loop to prevent bad data hanging the GPU <- probably not needed 1391 | { 1392 | for( uint i = 0; (counterIndexWithHeader != 0xFFFFFFFF) && ( i < maxLoops); i ++ ) 1393 | { 1394 | // decode item-specific info: {2 bits for 2x2 quad location}, {3 bits for MSAA sample index}, {1 bit for isComplexShape flag}, {26 bits for address} 1395 | uint offsetXY = (counterIndexWithHeader >> 30) & 0x03; 1396 | uint msaaSampleIndex = (counterIndexWithHeader >> 27) & 0x07; 1397 | bool isComplexShape = (counterIndexWithHeader >> 26) & 0x01; 1398 | 1399 | uint2 val = g_workingDeferredBlendItemList[ counterIndexWithHeader & ((1 << 26) - 1) ]; 1400 | 1401 | counterIndexWithHeader = val.x; 1402 | 1403 | if( offsetXY == currentQuadOffsetXY ) 1404 | { 1405 | lpfloat3 color = InternalUnpackColor(val.y); 1406 | lpfloat weight = 0.8 + 1.0 * lpfloat(isComplexShape); 1407 | #if CMAA_MSAA_SAMPLE_COUNT > 1 1408 | outColors[msaaSampleIndex] += lpfloat4( color * weight, weight ); 1409 | hasValue = true; 1410 | #else 1411 | outColors += lpfloat4( color * weight, weight ); 1412 | #endif 1413 | } 1414 | //numberOfElements[offsetXY]++; 1415 | } 1416 | } 1417 | 1418 | #if CMAA_MSAA_SAMPLE_COUNT > 1 1419 | if( !hasValue ) return; 1420 | #else 1421 | if( outColors.a == 0 ) return; 1422 | #endif 1423 | 1424 | { 1425 | #if CMAA_MSAA_SAMPLE_COUNT > 1 1426 | lpfloat4 outColor = 0; 1427 | for( uint msaaSampleIndex = 0; msaaSampleIndex < 
CMAA_MSAA_SAMPLE_COUNT; msaaSampleIndex++ ) 1428 | { 1429 | if( outColors[msaaSampleIndex].a != 0 ) 1430 | outColor.xyz += outColors[msaaSampleIndex].rgb / (outColors[msaaSampleIndex].a); 1431 | else 1432 | outColor.xyz += LoadSourceColor( pixelPos, int2(0, 0), msaaSampleIndex ); 1433 | } 1434 | outColor /= (lpfloat)CMAA_MSAA_SAMPLE_COUNT; 1435 | #else 1436 | lpfloat4 outColor = outColors; 1437 | outColor.rgb /= outColor.a; 1438 | #endif 1439 | FinalUAVStore( pixelPos, lpfloat3(outColor.rgb) ); 1440 | } 1441 | } 1442 | 1443 | [numthreads( 16, 16, 1 )] 1444 | void DebugDrawEdgesCS( uint2 dispatchThreadID : SV_DispatchThreadID ) 1445 | { 1446 | int msaaSampleIndex = 0; 1447 | lpfloat4 edges = UnpackEdgesFlt( LoadEdge( dispatchThreadID, int2( 0, 0 ), msaaSampleIndex ) ); 1448 | 1449 | // show MSAA control mask 1450 | // uint v = g_inColorMSComplexityMaskReadonly.Load( int3( dispatchThreadID, 0 ) ); 1451 | // FinalUAVStore( dispatchThreadID, float3( v, v, v ) ); 1452 | // return; 1453 | 1454 | #if 0 1455 | #if CMAA_MSAA_SAMPLE_COUNT > 1 1456 | uint2 pixelPos = dispatchThreadID.xy / 2 * 2; 1457 | /* 1458 | uint all2x2MSSamplesDifferent = 0; 1459 | 1460 | [unroll] for( uint x = 0; x < 4; x++ ) 1461 | [unroll] for( uint y = 0; y < 4; y++ ) 1462 | all2x2MSSamplesDifferent |= g_inColorMSComplexityMaskReadonly.Load( int3( pixelPos, 0 ), int2( x-1, y-1 ) ) > 0; 1463 | bool firstLoopIsEnough = all2x2MSSamplesDifferent == 0; 1464 | */ 1465 | 1466 | #if CMAA_MSAA_USE_COMPLEXITY_MASK 1467 | float2 texSize; 1468 | g_inColorMSComplexityMaskReadonly.GetDimensions( texSize.x, texSize.y ); 1469 | float2 gatherUV = float2(pixelPos) / texSize; 1470 | float4 TL = g_inColorMSComplexityMaskReadonly.GatherRed( g_gather_point_clamp_Sampler, gatherUV, int2( 0, 0 ) ); 1471 | float4 TR = g_inColorMSComplexityMaskReadonly.GatherRed( g_gather_point_clamp_Sampler, gatherUV, int2( 2, 0 ) ); 1472 | float4 BL = g_inColorMSComplexityMaskReadonly.GatherRed( g_gather_point_clamp_Sampler, gatherUV, int2( 
0, 2 ) ); 1473 | float4 BR = g_inColorMSComplexityMaskReadonly.GatherRed( g_gather_point_clamp_Sampler, gatherUV, int2( 2, 2 ) ); 1474 | float4 sumAll = TL+TR+BL+BR; 1475 | bool firstLoopIsEnough = !any(sumAll); 1476 | 1477 | //all2x2MSSamplesDifferent = (all2x2MSSamplesDifferent != 0)?(CMAA_MSAA_SAMPLE_COUNT):(1); 1478 | FinalUAVStore( dispatchThreadID, (firstLoopIsEnough).xxx ); 1479 | return; 1480 | #endif 1481 | #endif 1482 | #endif 1483 | 1484 | 1485 | //if( any(edges) ) 1486 | { 1487 | lpfloat4 outputColor = lpfloat4( lerp( edges.xyz, (0.5).xxx, edges.a * 0.2 ), 1.0 ); 1488 | FinalUAVStore( dispatchThreadID, outputColor.rgb ); 1489 | } 1490 | 1491 | //#if CMAA2_EDGE_DETECTION_LUMA_PATH == 2 1492 | // FinalUAVStore( dispatchThreadID, g_inLumaReadonly.Load( int3( dispatchThreadID.xy, 0 ) ).r ); 1493 | //#endif 1494 | } 1495 | 1496 | #endif // #ifndef __cplusplus 1497 | 1498 | #endif // #ifndef __CMAA2_HLSL__ 1499 | -------------------------------------------------------------------------------- /Core/CMAA2.hlsl.meta: -------------------------------------------------------------------------------- 1 | fileFormatVersion: 2 2 | guid: 2d0fad4f2384140138d571b10ff5779e 3 | ShaderIncludeImporter: 4 | externalObjects: {} 5 | userData: 6 | assetBundleName: 7 | assetBundleVariant: 8 | -------------------------------------------------------------------------------- /Core/CMAA2RenderFeature.cs: -------------------------------------------------------------------------------- 1 | using UnityEngine; 2 | using UnityEngine.Rendering; 3 | using UnityEngine.Rendering.RenderGraphModule; 4 | using UnityEngine.Rendering.Universal; 5 | 6 | namespace CMAA2.Core 7 | { 8 | public class CMAA2RenderFeature : ScriptableRendererFeature 9 | { 10 | public ComputeShader CMAA2Compute; 11 | 12 | private CMAA2RenderPass _pass; 13 | 14 | public override void Create() 15 | { 16 | _pass = new CMAA2RenderPass(CMAA2Compute) 17 | { 18 | renderPassEvent = RenderPassEvent.BeforeRenderingPostProcessing, 
19 | }; 20 | } 21 | 22 | public override void AddRenderPasses(ScriptableRenderer renderer, ref RenderingData renderingData) 23 | { 24 | renderer.EnqueuePass(_pass); 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /Core/CMAA2RenderFeature.cs.meta: -------------------------------------------------------------------------------- 1 | fileFormatVersion: 2 2 | guid: 3a15937ac5f4421997dec111705d6d02 3 | MonoImporter: 4 | externalObjects: {} 5 | serializedVersion: 2 6 | defaultReferences: 7 | - CMAA2Compute: {fileID: 7200000, guid: bdca705f137f466380668861f6afe032, type: 3} 8 | executionOrder: 0 9 | icon: {instanceID: 0} 10 | userData: 11 | assetBundleName: 12 | assetBundleVariant: 13 | -------------------------------------------------------------------------------- /Core/CMAA2RenderPass.cs: -------------------------------------------------------------------------------- 1 | #define CMAA2_REQUIRE_UNITY_FIX 2 | 3 | using UnityEngine; 4 | using UnityEngine.Experimental.Rendering; 5 | using UnityEngine.Rendering; 6 | using UnityEngine.Rendering.RenderGraphModule; 7 | using UnityEngine.Rendering.Universal; 8 | 9 | namespace CMAA2.Core 10 | { 11 | public class CMAA2RenderPass : ScriptableRenderPass 12 | { 13 | private const int m_TextureSampleCount = 1; 14 | 15 | private readonly CMAA2Compute _compute; 16 | 17 | public CMAA2RenderPass(ComputeShader cmaa2Compute) 18 | { 19 | _compute = new CMAA2Compute(cmaa2Compute); 20 | } 21 | 22 | private class PassData 23 | { 24 | public CMAA2Compute Compute; 25 | public TextureHandle ActualFrameColor; 26 | 27 | public Vector2Int FrameBufferSize; 28 | public TextureHandle ColorBackBuffer; // RWTexture2D : u0 29 | public TextureHandle WorkingEdges; // RWTexture2D : u1 30 | public SizedBufferHandle WorkingShapeCandidates; // RWStructuredBuffer : u2 31 | public SizedBufferHandle WorkingDeferredBlendLocationList; // RWStructuredBuffer : u3 32 | public BufferHandle 
WorkingDeferredBlendItemList; // RWStructuredBuffer : u4
            public AtomicTextureHandle WorkingDeferredBlendItemListHeads; // [RWTexture2D|RWStructuredBuffer] : u5
            public BufferHandle WorkingControlBuffer; // RWByteAddressBuffer : u6
            public BufferHandle WorkingExecuteIndirectBuffer; // RWByteAddressBuffer : u7
        }

        /// <summary>
        /// Records the CMAA2 pass into the render graph: allocates every transient working
        /// texture/buffer sized from the camera target descriptor, stores the handles in the
        /// pass data and registers <see cref="Render"/> as the execution callback.
        /// </summary>
        /// <param name="renderGraph">Render graph the unsafe pass is added to.</param>
        /// <param name="frameData">Per-frame URP container providing resource and camera data.</param>
        public override void RecordRenderGraph(RenderGraph renderGraph, ContextContainer frameData)
        {
            var resourceData = frameData.Get<UniversalResourceData>();
            var cameraData = frameData.Get<UniversalCameraData>();
            var targetDesc = cameraData.cameraTargetDescriptor;

            var resX = targetDesc.width;
            var resY = targetDesc.height;

            using var builder = renderGraph.AddUnsafePass<PassData>(passName: "CMAA2", passData: out var passData);
            passData.Compute = _compute;

            passData.FrameBufferSize = new Vector2Int(resX, resY);
            passData.ActualFrameColor = resourceData.activeColorTexture;
            // Render() blits from the active color target and, at the end, back into it, so the
            // pass must declare write access as well as read access.
            builder.UseTexture(passData.ActualFrameColor, AccessFlags.ReadWrite);

            // UAV-capable copy of the frame color that the compute kernels operate on.
            var colorBackBufferDesc = new TextureDesc(resX, resY)
            {
                name = "_ColorBackBufferRW",
                format = resourceData.cameraColor.GetDescriptor(renderGraph).colorFormat,
                enableRandomWrite = true,
            };
            passData.ColorBackBuffer = builder.CreateTransientTexture(colorBackBufferDesc);
            builder.UseTexture(passData.ColorBackBuffer, AccessFlags.ReadWrite);

            // create all temporary storage buffers
            {
                // With a single sample the edges texture is half the frame width.
                int edgesResX = resX;
                if (m_TextureSampleCount == 1) edgesResX = (resX + 1) / 2;
                var graphicsFormat = m_TextureSampleCount switch
                {
                    1 or 2 => GraphicsFormat.R8_UInt,
                    4 => GraphicsFormat.R16_UInt,
                    8 => GraphicsFormat.R32_UInt,
                    _ => GraphicsFormat.R8_UInt,
                };
                var uintUAVTextureDesc = new TextureDesc(width: edgesResX, height: resY)
                {
                    format = graphicsFormat,
                    enableRandomWrite = true,
                };
                passData.WorkingEdges = builder.CreateTransientTexture(desc: in uintUAVTextureDesc);
                builder.UseTexture(passData.WorkingEdges, AccessFlags.ReadWrite);

                // One list head per 2x2 pixel quad.
                passData.WorkingDeferredBlendItemListHeads = AtomicTextureHandle.CreateTransientUint(
                    builder,
                    (resX + 1) / 2,
                    (resY + 1) / 2
                );
            }

            // Buffers
            int requiredCandidatePixels = resX * resY / 4 * m_TextureSampleCount;
            int requiredDeferredColorApplyBuffer = resX * resY / 2 * m_TextureSampleCount;
            int requiredListHeadsPixels = (resX * resY + 3) / 6;

            // Create buffer for storing a list of all pixel candidates to process (potential AA shapes, both simple and complex)
            {
                var desc = new BufferDesc(
                    count: requiredCandidatePixels,
                    stride: sizeof(uint),
                    target: GraphicsBuffer.Target.Structured
                );
                passData.WorkingShapeCandidates = new SizedBufferHandle(
                    builder.CreateTransientBuffer(desc: in desc),
                    desc.count
                );
#if CMAA2_REQUIRE_UNITY_FIX
                builder.UseBuffer(passData.WorkingShapeCandidates.Buffer, AccessFlags.ReadWrite);
#endif
            }

            // Create buffer for storing linked list of all output values to blend
            {
                var desc = new BufferDesc(
                    requiredDeferredColorApplyBuffer,
                    sizeof(uint) * 2,
                    GraphicsBuffer.Target.Structured
                );
                passData.WorkingDeferredBlendItemList = builder.CreateTransientBuffer(desc);
#if CMAA2_REQUIRE_UNITY_FIX
                builder.UseBuffer(passData.WorkingDeferredBlendItemList, AccessFlags.ReadWrite);
#endif
            }

            // Create buffer for storing a list of coordinates of linked list heads quads, to allow for combined processing in the last step
            {
                var desc = new BufferDesc(
                    count: requiredListHeadsPixels,
                    stride: sizeof(uint),
                    target: GraphicsBuffer.Target.Structured
                );
                passData.WorkingDeferredBlendLocationList = new SizedBufferHandle(
                    builder.CreateTransientBuffer(desc),
                    desc.count
                );
#if CMAA2_REQUIRE_UNITY_FIX
                builder.UseBuffer(passData.WorkingDeferredBlendLocationList.Buffer, AccessFlags.ReadWrite);
#endif
            }

            // Control buffer (always the same size, doesn't need re-creating but oh well)
            {
                var desc = new BufferDesc(count: 16, stride: sizeof(uint), target: GraphicsBuffer.Target.Raw);
                passData.WorkingControlBuffer = builder.CreateTransientBuffer(desc: in desc);
#if CMAA2_REQUIRE_UNITY_FIX
                builder.UseBuffer(passData.WorkingControlBuffer, AccessFlags.ReadWrite);
#endif
            }

            // Indirect dispatch arguments buffer (always the same size, doesn't need re-creating but oh well)
            {
                var desc = new BufferDesc(
                    count: 4,
                    stride: sizeof(uint),
                    target: GraphicsBuffer.Target.Raw | GraphicsBuffer.Target.IndirectArguments
                );
                passData.WorkingExecuteIndirectBuffer = builder.CreateTransientBuffer(desc: in desc);
#if CMAA2_REQUIRE_UNITY_FIX
                builder.UseBuffer(passData.WorkingExecuteIndirectBuffer, AccessFlags.ReadWrite);
#endif
            }

            // The pass writes back into the camera color, so it must never be culled.
            builder.AllowPassCulling(false);
            builder.SetRenderFunc<PassData>(Render);
        }

        /// <summary>
        /// Executes the recorded CMAA2 pass: copies the frame color into the UAV back buffer,
        /// runs edge detection, prepares indirect arguments, processes shape candidates,
        /// prepares indirect arguments again, applies the deferred blends and finally copies
        /// the result back into the frame color.
        /// </summary>
        /// <param name="data">Handles and settings captured by <see cref="RecordRenderGraph"/>.</param>
        /// <param name="context">Unsafe render-graph context providing the command buffer.</param>
        private static void Render(PassData data, UnsafeGraphContext context)
        {
            var nativeCmd = CommandBufferHelpers.GetNativeCommandBuffer(context.cmd);
            nativeCmd.Blit(data.ActualFrameColor, data.ColorBackBuffer);

            // first pass edge detect
            data.Compute.EdgesColor2x2CS(
                cmd: context.cmd,
                inColorTexture: data.ColorBackBuffer,
                textureResolution: data.FrameBufferSize,
                workingEdges: data.WorkingEdges,
                workingShapeCandidates: data.WorkingShapeCandidates,
                workingDeferredBlendItemListHeads: data.WorkingDeferredBlendItemListHeads,
                workingControlBuffer: data.WorkingControlBuffer);

            // Set up for the first DispatchIndirect
            data.Compute.ComputeDispatchArgsCS(
                cmd: context.cmd,
                threadGroupsX: 2,
                threadGroupsY: 1,
                workingShapeCandidates: data.WorkingShapeCandidates,
                workingDeferredBlendLocationList: data.WorkingDeferredBlendLocationList,
                workingControlBuffer: data.WorkingControlBuffer,
                workingExecuteIndirectBuffer: data.WorkingExecuteIndirectBuffer
            );

            // Process shape candidates DispatchIndirect
            data.Compute.ProcessCandidatesCS(
                cmd: context.cmd,
                workingExecuteDirectBuffer: data.WorkingExecuteIndirectBuffer,
                inColor: data.ColorBackBuffer,
                workingEdges: data.WorkingEdges,
                workingDeferredBlendItemListHeads: data.WorkingDeferredBlendItemListHeads,
                workingControlBuffer: data.WorkingControlBuffer,
                workingDeferredBlendItemList: data.WorkingDeferredBlendItemList,
                workingShapeCandidates: data.WorkingShapeCandidates,
                workingDeferredBlendLocationList: data.WorkingDeferredBlendLocationList
            );

            // Set up for the second DispatchIndirect
            data.Compute.ComputeDispatchArgsCS(
                cmd: context.cmd,
                threadGroupsX: 1,
                threadGroupsY: 2,
                workingShapeCandidates: data.WorkingShapeCandidates,
                workingDeferredBlendLocationList: data.WorkingDeferredBlendLocationList,
                workingControlBuffer: data.WorkingControlBuffer,
                workingExecuteIndirectBuffer: data.WorkingExecuteIndirectBuffer
            );

            // Resolve & apply blended colors
            data.Compute.DeferredColorApply2x2CS(
                context.cmd,
                workingExecuteIndirectBuffer: data.WorkingExecuteIndirectBuffer,
                outColor: data.ColorBackBuffer,
                workingControlBuffer: data.WorkingControlBuffer,
                workingDeferredBlendItemList: data.WorkingDeferredBlendItemList,
                workingDeferredBlendItemListHeads: data.WorkingDeferredBlendItemListHeads,
                workingDeferredBlendLocationList: data.WorkingDeferredBlendLocationList
            );

            nativeCmd.Blit(data.ColorBackBuffer, data.ActualFrameColor);
        }
    }

    public struct SizedBufferHandle
    {
        public Vector4 Dimensions => new
Vector4(Size, 0); 233 | 234 | public readonly int Size; 235 | public readonly BufferHandle Buffer; 236 | 237 | public SizedBufferHandle(BufferHandle bufferHandle, int size) 238 | { 239 | Buffer = bufferHandle; 240 | Size = size; 241 | } 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /Core/CMAA2RenderPass.cs.meta: -------------------------------------------------------------------------------- 1 | fileFormatVersion: 2 2 | guid: 12ba659d27ac490cb07534018ec02d43 3 | timeCreated: 1748371432 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Alexander Malyutin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /LICENSE.meta: -------------------------------------------------------------------------------- 1 | fileFormatVersion: 2 2 | guid: 98c46ce7a67ec410395698cf6180da62 3 | DefaultImporter: 4 | externalObjects: {} 5 | userData: 6 | assetBundleName: 7 | assetBundleVariant: 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CMAA2 for Unity URP 2 | 3 | **Conservative Morphological Anti-Aliasing 2.0 (CMAA2)** ported to the Unity Universal Render Pipeline. 4 | 5 | CMAA2 is a post-process anti-aliasing technique focused on delivering high-quality edge smoothing while preserving the sharpness of the original image. 6 | 7 | Details of the original implementation and performance analysis are available in Intel’s article: 8 | https://www.intel.com/content/dam/develop/external/us/en/documents/conservative-morphological-anti-aliasing.pdf 9 | 10 | | CMAA Off | CMAA On | 11 | |------------------------|-----------------------| 12 | | ![cmaa-2-disabled-out] | ![cmaa-2-enabled-out] | 13 | 14 | ## Installation 15 | 16 | TODO: 17 | 18 | ## Acknowledgements 19 | 20 | This project includes a modified version of [`CMAA2.hlsl`](https://github.com/GameTechDev/CMAA2/blob/master/Projects/CMAA2/CMAA2/CMAA2.hlsl) from Intel’s [GameTechDev/CMAA2](https://github.com/GameTechDev/CMAA2) project. 21 | The original code is licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0). 22 | See [`THIRD_PARTY_LICENSES/CMAA2-LICENSE`](THIRD_PARTY_LICENSES/CMAA2-LICENSE) for details. 
23 | 24 | License 25 | ------- 26 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 27 | 28 | 29 | [cmaa-2-disabled-out]: https://github.com/user-attachments/assets/68805e27-e569-4da8-86ff-60912f0709b0 30 | [cmaa-2-enabled-out]: https://github.com/user-attachments/assets/b41fb60c-af01-4f1f-83de-4f68b342e8cc 31 | -------------------------------------------------------------------------------- /README.md.meta: -------------------------------------------------------------------------------- 1 | fileFormatVersion: 2 2 | guid: 28fcd06bdc08e4882a16fef24fe44c35 3 | TextScriptImporter: 4 | externalObjects: {} 5 | userData: 6 | assetBundleName: 7 | assetBundleVariant: 8 | -------------------------------------------------------------------------------- /THIRD_PARTY_LICENSES.meta: -------------------------------------------------------------------------------- 1 | fileFormatVersion: 2 2 | guid: c4eba531045348f3b58542f8aaf063a0 3 | timeCreated: 1748700773 -------------------------------------------------------------------------------- /THIRD_PARTY_LICENSES/CMAA2-LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, and 12 | distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 15 | owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all other entities 18 | that control, are controlled by, or are under common control with that entity.
19 | For the purposes of this definition, "control" means (i) the power, direct or 20 | indirect, to cause the direction or management of such entity, whether by 21 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity exercising 25 | permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, including 28 | but not limited to software source code, documentation source, and configuration 29 | files. 30 | 31 | "Object" form shall mean any form resulting from mechanical transformation or 32 | translation of a Source form, including but not limited to compiled object code, 33 | generated documentation, and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or Object form, made 36 | available under the License, as indicated by a copyright notice that is included 37 | in or attached to the work (an example is provided in the Appendix below). 38 | 39 | "Derivative Works" shall mean any work, whether in Source or Object form, that 40 | is based on (or derived from) the Work and for which the editorial revisions, 41 | annotations, elaborations, or other modifications represent, as a whole, an 42 | original work of authorship. For the purposes of this License, Derivative Works 43 | shall not include works that remain separable from, or merely link (or bind by 44 | name) to the interfaces of, the Work and Derivative Works thereof. 
45 | 46 | "Contribution" shall mean any work of authorship, including the original version 47 | of the Work and any modifications or additions to that Work or Derivative Works 48 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 49 | by the copyright owner or by an individual or Legal Entity authorized to submit 50 | on behalf of the copyright owner. For the purposes of this definition, 51 | "submitted" means any form of electronic, verbal, or written communication sent 52 | to the Licensor or its representatives, including but not limited to 53 | communication on electronic mailing lists, source code control systems, and 54 | issue tracking systems that are managed by, or on behalf of, the Licensor for 55 | the purpose of discussing and improving the Work, but excluding communication 56 | that is conspicuously marked or otherwise designated in writing by the copyright 57 | owner as "Not a Contribution." 58 | 59 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 60 | of whom a Contribution has been received by Licensor and subsequently 61 | incorporated within the Work. 62 | 63 | 2. Grant of Copyright License. Subject to the terms and conditions of this 64 | License, each Contributor hereby grants to You a perpetual, worldwide, 65 | non-exclusive, no-charge, royalty-free, irrevocable copyright license to 66 | reproduce, prepare Derivative Works of, publicly display, publicly perform, 67 | sublicense, and distribute the Work and such Derivative Works in Source or 68 | Object form. 69 | 70 | 3. Grant of Patent License. 
Subject to the terms and conditions of this License, 71 | each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, 72 | no-charge, royalty-free, irrevocable (except as stated in this section) patent 73 | license to make, have made, use, offer to sell, sell, import, and otherwise 74 | transfer the Work, where such license applies only to those patent claims 75 | licensable by such Contributor that are necessarily infringed by their 76 | Contribution(s) alone or by combination of their Contribution(s) with the Work 77 | to which such Contribution(s) was submitted. If You institute patent litigation 78 | against any entity (including a cross-claim or counterclaim in a lawsuit) 79 | alleging that the Work or a Contribution incorporated within the Work 80 | constitutes direct or contributory patent infringement, then any patent licenses 81 | granted to You under this License for that Work shall terminate as of the date 82 | such litigation is filed. 83 | 84 | 4. Redistribution. 
You may reproduce and distribute copies of the Work or 85 | Derivative Works thereof in any medium, with or without modifications, and in 86 | Source or Object form, provided that You meet the following conditions: 87 | You must give any other recipients of the Work or Derivative Works a copy of 88 | this License; and 89 | 90 | 91 | You must cause any modified files to carry prominent notices stating that You 92 | changed the files; and 93 | 94 | 95 | You must retain, in the Source form of any Derivative Works that You 96 | distribute, all copyright, patent, trademark, and attribution notices from the 97 | Source form of the Work, excluding those notices that do not pertain to any 98 | part of the Derivative Works; and 99 | 100 | 101 | If the Work includes a "NOTICE" text file as part of its distribution, then 102 | any Derivative Works that You distribute must include a readable copy of the 103 | attribution notices contained within such NOTICE file, excluding those notices 104 | that do not pertain to any part of the Derivative Works, in at least one of 105 | the following places: within a NOTICE text file distributed as part of the 106 | Derivative Works; within the Source form or documentation, if provided along 107 | with the Derivative Works; or, within a display generated by the Derivative 108 | Works, if and wherever such third-party notices normally appear. The contents 109 | of the NOTICE file are for informational purposes only and do not modify the 110 | License. You may add Your own attribution notices within Derivative Works that 111 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 112 | provided that such additional attribution notices cannot be construed as 113 | modifying the License. 
114 | You may add Your own copyright statement to Your modifications and may provide 115 | additional or different license terms and conditions for use, reproduction, or 116 | distribution of Your modifications, or for any such Derivative Works as a whole, 117 | provided Your use, reproduction, and distribution of the Work otherwise complies 118 | with the conditions stated in this License. 119 | 120 | 5. Submission of Contributions. Unless You explicitly state otherwise, any 121 | Contribution intentionally submitted for inclusion in the Work by You to the 122 | Licensor shall be under the terms and conditions of this License, without any 123 | additional terms or conditions. Notwithstanding the above, nothing herein shall 124 | supersede or modify the terms of any separate license agreement you may have 125 | executed with Licensor regarding such Contributions. 126 | 127 | 6. Trademarks. This License does not grant permission to use the trade names, 128 | trademarks, service marks, or product names of the Licensor, except as required 129 | for reasonable and customary use in describing the origin of the Work and 130 | reproducing the content of the NOTICE file. 131 | 132 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in 133 | writing, Licensor provides the Work (and each Contributor provides its 134 | Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 135 | KIND, either express or implied, including, without limitation, any warranties 136 | or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 137 | PARTICULAR PURPOSE. You are solely responsible for determining the 138 | appropriateness of using or redistributing the Work and assume any risks 139 | associated with Your exercise of permissions under this License. 140 | 141 | 8. Limitation of Liability. 
In no event and under no legal theory, whether in 142 | tort (including negligence), contract, or otherwise, unless required by 143 | applicable law (such as deliberate and grossly negligent acts) or agreed to in 144 | writing, shall any Contributor be liable to You for damages, including any 145 | direct, indirect, special, incidental, or consequential damages of any character 146 | arising as a result of this License or out of the use or inability to use the 147 | Work (including but not limited to damages for loss of goodwill, work stoppage, 148 | computer failure or malfunction, or any and all other commercial damages or 149 | losses), even if such Contributor has been advised of the possibility of such 150 | damages. 151 | 152 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or 153 | Derivative Works thereof, You may choose to offer, and charge a fee for, 154 | acceptance of support, warranty, indemnity, or other liability obligations 155 | and/or rights consistent with this License. However, in accepting such 156 | obligations, You may act only on Your own behalf and on Your sole 157 | responsibility, not on behalf of any other Contributor, and only if You agree to 158 | indemnify, defend, and hold each Contributor harmless for any liability incurred 159 | by, or claims asserted against, such Contributor by reason of your accepting any 160 | such warranty or additional liability. 161 | 162 | END OF TERMS AND CONDITIONS 163 | 164 | APPENDIX: How to apply the Apache License to your work 165 | 166 | To apply the Apache License to your work, attach the following boilerplate 167 | notice, with the fields enclosed by brackets "[]" replaced with your own 168 | identifying information. (Don't include the brackets!) The text should be 169 | enclosed in the appropriate comment syntax for the file format. 
We also 170 | recommend that a file or class name and description of purpose be included on 171 | the same "printed page" as the copyright notice for easier identification within 172 | third-party archives. 173 | 174 | Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, 175 | Version 2.0 (the "License"); you may not use this file except in compliance with 176 | the License. You may obtain a copy of the License at 177 | http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or 178 | agreed to in writing, software distributed under the License is distributed on 179 | an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 180 | or implied. See the License for the specific language governing permissions and 181 | limitations under the License. -------------------------------------------------------------------------------- /THIRD_PARTY_LICENSES/CMAA2-LICENSE.meta: -------------------------------------------------------------------------------- 1 | fileFormatVersion: 2 2 | guid: cd23281e4ddd46c7939ee7c2e6b240ea 3 | timeCreated: 1748700785 -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "com.alexmalyutindev.cmaa2", 3 | "version": "1.0.0", 4 | "displayName": "CMAA2", 5 | "dependencies": { 6 | "com.unity.render-pipelines.universal": "17.1.0" 7 | } 8 | } -------------------------------------------------------------------------------- /package.json.meta: -------------------------------------------------------------------------------- 1 | fileFormatVersion: 2 2 | guid: cdd8eac26f90743089384df9517aa03f 3 | PackageManifestImporter: 4 | externalObjects: {} 5 | userData: 6 | assetBundleName: 7 | assetBundleVariant: 8 | --------------------------------------------------------------------------------