├── AccessFloat4x4InShader.shader ├── AccessMatrix4x4InCSharp.cs ├── PowerVR cycle count document.shader └── README.md /AccessFloat4x4InShader.shader: -------------------------------------------------------------------------------- 1 | //example .shader to show different ways to access hlsl float4x4's component 2 | Shader "AccessFloat4x4InShader/ExampleCode" 3 | { 4 | SubShader 5 | { 6 | Pass 7 | { 8 | CGPROGRAM 9 | #pragma vertex vert 10 | #pragma fragment frag 11 | 12 | #include "UnityCG.cginc" 13 | 14 | struct appdata 15 | { 16 | float4 vertex : POSITION; 17 | }; 18 | 19 | struct v2f 20 | { 21 | float4 vertex : SV_POSITION; 22 | }; 23 | 24 | float4x4 MY_MATRIX_M; //set by AccessMatrix4x4InCSharp.cs 25 | 26 | v2f vert (appdata v) 27 | { 28 | v2f o; 29 | 30 | //https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-per-component-math 31 | //hlsl float4x4 access method 32 | /* (C0) (C1) (C2) (C3) 33 | * _m00, _m01, _m02, _m03 (Row0) 34 | * _m10, _m11, _m12, _m13 (Row1) 35 | * _m20, _m21, _m22, _m23 (Row2) 36 | * _m30, _m31, _m32, _m33 (Row3) 37 | */ 38 | //example matrix (Scale & Translate) 39 | /* (C0) (C1) (C2) (C3) 40 | * Sx , _m01, _m02, Tx (Row0) 41 | * _m10, Sy , _m12, Ty (Row1) 42 | * _m20, _m21, Sz , Tz (Row2) 43 | * _m30, _m31, _m32, _m33 (Row3) 44 | */ 45 | 46 | ///////////////////////////////////////////////////////////////////// 47 | //ways to extract renderer's transform.position from a M matrix 48 | ///////////////////////////////////////////////////////////////////// 49 | float3 translationWS; 50 | translationWS = float3(MY_MATRIX_M._m03, MY_MATRIX_M._m13, MY_MATRIX_M._m23); //(method1) extract position from float4x4 51 | translationWS = float3(MY_MATRIX_M[0][3], MY_MATRIX_M[1][3], MY_MATRIX_M[2][3]); //(method2) extract position from float4x4 52 | translationWS = float3(MY_MATRIX_M[0].w, MY_MATRIX_M[1].w, MY_MATRIX_M[2].w); //(method3) extract position from float4x4 53 | 
///////////////////////////////////////////////////////////////////// 54 | //ways to extract transform.lossyScale from a M matrix (scale = length of each column of the upper 3x3, which holds for M = T*R*S) 55 | ///////////////////////////////////////////////////////////////////// 56 | float3 scaleWS; 57 | scaleWS.x = length(float3(MY_MATRIX_M[0].x, MY_MATRIX_M[1].x, MY_MATRIX_M[2].x)); 58 | scaleWS.y = length(float3(MY_MATRIX_M[0].y, MY_MATRIX_M[1].y, MY_MATRIX_M[2].y)); 59 | scaleWS.z = length(float3(MY_MATRIX_M[0].z, MY_MATRIX_M[1].z, MY_MATRIX_M[2].z)); 60 | 61 | //////////////////////////////////////////////// 62 | //build T and IV_T from scratch 63 | float4x4 T = (float4x4)0; 64 | T._m00 = 1; 65 | T._m11 = 1; 66 | T._m22 = 1; 67 | T._m33 = 1; 68 | T._m03 = translationWS.x; 69 | T._m13 = translationWS.y; 70 | T._m23 = translationWS.z; 71 | float4x4 IV_T = T; 72 | IV_T._m03 = -IV_T._m03; 73 | IV_T._m13 = -IV_T._m13; 74 | IV_T._m23 = -IV_T._m23; 75 | //build S and IV_S from scratch 76 | float4x4 S = (float4x4)0; 77 | S._m00 = scaleWS.x; 78 | S._m11 = scaleWS.y; 79 | S._m22 = scaleWS.z; 80 | S._m33 = 1; 81 | float4x4 IV_S = S; 82 | IV_S._m00 = 1.0/ IV_S._m00; 83 | IV_S._m11 = 1.0/ IV_S._m11; 84 | IV_S._m22 = 1.0/ IV_S._m22; 85 | 86 | //build R using T & S: M = T*R*S, so T must be removed on the left and S on the right (removing S on the left only works for uniform scale) 87 | float4x4 R = mul(mul(IV_T,MY_MATRIX_M),IV_S); //first remove T, then remove S 88 | 89 | //rebuild M (T*R*S), same order as AccessMatrix4x4InCSharp.cs 90 | float4x4 M = mul(T,mul(R,S)); //in shader, can't do matrix mul using R * S, use mul(R,S) !!! 
91 | 92 | //apply M 93 | v.vertex = mul(M, float4(v.vertex.xyz, 1)); 94 | 95 | //complete VP as usual 96 | o.vertex = UnityWorldToClipPos(v.vertex); 97 | return o; 98 | } 99 | 100 | fixed4 frag (v2f i) : SV_Target 101 | { 102 | return 1; 103 | } 104 | ENDCG 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /AccessMatrix4x4InCSharp.cs: -------------------------------------------------------------------------------- 1 | //example C# script to show different ways to access Matrix4x4's component 2 | using UnityEngine; 3 | 4 | public class AccessMatrix4x4InCSharp : MonoBehaviour 5 | { 6 | void Update() 7 | { 8 | //unity Matrix4x4 access method in C# 9 | /* (C0) (C1) (C2) (C3) 10 | * m00, m01, m02, m03 (Row0) 11 | * m10, m11, m12, m13 (Row1) 12 | * m20, m21, m22, m23 (Row2) 13 | * m30, m31, m32, m33 (Row3) 14 | */ 15 | //example matrix (Scale & Translate) 16 | /* (C0) (C1) (C2) (C3) 17 | * Sx , m01, m02, Tx (Row0) 18 | * m10, Sy , m12, Ty (Row1) 19 | * m20, m21, Sz , Tz (Row2) 20 | * m30, m31, m32, m33 (Row3) 21 | */ 22 | 23 | Matrix4x4 m = transform.localToWorldMatrix; 24 | 25 | ///////////////////////////////////////////////////////////////////// 26 | //ways to extract transform.position from a M matrix 27 | ///////////////////////////////////////////////////////////////////// 28 | Vector3 posWS; 29 | posWS = new Vector3(m.m03, m.m13, m.m23); //(method 1) extract correct translation from matrix 30 | posWS = m.GetColumn(3); //(method 2) extract correct translation from matrix 31 | posWS = new Vector3(m.GetRow(0).w, m.GetRow(1).w, m.GetRow(2).w); //(method 3) extract correct translation from matrix 32 | posWS = new Vector3(m[0, 3], m[1, 3], m[2, 3]); //(method 4) extract correct translation from matrix 33 | Debug.Log($"transform.position = {posWS}"); 34 | 35 | ///////////////////////////////////////////////////////////////////// 36 | //ways to extract transform.lossyScale from a M matrix 37 | 
///////////////////////////////////////////////////////////////////// 38 | Vector3 scaleWS; 39 | //(wrong method) can get correct scale ONLY if rotation is all 0 40 | scaleWS = new Vector3(m.m00, m.m11, m.m22); 41 | //(right method) can get correct scale no matter what rotation is, because each column of a rotation matrix has length 1 => sqrt(cos^2+sin^2+0) must equal 1 42 | scaleWS = new Vector3(m.GetColumn(0).magnitude, m.GetColumn(1).magnitude, m.GetColumn(2).magnitude); 43 | Debug.Log($"transform.lossyScale = {scaleWS}"); 44 | 45 | ///////////////////////////////////////////////////////////////////// 46 | //ways to extract transform.rotation from a M matrix 47 | ///////////////////////////////////////////////////////////////////// 48 | Matrix4x4 R; 49 | 50 | //(method 1) get rotation matrix 51 | Quaternion r = m.rotation; 52 | R = Matrix4x4.Rotate(r); 53 | 54 | //(method 2) get rotation matrix 55 | //first remove scale (m = T * R * S, so the inverse scale goes on the right) 56 | Matrix4x4 INV_S = Matrix4x4.identity; 57 | INV_S.m00 = 1f / scaleWS.x; 58 | INV_S.m11 = 1f / scaleWS.y; 59 | INV_S.m22 = 1f / scaleWS.z; 60 | R = m * INV_S; 61 | //then remove position 62 | R.m03 = 0; 63 | R.m13 = 0; 64 | R.m23 = 0; 65 | //finally, at this line only the rotation remains in the matrix 66 | 67 | //////////////////////////////////////////////// 68 | //build T from scratch 69 | Matrix4x4 T = Matrix4x4.identity; 70 | T.m03 = posWS.x; 71 | T.m13 = posWS.y; 72 | T.m23 = posWS.z; 73 | //build S from scratch 74 | Matrix4x4 S = Matrix4x4.identity; 75 | S.m00 = scaleWS.x; 76 | S.m11 = scaleWS.y; 77 | S.m22 = scaleWS.z; 78 | 79 | Matrix4x4 MY_MATRIX_M = T * R * S; 80 | Renderer targetRenderer = GetComponent<Renderer>(); //look the component up once; does nothing when no Renderer is attached 81 | if(targetRenderer) targetRenderer.material.SetMatrix("MY_MATRIX_M", MY_MATRIX_M); //it is the same as UNITY_MATRIX_M in shader 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /PowerVR cycle count document.shader: -------------------------------------------------------------------------------- 1 | // 
[PowerVR cycle count document] 2 | 3 | // Filename : PowerVR Low level GLSL Optimization 4 | // Version : PowerVR SDK REL_17.1@4658063a External Issue 5 | // Issue Date : 07 Apr 2017 6 | // Author : Marton Tamas 7 | // Full version PDF: http://cdn.imgtec.com/sdk-documentation/PowerVR+Low+level+GLSL+Optimization.pdf?fbclid=IwAR08O5o4pAJcgCGbB4nMf13vG-OeAm7xGkwzmNDSqIUBg_21w6JcYt0q3MY 8 | // This document describes ways to optimize GLSL code for PowerVR Series 6 architecture. 9 | 10 | // Based on: 11 | // http://www.humus.name/Articles/Persson_LowLevelThinking.pdf 12 | // http://www.humus.name/Articles/Persson_LowlevelShaderOptimization.pdf 13 | 14 | ////////////////////////////////////////////////// 15 | // 2. Low level optimizations 16 | ////////////////////////////////////////////////// 17 | 18 | // 2.1. PowerVR Series 6 USC diagram 19 | // Generally shader performance on PowerVR Series 6 architecture GPUs depends on the number of 20 | // cycles it takes to execute a shader. 21 | 22 | // 2.2 Writing expressions in MAD form 23 | fragColor.x = (t.x + t.y) * (t.x - t.y); //2 cycles 24 | fragColor.x = t.x * t.x + (-t.y * t.y); //1 cycle 25 | 26 | // 2.3 Division 27 | fragColor.x = (t.x * t.y + t.z) / t.x; //3 cycles 28 | fragColor.x = t.y + t.z * (1.0 / t.x); //2 cycles 29 | 30 | // 2.4 Sign 31 | fragColor.x = sign(t.x) * t.y; //3 cycles 32 | fragColor.x = (t.x >= 0.0 ? 1.0 : -1.0) * t.y; //2 cycles, so if case (t.x == 0) is not needed it is better to use conditional form instead of sign(). 33 | 34 | // 2.5 Rcp/rsqrt/sqrt 35 | fragColor.x = 1.0 / t.x; //1 cycle 36 | fragColor.x = inversesqrt(t.x); //1 cycle 37 | fragColor.x = sqrt(t.x); //2 cycles, sqrt() on the other hand is implemented as: 1 / (1/sqrt(x)), Which results in a 2 cycle cost. 38 | 39 | fragColor.x = t.x * inversesqrt(t.x); //2 cycles 40 | 41 | fragColor.x = sqrt(t.x) > 0.5 ? 0.5 : 1.0; //3 cycles 42 | fragColor.x = (t.x * inversesqrt(t.x)) > 0.5 ? 
0.5 : 1.0; //2 cycles, in this case the test instructions can fit into the second instruction. 43 | 44 | // 2.6 Abs/Neg/Saturate 45 | fragColor.x = abs(t.x * t.y); //2 cycles 46 | fragColor.x = abs(t.x) * abs(t.y); //1 cycle 47 | 48 | fragColor.x = -dot(t.xyz, t.yzx); //3 cycles 49 | fragColor.x = dot(-t.xyz, t.yzx); //2 cycles 50 | 51 | fragColor.x = 1.0 - clamp(t.x, 0.0, 1.0); //2 cycles 52 | fragColor.x = clamp(1.0 - t.x, 0.0, 1.0); //1 cycle 53 | 54 | fragColor.x = min(dot(t, t), 1.0) > 0.5 ? t.x : t.y; //5 cycles 55 | fragColor.x = clamp(dot(t, t), 0.0, 1.0) > 0.5 ? t.x : t.y; //4 cycles 56 | 57 | // normalize() is decomposed into: 58 | vec3 normalize( vec3 v ) 59 | { 60 | return v * inversesqrt( dot( v, v ) ); 61 | } 62 | 63 | fragColor.xyz = normalize(-t.xyz); //7 cycles 64 | fragColor.xyz = -normalize(t.xyz); //6 cycles 65 | 66 | ////////////////////////////////////////////////// 67 | // 3. Transcendental functions 68 | ////////////////////////////////////////////////// 69 | 70 | // 3.1. Exp/Log 71 | fragColor.x = exp2(t.x); //1 cycle 72 | fragColor.x = log2(t.x); //1 cycle 73 | 74 | // Exp is implemented as (1.442695 = log2(e)): 75 | float exp( float x ) 76 | { 77 | return exp2(x * 1.442695); //2 cycles 78 | } 79 | // Log is implemented as (0.693147 = ln(2)): 80 | float log( float x ) 81 | { 82 | return log2(x) * 0.693147; //2 cycles 83 | } 84 | // Pow(x, y) is implemented as: 85 | float pow( float x, float y ) 86 | { 87 | return exp2(log2(x) * y); //3 cycles 88 | } 89 | 90 | // 3.2. Sin/Cos/Sinh/Cosh 91 | fragColor.x = sin(t.x); //4 cycles 92 | fragColor.x = cos(t.x); //4 cycles 93 | 94 | fragColor.x = cosh(t.x); //3 cycles 95 | fragColor.x = sinh(t.x); //3 cycles 96 | 97 | // 3.3. Asin/Acos/Atan /Degrees/Radians 98 | fragColor.x = asin(t.x); //67 cycles (VERY high cost!) 99 | fragColor.x = acos(t.x); //79 cycles (VERY high cost!) 100 | fragColor.x = atan(t.x); //12 cycles (lots of conditionals), Atan is still costly, but it could be used if needed. 
101 | 102 | fragColor.x = degrees(t.x); //1 cycle 103 | fragColor.x = radians(t.x); //1 cycle 104 | 105 | ////////////////////////////////////////////////// 106 | // 4. Intrinsic functions 107 | ////////////////////////////////////////////////// 108 | 109 | // 4.1. Vector*Matrix 110 | fragColor = t * m1; //4x4 matrix, 8 cycles 111 | fragColor.xyz = t.xyz * m2; //3x3 matrix, 4 cycles 112 | 113 | // 4.2. Mixed Scalar/Vector math 114 | fragColor.x = length(t-v); 115 | fragColor.y = distance(v, t); // total of 7 cycles 116 | fragColor.x = length(t-v); 117 | fragColor.y = distance(t, v); // total of 9 cycles 118 | 119 | fragColor.xyz = normalize(t.xyz); //6 cycles 120 | fragColor.xyz = inversesqrt(dot(t.xyz, t.xyz)) * t.xyz; //5 cycles 121 | 122 | fragColor.xyz = 50.0 * normalize(t.xyz); //7 cycles 123 | fragColor.xyz = (50.0 * inversesqrt(dot(t.xyz, t.xyz))) * t.xyz; //6 cycles 124 | 125 | // Cross() can be expanded to: 126 | vec3 cross( vec3 a, vec3 b ) 127 | { 128 | return vec3( a.y * b.z - b.y * a.z, 129 | a.z * b.x - b.z * a.x, 130 | a.x * b.y - b.x * a.y ); 131 | } 132 | // Distance can be expanded to: 133 | float distance( vec3 a, vec3 b ) 134 | { 135 | vec3 tmp = a - b; 136 | return sqrt(dot(tmp, tmp)); 137 | } 138 | // Dot can be expanded to: 139 | float dot( vec3 a, vec3 b ) 140 | { 141 | return a.x * b.x + a.y * b.y + a.z * b.z; 142 | } 143 | // Faceforward can be expanded to: 144 | vec3 faceforward( vec3 n, vec3 I, vec3 Nref ) 145 | { 146 | if( dot(Nref, I) < 0.0 ) 147 | { 148 | return n; 149 | } 150 | else 151 | { 152 | return -n; 153 | } 154 | } 155 | // Length can be expanded to: 156 | float length( vec3 v ) 157 | { 158 | return sqrt(dot(v, v)); 159 | } 160 | // Normalize can be expanded to: 161 | vec3 normalize( vec3 v ) 162 | { 163 | return v / sqrt(dot(v, v)); 164 | } 165 | // Reflect can be expanded to (note: the GLSL built-in takes its arguments as (I, N)): 166 | vec3 reflect( vec3 N, vec3 I ) 167 | { 168 | return I - 2.0 * dot(N, I) * N; 169 | } 170 | // Refract can be expanded to (note: the GLSL built-in takes its arguments as (I, N, eta)): 171 | vec3 refract( vec3 
n, vec3 I, float eta ) 172 | { 173 | float k = 1.0 - eta * eta * (1.0 - dot(n, I) * dot(n, I)); 174 | if (k < 0.0) 175 | return vec3(0.0); 176 | else 177 | return eta * I - (eta * dot(n, I) + sqrt(k)) * n; 178 | } 179 | 180 | // 4.3. Operation grouping 181 | fragColor.xyz = t.xyz * t.x * t.y * t.wzx * t.z * t.w; //7 cycles 182 | fragColor.xyz = (t.x * t.y * t.z * t.w) * (t.xyz * t.wzx); //4 cycles 183 | 184 | ////////////////////////////////////////////////// 185 | // 5. FP16 overview 186 | ////////////////////////////////////////////////// 187 | 188 | // 5.3. Exploiting the SOP/MAD FP16 pipeline 189 | // After applying all this knowledge, we can show off the power of this pipeline by using everything in one cycle: 190 | // All in 1 cycle 191 | mediump vec4 fp16 = t; 192 | highp vec4 res; 193 | res.x = clamp(min(-fp16.y * abs(fp16.z), clamp(fp16.w, 0.0, 1.0) * abs(fp16.x)), 0.0, 1.0); 194 | res.y = clamp(abs(fp16.w) * -fp16.z + clamp(fp16.x, 0.0, 1.0), 0.0, 1.0); 195 | fragColor = res; 196 | {sop, sop} 197 | 198 | 199 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Shader assembly / low level optimization 2 | - http://www.humus.name/Articles/Persson_LowLevelThinking.pdf 3 | - http://www.humus.name/Articles/Persson_LowlevelShaderOptimization.pdf 4 | 5 | # PowerVR shader cycle count 6 | - http://cdn.imgtec.com/sdk-documentation/PowerVR+Low+level+GLSL+Optimization.pdf?fbclid=IwAR08O5o4pAJcgCGbB4nMf13vG-OeAm7xGkwzmNDSqIUBg_21w6JcYt0q3MY 7 | - https://github.com/ColinLeung-NiloCat/ShaderNotes/blob/8d095546bb61cb26edbe19122cb1028fa49f2e3a/PowerVR%20cycle%20count%20document.shader 8 | 9 | # From ObjectSpace to WindowSpace all transform realtime demo 10 | - http://www.realtimerendering.com/udacity/transforms.html 11 | 12 | # Unity C#/shader Matrix 13 | - 
https://answers.unity.com/questions/1359718/what-do-the-values-in-the-matrix4x4-for-cameraproj.html?childToView=1359877#answer-1359877 14 | - https://forum.unity.com/threads/can-i-get-the-scale-in-the-transform-of-the-object-i-attach-a-shader-to-if-so-how.418345/ 15 | - https://answers.unity.com/questions/1435216/are-these-rotation-matrices-right.html 16 | --------------------------------------------------------------------------------