├── AccessFloat4x4InShader.shader
├── AccessMatrix4x4InCSharp.cs
├── PowerVR cycle count document.shader
└── README.md


/AccessFloat4x4InShader.shader:
--------------------------------------------------------------------------------
  1 | ﻿//example .shader to show different ways to access hlsl float4x4's component
  2 | Shader "AccessFloat4x4InShader/ExampleCode"
  3 | {
  4 |     SubShader
  5 |     {
  6 |         Pass
  7 |         {
  8 |             CGPROGRAM
  9 |             #pragma vertex vert
 10 |             #pragma fragment frag
 11 | 
 12 |             #include "UnityCG.cginc"
 13 | 
 14 |             struct appdata //per-vertex input consumed by vert()
 15 |             {
 16 |                 float4 vertex : POSITION; //vertex position, bound to the POSITION semantic
 17 |             };
 18 | 
 19 |             struct v2f //output of vert(), input of frag()
 20 |             {
 21 |                 float4 vertex : SV_POSITION; //position produced by UnityWorldToClipPos() in vert()
 22 |             };
 23 | 
 24 |             float4x4 MY_MATRIX_M; //set by AccessMatrix4x4InCSharp.cs
 25 | 
 26 |             v2f vert (appdata v)
 27 |             {
 28 |                 v2f o;
 29 |                 //demo: decompose MY_MATRIX_M (= T*R*S) into T, R and S, rebuild it, then use it as the model matrix
 30 |                 //https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-per-component-math
 31 |                 //hlsl float4x4 access method
 32 |                 /* (C0)  (C1)  (C2)  (C3)
 33 |                  * _m00, _m01, _m02, _m03 (Row0)
 34 |                  * _m10, _m11, _m12, _m13 (Row1)
 35 |                  * _m20, _m21, _m22, _m23 (Row2)
 36 |                  * _m30, _m31, _m32, _m33 (Row3)
 37 |                 */
 38 |                 //example matrix (Scale & Translate)
 39 |                 /* (C0)  (C1)  (C2)  (C3)
 40 |                  *  Sx , _m01, _m02,  Tx  (Row0)
 41 |                  * _m10,  Sy , _m12,  Ty  (Row1)
 42 |                  * _m20, _m21,  Sz ,  Tz  (Row2)
 43 |                  * _m30, _m31, _m32, _m33 (Row3)
 44 |                 */
 45 | 
 46 |                 /////////////////////////////////////////////////////////////////////
 47 |                 //ways to extract renderer's transform.position from a M matrix (all three read column 3 of rows 0..2)
 48 |                 /////////////////////////////////////////////////////////////////////
 49 |                 float3 translationWS;
 50 |                 translationWS = float3(MY_MATRIX_M._m03, MY_MATRIX_M._m13, MY_MATRIX_M._m23);    //(method1) extract position from float4x4
 51 |                 translationWS = float3(MY_MATRIX_M[0][3], MY_MATRIX_M[1][3], MY_MATRIX_M[2][3]); //(method2) extract position from float4x4
 52 |                 translationWS = float3(MY_MATRIX_M[0].w, MY_MATRIX_M[1].w, MY_MATRIX_M[2].w);    //(method3) extract position from float4x4
 53 |                 /////////////////////////////////////////////////////////////////////
 54 |                 //ways to extract transform.lossyScale from a M matrix (length of columns 0..2 of the upper-left 3x3)
 55 |                 /////////////////////////////////////////////////////////////////////
 56 |                 float3 scaleWS;
 57 |                 scaleWS.x = length(float3(MY_MATRIX_M[0].x, MY_MATRIX_M[1].x, MY_MATRIX_M[2].x));
 58 |                 scaleWS.y = length(float3(MY_MATRIX_M[0].y, MY_MATRIX_M[1].y, MY_MATRIX_M[2].y));
 59 |                 scaleWS.z = length(float3(MY_MATRIX_M[0].z, MY_MATRIX_M[1].z, MY_MATRIX_M[2].z));
 60 | 
 61 |                 ////////////////////////////////////////////////
 62 |                 //build T and IV_T (inverse of T) from scratch
 63 |                 float4x4 T = (float4x4)0;
 64 |                 T._m00 = 1;
 65 |                 T._m11 = 1;
 66 |                 T._m22 = 1;
 67 |                 T._m33 = 1;
 68 |                 T._m03 = translationWS.x;
 69 |                 T._m13 = translationWS.y;
 70 |                 T._m23 = translationWS.z;
 71 |                 float4x4 IV_T = T; //inverse of a pure translation = same matrix with negated translation
 72 |                 IV_T._m03 = -IV_T._m03;
 73 |                 IV_T._m13 = -IV_T._m13;
 74 |                 IV_T._m23 = -IV_T._m23;
 75 |                 //build S and IV_S (inverse of S) from scratch
 76 |                 float4x4 S = (float4x4)0;
 77 |                 S._m00 = scaleWS.x;
 78 |                 S._m11 = scaleWS.y;
 79 |                 S._m22 = scaleWS.z;
 80 |                 S._m33 = 1;
 81 |                 float4x4 IV_S = S; //inverse of a pure scale = reciprocal of each diagonal entry (a zero scale axis divides by zero)
 82 |                 IV_S._m00 = 1.0/ IV_S._m00;
 83 |                 IV_S._m11 = 1.0/ IV_S._m11;
 84 |                 IV_S._m22 = 1.0/ IV_S._m22;
 85 | 
 86 |                 //build R using T & S: M = T*R*S, so R = IV_T * M * IV_S
 87 |                 float4x4 R = mul(mul(IV_T, MY_MATRIX_M), IV_S); //first remove T (from the left), then remove S (from the right)
 88 | 
 89 |                 //rebuild M (T*R*S), same order as the C# side
 90 |                 float4x4 M = mul(T, mul(R, S)); //in shader, R * S is a per-component multiply, use mul(R,S) for a matrix mul !!!
 91 |                 
 92 |                 //apply M
 93 |                 v.vertex = mul(M, float4(v.vertex.xyz, 1));
 94 | 
 95 |                 //complete VP as usual
 96 |                 o.vertex = UnityWorldToClipPos(v.vertex.xyz); //takes a float3 world position, pass .xyz explicitly to avoid implicit truncation
 97 |                 return o;
 98 |             }
 99 | 
100 |             fixed4 frag (v2f i) : SV_Target
101 |             {
102 |                 return fixed4(1, 1, 1, 1); //every fragment outputs 1 in all four channels
103 |             }
104 |             ENDCG
105 |         }
106 |     }
107 | }
108 | 


--------------------------------------------------------------------------------
/AccessMatrix4x4InCSharp.cs:
--------------------------------------------------------------------------------
 1 | ﻿//example C# script to show different ways to access Matrix4x4's component
 2 | using UnityEngine;
 3 | 
 4 | public class AccessMatrix4x4InCSharp : MonoBehaviour //demo: splits localToWorldMatrix into T/R/S, rebuilds it and sends it to the material
 5 | {
 6 |     void Update()
 7 |     {
 8 |         //unity Matrix4x4 access method in C#
 9 |         /* (C0) (C1) (C2) (C3)
10 |          * m00, m01, m02, m03 (Row0)
11 |          * m10, m11, m12, m13 (Row1)
12 |          * m20, m21, m22, m23 (Row2)
13 |          * m30, m31, m32, m33 (Row3)
14 |         */
15 |         //example matrix (Scale & Translate)
16 |         /* (C0) (C1) (C2) (C3)
17 |          * Sx , m01, m02, Tx  (Row0)
18 |          * m10, Sy , m12, Ty  (Row1)
19 |          * m20, m21, Sz , Tz  (Row2)
20 |          * m30, m31, m32, m33 (Row3)
21 |         */
22 | 
23 |         Matrix4x4 m = transform.localToWorldMatrix;
24 | 
25 |         /////////////////////////////////////////////////////////////////////
26 |         //ways to extract transform.position from a M matrix (all four read column 3 of rows 0..2)
27 |         /////////////////////////////////////////////////////////////////////
28 |         Vector3 posWS;
29 |         posWS = new Vector3(m.m03, m.m13, m.m23);                           //(method 1) extract correct translation from matrix
30 |         posWS = m.GetColumn(3);                                             //(method 2) extract correct translation from matrix
31 |         posWS = new Vector3(m.GetRow(0).w, m.GetRow(1).w, m.GetRow(2).w);   //(method 3) extract correct translation from matrix
32 |         posWS = new Vector3(m[0, 3], m[1, 3], m[2, 3]);                     //(method 4) extract correct translation from matrix
33 |         Debug.Log($"transform.position = {posWS}");
34 | 
35 |         /////////////////////////////////////////////////////////////////////
36 |         //ways to extract transform.lossyScale from a M matrix
37 |         /////////////////////////////////////////////////////////////////////
38 |         Vector3 scaleWS;
39 |         //(wrong method) gives the correct scale ONLY if rotation is all 0
40 |         scaleWS = new Vector3(m.m00, m.m11, m.m22); //overwritten two lines below, kept only to show the pitfall
41 |         //(right method) gives the correct scale for any rotation, because each column of a rotation matrix has length 1 => sqrt(cos^2+sin^2+0) equals 1
42 |         scaleWS = new Vector3(m.GetColumn(0).magnitude, m.GetColumn(1).magnitude, m.GetColumn(2).magnitude); 
43 |         Debug.Log($"transform.lossyScale = {scaleWS}");
44 | 
45 |         /////////////////////////////////////////////////////////////////////
46 |         //ways to extract transform.rotation from a M matrix
47 |         /////////////////////////////////////////////////////////////////////
48 |         Matrix4x4 R;
49 | 
50 |         //(method 1) get rotation matrix
51 |         Quaternion r = m.rotation;
52 |         R = Matrix4x4.Rotate(r);  
53 | 
54 |         //(method 2) get rotation matrix
55 |         //first remove scale (multiply by the inverse scale on the right, since m = T * R * S)
56 |         Matrix4x4 INV_S = Matrix4x4.identity; //note: a zero scale axis makes the 1f / scale below infinite
57 |         INV_S.m00 = 1f / scaleWS.x;
58 |         INV_S.m11 = 1f / scaleWS.y;
59 |         INV_S.m22 = 1f / scaleWS.z;
60 |         R = m * INV_S;
61 |         //then remove position
62 |         R.m03 = 0;
63 |         R.m13 = 0;
64 |         R.m23 = 0;
65 |         //at this point only the rotation remains in R
66 | 
67 |         ////////////////////////////////////////////////
68 |         //build T from scratch
69 |         Matrix4x4 T = Matrix4x4.identity;
70 |         T.m03 = posWS.x;
71 |         T.m13 = posWS.y;
72 |         T.m23 = posWS.z;
73 |         //build S from scratch
74 |         Matrix4x4 S = Matrix4x4.identity;
75 |         S.m00 = scaleWS.x;
76 |         S.m11 = scaleWS.y;
77 |         S.m22 = scaleWS.z;
78 | 
79 |         Matrix4x4 MY_MATRIX_M = T * R * S; //scale first, then rotate, then translate
80 |         Renderer rend = GetComponent<Renderer>(); //look the component up once per frame instead of twice
81 |         if (rend) rend.material.SetMatrix("MY_MATRIX_M", MY_MATRIX_M); //it is the same as UNITY_MATRIX_M in shader
82 |     }
83 | }
84 | 


--------------------------------------------------------------------------------
/PowerVR cycle count document.shader:
--------------------------------------------------------------------------------
  1 | // [PowerVR cycle count document]
  2 | 
  3 | // Filename : PowerVR Low level GLSL Optimization
  4 | // Version : PowerVR SDK REL_17.1@4658063a External Issue
  5 | // Issue Date : 07 Apr 2017
  6 | // Author : Marton Tamas
  7 | // Full version PDF: http://cdn.imgtec.com/sdk-documentation/PowerVR+Low+level+GLSL+Optimization.pdf?fbclid=IwAR08O5o4pAJcgCGbB4nMf13vG-OeAm7xGkwzmNDSqIUBg_21w6JcYt0q3MY
  8 | // This document describes ways to optimize GLSL code for PowerVR Series 6 architecture.
  9 | 
 10 | // Based on:
 11 | // http://www.humus.name/Articles/Persson_LowLevelThinking.pdf
 12 | // http://www.humus.name/Articles/Persson_LowlevelShaderOptimization.pdf
 13 | 
 14 | //////////////////////////////////////////////////
 15 | // 2. Low level optimizations
 16 | //////////////////////////////////////////////////
 17 | 
 18 | // 2.1. PowerVR Series 6 USC diagram
 19 | // Generally shader performance on PowerVR Series 6 architecture GPUs depends on the number of
 20 | // cycles it takes to execute a shader.
 21 | 
 22 | // 2.2 Writing expressions in MAD form
 23 | fragColor.x = (t.x + t.y) * (t.x - t.y); //2 cycles
 24 | fragColor.x = t.x * t.x + (-t.y * t.y); //1 cycle, same value written as a single multiply-add
 25 | 
 26 | // 2.3 Division
 27 | fragColor.x = (t.x * t.y + t.z) / t.x; //3 cycles
 28 | fragColor.x = t.y + t.z * (1.0 / t.x); //2 cycles, same value with the division distributed over the sum
 29 | 
 30 | // 2.4 Sign
 31 | fragColor.x = sign(t.x) * t.y; //3 cycles
 32 | fragColor.x = (t.x >= 0.0 ? 1.0 : -1.0) * t.y; //2 cycles; if the (t.x == 0) case is not needed, prefer the conditional form over sign().
 33 | 
 34 | // 2.5 Rcp/rsqrt/sqrt
 35 | fragColor.x = 1.0 / t.x; //1 cycle
 36 | fragColor.x = inversesqrt(t.x); //1 cycle
 37 | fragColor.x = sqrt(t.x); //2 cycles; sqrt() is implemented as 1 / (1/sqrt(x)), which results in a 2 cycle cost.
 38 | 
 39 | fragColor.x = t.x * inversesqrt(t.x); //2 cycles
 40 | 
 41 | fragColor.x = sqrt(t.x) > 0.5 ? 0.5 : 1.0; //3 cycles
 42 | fragColor.x = (t.x * inversesqrt(t.x)) > 0.5 ? 0.5 : 1.0; //2 cycles; in this case the test instruction fits into the second instruction.
 43 | 
 44 | // 2.6 Abs/Neg/Saturate
 45 | fragColor.x = abs(t.x * t.y); //2 cycles
 46 | fragColor.x = abs(t.x) * abs(t.y); //1 cycle, abs applied to the inputs instead of the result
 47 | 
 48 | fragColor.x = -dot(t.xyz, t.yzx); //3 cycles
 49 | fragColor.x = dot(-t.xyz, t.yzx); //2 cycles, negation applied to an input instead of the result
 50 | 
 51 | fragColor.x = 1.0 - clamp(t.x, 0.0, 1.0); //2 cycles
 52 | fragColor.x = clamp(1.0 - t.x, 0.0, 1.0); //1 cycle, clamp applied to the result instead of the input
 53 | 
 54 | fragColor.x = min(dot(t, t), 1.0) > 0.5 ? t.x : t.y; //5 cycles
 55 | fragColor.x = clamp(dot(t, t), 0.0, 1.0) > 0.5 ? t.x : t.y; //4 cycles
 56 | 
 57 | // normalize() is decomposed into:
 58 | vec3 normalize( vec3 v ) // sketch of how the built-in is decomposed
 59 | {
 60 | 	return v * inversesqrt( dot( v, v ) ); // dot(v, v) = squared length, so this scales v by 1/length
 61 | }
 62 | 
 63 | fragColor.xyz = normalize(-t.xyz); //7 cycles
 64 | fragColor.xyz = -normalize(t.xyz); //6 cycles, negating the result instead of the input saves a cycle here
 65 | 
 66 | //////////////////////////////////////////////////
 67 | // 3. Transcendental functions
 68 | //////////////////////////////////////////////////
 69 | 
 70 | // 3.1. Exp/Log
 71 | fragColor.x = exp2(t.x); //1 cycle (the Exp/Pow expansions listed after this are written in terms of exp2)
 72 | fragColor.x = log2(t.x); //1 cycle (the Log/Pow expansions listed after this are written in terms of log2)
 73 | 
 74 | // Exp is implemented as:
 75 | float exp( float x ) // natural exponential expressed through exp2()
 76 | {
 77 | 	return exp2(x * 1.442695); //2 cycles, 1.442695 = log2(e)
 78 | }
 79 | // Log is implemented as:
 80 | float log( float x ) // natural logarithm expressed through log2()
 81 | {
 82 | 	return log2(x) * 0.693147; //2 cycles, 0.693147 = ln(2); the constant scales the result, not the argument
 83 | }
 84 | // Pow(x, y) is implemented as:
 85 | float pow( float x, float y ) // x^y rewritten as 2^(log2(x) * y)
 86 | {
 87 | 	return exp2(log2(x) * y); //3 cycles
 88 | }
 89 | 
 90 | // 3.2. Sin/Cos/Sinh/Cosh
 91 | fragColor.x = sin(t.x); //4 cycles
 92 | fragColor.x = cos(t.x); //4 cycles
 93 | 
 94 | fragColor.x = cosh(t.x); //3 cycles
 95 | fragColor.x = sinh(t.x); //3 cycles
 96 | 
 97 | // 3.3. Asin/Acos/Atan/Degrees/Radians
 98 | fragColor.x = asin(t.x); //67 cycles (VERY high cost!)
 99 | fragColor.x = acos(t.x); //79 cycles (VERY high cost!)
100 | fragColor.x = atan(t.x); //12 cycles (lots of conditionals); atan is still costly, but it can be used if needed.
101 | 
102 | fragColor.x = degrees(t.x); //1 cycle
103 | fragColor.x = radians(t.x); //1 cycle
104 | 
105 | //////////////////////////////////////////////////
106 | // 4. Intrinsic functions
107 | //////////////////////////////////////////////////
108 | 
109 | // 4.1. Vector*Matrix
110 | fragColor = t * m1; //4x4 matrix, 8 cycles
111 | fragColor.xyz = t.xyz * m2; //3x3 matrix, 4 cycles
112 | 
113 | // 4.2. Mixed Scalar/Vector math
114 | fragColor.x = length(t-v); // counted together with the next line
115 | fragColor.y = distance(v, t); // total of 7 cycles
116 | fragColor.x = length(t-v); // counted together with the next line
117 | fragColor.y = distance(t, v); // total of 9 cycles - NOTE(review): distance(t, v) shares (t - v) with length(t-v), so the 7/9 totals may be swapped; confirm against the PDF
118 | 
119 | fragColor.xyz = normalize(t.xyz); //6 cycles
120 | fragColor.xyz = inversesqrt(dot(t.xyz, t.xyz)) * t.xyz; //5 cycles, normalize() written out by hand
121 | 
122 | fragColor.xyz = 50.0 * normalize(t.xyz); //7 cycles
123 | fragColor.xyz = (50.0 * inversesqrt(dot(t.xyz, t.xyz))) * t.xyz; //6 cycles, the scalar factors are multiplied together before touching the vector
124 | 
125 | // Cross() can be expanded to:
126 | vec3 cross( vec3 a, vec3 b ) // component-wise expansion of the cross product a x b
127 | {
128 | 	return vec3( a.y * b.z - b.y * a.z,
129 | 	a.z * b.x - b.z * a.x,
130 | 	a.x * b.y - b.x * a.y ); // z component pairs a.x*b.y with b.x*a.y
131 | }
132 | // Distance can be expanded to:
133 | float distance( vec3 a, vec3 b ) // length of the difference vector a - b
134 | {
135 | 	vec3 tmp = a - b;
136 | 	return sqrt(dot(tmp, tmp));
137 | }
138 | // Dot can be expanded to:
139 | float dot( vec3 a, vec3 b ) // sum of the per-component products of a and b
140 | {
141 | 	return a.x * b.x + a.y * b.y + a.z * b.z;
142 | }
143 | // Faceforward can be expanded to:
144 | vec3 faceforward( vec3 n, vec3 I, vec3 Nref ) // returns n when dot(Nref, I) is negative, otherwise -n
145 | {
146 | 	if( dot(Nref, I) < 0.0 ) // float literal: dot() returns a float
147 | 	{
148 | 		return n;
149 | 	}
150 | 	else
151 | 	{
152 | 		return -n;
153 | 	}
154 | }
155 | // Length can be expanded to:
156 | float length( vec3 v ) // square root of v dotted with itself
157 | {
158 | 	return sqrt(dot(v, v));
159 | }
160 | // Normalize can be expanded to:
161 | vec3 normalize( vec3 v ) // v divided by its own length (division form; section 2.6 lists the v * inversesqrt(dot(v, v)) form)
162 | {
163 | 	return v / sqrt(dot(v, v));
164 | }
165 | // Reflect can be expanded to:
166 | vec3 reflect( vec3 N, vec3 I ) // NOTE(review): the GLSL built-in is presumably declared as reflect(I, N); parameter order here looks swapped - confirm
167 | {
168 | 	return I - 2.0 * dot(N, I) * N; // subtracts twice the component of I along N
169 | }
170 | // Refract can be expanded to:
171 | vec3 refract( vec3 n, vec3 I, float eta ) // n: surface normal, I: incident vector, eta: ratio of indices of refraction
172 | {
173 | 	float k = 1.0 - eta * eta * (1.0 - dot(n, I) * dot(n, I));
174 | 	if (k < 0.0)
175 | 		return vec3(0.0); // negative k: no refracted ray, return the zero vector (vec3, not a scalar)
176 | 	else
177 | 		return eta * I - (eta * dot(n, I) + sqrt(k)) * n;
178 | }
179 | 
180 | // 4.3. Operation grouping
181 | fragColor.xyz = t.xyz * t.x * t.y * t.wzx * t.z * t.w; //7 cycles
182 | fragColor.xyz = (t.x * t.y * t.z * t.w) * (t.xyz * t.wzx); //4 cycles, scalar factors grouped together and vector factors grouped together
183 | 
184 | //////////////////////////////////////////////////
185 | // 5. FP16 overview
186 | //////////////////////////////////////////////////
187 | 
188 | // 5.3. Exploiting the SOP/MAD FP16 pipeline
189 | // After applying all this knowledge, we can show off the power of this pipeline by using everything in one cycle:
190 | // All in 1 cycle
191 | mediump vec4 fp16 = t; // mediump copy of the input, used by both expressions
192 | highp vec4 res; // only res.x and res.y are assigned before res is written out
193 | res.x = clamp(min(-fp16.y * abs(fp16.z), clamp(fp16.w, 0.0, 1.0) * abs(fp16.x)), 0.0, 1.0);
194 | res.y = clamp(abs(fp16.w) * -fp16.z + clamp(fp16.x, 0.0, 1.0), 0.0, 1.0);
195 | fragColor = res;
196 | {sop, sop} // NOTE(review): presumably the resulting instruction pair quoted from the PDF, not GLSL - confirm
197 | 
198 | 
199 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Shader assembly / low level optimization
 2 | - http://www.humus.name/Articles/Persson_LowLevelThinking.pdf
 3 | - http://www.humus.name/Articles/Persson_LowlevelShaderOptimization.pdf
 4 | 
 5 | # PowerVR shader cycle count
 6 | - http://cdn.imgtec.com/sdk-documentation/PowerVR+Low+level+GLSL+Optimization.pdf?fbclid=IwAR08O5o4pAJcgCGbB4nMf13vG-OeAm7xGkwzmNDSqIUBg_21w6JcYt0q3MY
 7 | - https://github.com/ColinLeung-NiloCat/ShaderNotes/blob/8d095546bb61cb26edbe19122cb1028fa49f2e3a/PowerVR%20cycle%20count%20document.shader
 8 | 
 9 | # Real-time demo of every transform from object space to window space
10 | - http://www.realtimerendering.com/udacity/transforms.html
11 | 
12 | # Unity C#/shader Matrix
13 | - https://answers.unity.com/questions/1359718/what-do-the-values-in-the-matrix4x4-for-cameraproj.html?childToView=1359877#answer-1359877
14 | - https://forum.unity.com/threads/can-i-get-the-scale-in-the-transform-of-the-object-i-attach-a-shader-to-if-so-how.418345/
15 | - https://answers.unity.com/questions/1435216/are-these-rotation-matrices-right.html
16 | 


--------------------------------------------------------------------------------