├── ComputeUtils.hlsl
├── ComputeUtils.py
├── ComputeUtilsTests.hlsl
└── LICENSE

/ComputeUtils.hlsl:
--------------------------------------------------------------------------------
#ifndef __COMPUTE_UTILS_HEADER__
#define __COMPUTE_UTILS_HEADER__

#ifndef GROUP_SIZE
#error "ComputeUtils.hlsl requires definition of GROUP_SIZE"
#endif
#ifndef GROUP_SIZE_LOG_2
#if GROUP_SIZE == 32
#define GROUP_SIZE_LOG_2 5
#elif GROUP_SIZE == 64
#define GROUP_SIZE_LOG_2 6
#elif GROUP_SIZE == 128
#define GROUP_SIZE_LOG_2 7
#elif GROUP_SIZE == 256
#define GROUP_SIZE_LOG_2 8
#else
#error "ComputeUtils.hlsl requires definition of GROUP_SIZE_LOG_2, which must be the log2 of GROUP_SIZE"
#endif
#endif

#define DWORD_BIT_SIZE_LOG2 5
#define DWORD_BIT_SIZE (1 << DWORD_BIT_SIZE_LOG2)
#define BIT_MASK_SIZE ((GROUP_SIZE + DWORD_BIT_SIZE - 1) / DWORD_BIT_SIZE)

namespace ComputeUtils
{

groupshared uint gs_BitMask[BIT_MASK_SIZE];

// Exclusive prefix sum of one bit per thread. outOffset receives the number of
// lower-indexed threads whose bit is set; outCount receives the group total.
void PrefixBitSum(
    uint groupThreadIndex,
    bool bitValue,
    out uint outOffset,
    out uint outCount)
{
    if (groupThreadIndex < BIT_MASK_SIZE)
        gs_BitMask[groupThreadIndex] = 0;

    GroupMemoryBarrierWithGroupSync();

    uint maskOffset = groupThreadIndex >> DWORD_BIT_SIZE_LOG2;
    uint maskBit = (groupThreadIndex & (DWORD_BIT_SIZE - 1));
    uint mask = 1u << maskBit;

    [branch]
    if (bitValue)
    {
        uint unused;
        InterlockedOr(gs_BitMask[maskOffset], mask, unused);
    }

    GroupMemoryBarrierWithGroupSync();

    outOffset = 0;
    if (bitValue)
    {
        for (uint i = 0; i < maskOffset; ++i)
            outOffset += countbits(gs_BitMask[i]);
        uint v = gs_BitMask[maskOffset];
        outOffset += countbits((mask - 1u) & v);
    }

    outCount = 0;
    {
        [unroll]
        for (uint i = 0; i < BIT_MASK_SIZE; ++i)
            outCount += countbits(gs_BitMask[i]);
    }
}

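// Usage sketch (illustration only, not part of the original header): PrefixBitSum lets a
// group compact threads that pass a predicate into a dense index range without any global
// atomics. The name gs_Compacted and the predicate are assumptions made for the example.
//
//     groupshared uint gs_Compacted[GROUP_SIZE];
//
//     [numthreads(GROUP_SIZE, 1, 1)]
//     void ExampleCompactMain(uint groupThreadIndex : SV_GroupIndex)
//     {
//         bool keep = (groupThreadIndex & 1u) == 0u; // example predicate: keep even threads
//         uint offset, count;
//         ComputeUtils::PrefixBitSum(groupThreadIndex, keep, offset, count);
//         if (keep)
//             gs_Compacted[offset] = groupThreadIndex; // slots 0..count-1 are densely packed
//     }
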
groupshared uint gs_PrefixCache[GROUP_SIZE];

// Exclusive prefix sum of one uint per thread. outOffset receives the sum of the values
// of lower-indexed threads; outCount receives the group total.
void PrefixExclusive(
    uint groupThreadIndex,
    uint value,
    out uint outOffset,
    out uint outCount)
{
    gs_PrefixCache[groupThreadIndex] = value;

    GroupMemoryBarrierWithGroupSync();

    for (uint i = 1; i < GROUP_SIZE; i <<= 1)
    {
        uint sampleVal = groupThreadIndex >= i ? gs_PrefixCache[groupThreadIndex - i] : 0u;

        GroupMemoryBarrierWithGroupSync();

        gs_PrefixCache[groupThreadIndex] += sampleVal;

        GroupMemoryBarrierWithGroupSync();
    }

    outOffset = gs_PrefixCache[groupThreadIndex] - value;
    outCount = gs_PrefixCache[GROUP_SIZE - 1];
}

// Inclusive variant of PrefixExclusive: outOffset includes this thread's own value.
void PrefixInclusive(
    uint groupThreadIndex,
    uint value,
    out uint outOffset,
    out uint outCount)
{
    PrefixExclusive(groupThreadIndex, value, outOffset, outCount);
    outOffset += value;
}

// Reserves one output slot for every thread whose bitValue is set, using a single
// InterlockedAdd on counterBuffer per group. The return value is this thread's global
// slot when bitValue is set.
uint CalculateGlobalStorageOffset(
    RWByteAddressBuffer counterBuffer,
    uint groupThreadIndex,
    bool bitValue,
    out uint totalCount)
{
    uint localOffset;
    PrefixBitSum(groupThreadIndex, bitValue, localOffset, totalCount);

    // Make sure every thread has finished reading gs_BitMask inside PrefixBitSum
    // before thread 0 reuses gs_BitMask[0] to broadcast the global base offset.
    GroupMemoryBarrierWithGroupSync();

    if (groupThreadIndex == 0 && totalCount > 0)
    {
        uint globalOffset = 0;
        counterBuffer.InterlockedAdd(0, totalCount, globalOffset);
        gs_BitMask[0] = globalOffset;
    }

    GroupMemoryBarrierWithGroupSync();

    return gs_BitMask[0] + localOffset;
}

uint CalculateGlobalStorageOffset(
    RWByteAddressBuffer counterBuffer,
    uint groupThreadIndex,
    bool bitValue)
{
    uint unused0;
    return CalculateGlobalStorageOffset(counterBuffer, groupThreadIndex, bitValue, unused0);
}

// Reserves valueCount output slots for this thread, using a single InterlockedAdd on
// counterBuffer per group. The return value is the first global slot of this thread's run.
uint CalculateGlobalValueStorageOffset(
    RWByteAddressBuffer counterBuffer,
    uint groupThreadIndex,
    uint valueCount,
    out uint totalCount)
{
    uint localOffset;
    PrefixExclusive(groupThreadIndex, valueCount, localOffset, totalCount);

    if (groupThreadIndex == 0 && totalCount > 0)
    {
        uint globalOffset = 0;
        counterBuffer.InterlockedAdd(0, totalCount, globalOffset);
        gs_PrefixCache[0] = globalOffset;
    }

    GroupMemoryBarrierWithGroupSync();

    return gs_PrefixCache[0] + localOffset;
}

uint CalculateGlobalValueStorageOffset(
    RWByteAddressBuffer counterBuffer,
    uint groupThreadIndex,
    uint valueCount)
{
    uint unused0;
    return CalculateGlobalValueStorageOffset(counterBuffer, groupThreadIndex, valueCount, unused0);
}

#define BROADCAST_COMPACT_IDX_PER_DWORD (32/GROUP_SIZE_LOG_2)
#define BROADCAST_COMPACT_DWORDS ((GROUP_SIZE + BROADCAST_COMPACT_IDX_PER_DWORD - 1)/BROADCAST_COMPACT_IDX_PER_DWORD)
groupshared uint gs_BroadcastLocalCount[GROUP_SIZE];
groupshared uint gs_BroadcastPackedIdxMap[BROADCAST_COMPACT_DWORDS];

// Redistributes per-thread work counts across the group. Work items are flattened so each
// thread receives at most one item per call: outParentGroupID identifies the thread that
// produced the item, outIndex is the item's index within that parent for this call, and
// outParentCount is how many of the parent's items were scheduled this call. Returns true
// when the thread was assigned a valid item. Callers typically loop, subtracting
// outParentCount from their remaining work (see ComputeUtilsTests.hlsl).
bool BroadcastWork(
    uint groupThreadIndex,
    uint workCount,
    out uint outIndex,
    out uint outParentCount,
    out uint outParentGroupID)
{
    uint unused0;
    if (groupThreadIndex < BROADCAST_COMPACT_DWORDS)
        gs_BroadcastPackedIdxMap[groupThreadIndex] = 0;

    bool validWorkCount = workCount != 0;
    uint compactIdx;
    PrefixBitSum(groupThreadIndex, validWorkCount, compactIdx, unused0);
    if (validWorkCount)
        InterlockedOr(gs_BroadcastPackedIdxMap[compactIdx/BROADCAST_COMPACT_IDX_PER_DWORD], groupThreadIndex << ((compactIdx%BROADCAST_COMPACT_IDX_PER_DWORD)*GROUP_SIZE_LOG_2), unused0);

    uint groupOffset, groupCount;
    PrefixInclusive(groupThreadIndex, workCount, groupOffset, groupCount);

    if (groupThreadIndex < BIT_MASK_SIZE)
        gs_BitMask[groupThreadIndex] = 0;

    int leftInLDS = max((int)GROUP_SIZE - (int)(groupOffset - workCount), 0);
    int actualWorkCount = min(leftInLDS, workCount);
    gs_BroadcastLocalCount[groupThreadIndex] = actualWorkCount;

    GroupMemoryBarrierWithGroupSync();

    [branch]
    if (actualWorkCount > 0 && groupOffset < GROUP_SIZE)
        InterlockedOr(gs_BitMask[groupOffset >> DWORD_BIT_SIZE_LOG2], 1u << (groupOffset & (DWORD_BIT_SIZE - 1u)), unused0);

    GroupMemoryBarrierWithGroupSync();

    uint compactSampleIdx = 0;
    {
        uint groupDWOffset = groupThreadIndex >> DWORD_BIT_SIZE_LOG2;
        uint groupBitMask = 1u << (groupThreadIndex & (DWORD_BIT_SIZE - 1u));
        for (uint i = 0; i < groupDWOffset; ++i)
            compactSampleIdx += countbits(gs_BitMask[i]);
        uint v = gs_BitMask[groupDWOffset];
        compactSampleIdx += countbits(((groupBitMask - 1u) | groupBitMask) & v);
    }

    outParentGroupID = (gs_BroadcastPackedIdxMap[compactSampleIdx/BROADCAST_COMPACT_IDX_PER_DWORD] >> ((compactSampleIdx%BROADCAST_COMPACT_IDX_PER_DWORD)*GROUP_SIZE_LOG_2)) & ((1u << GROUP_SIZE_LOG_2) - 1u);

    uint parentPrefixExclusive = (outParentGroupID == 0 ? 0 : gs_PrefixCache[outParentGroupID - 1u]);
    outParentCount = gs_BroadcastLocalCount[outParentGroupID];
    outIndex = groupThreadIndex - parentPrefixExclusive;
    return outIndex < outParentCount;
}

}

#endif
--------------------------------------------------------------------------------
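Usage sketch: a minimal stream-compaction kernel built on ComputeUtils::CalculateGlobalStorageOffset might look as follows. The buffer names, register slots and the aliveness predicate are assumptions made for illustration; only the ComputeUtils calls come from the header above. Note that no thread may return before the call, since the helper synchronizes the whole group.

#define GROUP_SIZE 64
#include "ComputeUtils.hlsl"

ByteAddressBuffer g_Particles : register(t0);        // hypothetical input: one uint flag per element
RWByteAddressBuffer g_AliveCounter : register(u0);   // single uint, cleared to zero before dispatch
RWByteAddressBuffer g_AliveList : register(u1);      // compacted output

[numthreads(GROUP_SIZE, 1, 1)]
void CompactAliveMain(
    uint3 dispatchThreadID : SV_DispatchThreadID,
    uint groupThreadIndex : SV_GroupIndex)
{
    // Assumes the dispatch exactly covers g_Particles; an early out before the
    // ComputeUtils call would break its group-wide barriers.
    uint particle = g_Particles.Load(dispatchThreadID.x << 2);
    bool alive = particle != 0u;

    // One InterlockedAdd per group reserves a contiguous range; each surviving
    // thread receives a unique slot inside it.
    uint slot = ComputeUtils::CalculateGlobalStorageOffset(g_AliveCounter, groupThreadIndex, alive);
    if (alive)
        g_AliveList.Store(slot << 2, particle);
}
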
/ComputeUtils.py:
--------------------------------------------------------------------------------
import coalpy.gpu as g
import numpy as nm

## Compute Utils Test Script ##

#variables
g_layout_workloads = [540, 340, 299, 689, 229, 770, 8]
g_round_count = 8450
#g_round_count = 500
g_instance_count = len(g_layout_workloads) * g_round_count
g_total_output_count = sum(g_layout_workloads) * g_round_count
print ("instances: %d output: %d" % (g_instance_count, g_total_output_count))
#######

def load_gpu_buffer(gpu_buffer, numpy_type='int'):
    request = g.ResourceDownloadRequest(resource = gpu_buffer)
    request.resolve()
    return nm.frombuffer(request.data_as_bytearray(), dtype=numpy_type)

def test_broadcast_work(workloads, rounds, output_list, output_list_count):
    expected_mask_workloads = [((1 << v) - 1) for v in workloads]
    instance_counts = len(expected_mask_workloads) * rounds
    expected_masks = [expected_mask_workloads[i%len(expected_mask_workloads)] for i in range(0, instance_counts)]
    output_masks = [0 for i in range(0, instance_counts)]

    for i in range(0, output_list_count):
        output_masks[output_list[(i*2)]] |= 1 << int(output_list[(i*2) + 1])

    for i in range(0, len(output_masks)):
        if (expected_masks[i] != output_masks[i]):
            return False

    return True

g_init_shader = g.Shader(file = "ComputeUtilsTests.hlsl", main_function = "InitInputBufferMain")
g_init_shader.resolve()

g_distribute_shader = g.Shader(file = "ComputeUtilsTests.hlsl", main_function = "DistributeMain")
g_distribute_shader.resolve()

g_distribute_naive_shader = g.Shader(file = "ComputeUtilsTests.hlsl", main_function = "DistributeNaiveMain")
g_distribute_naive_shader.resolve()

work_layout_buffer = g.Buffer(type=g.BufferType.Raw, element_count=len(g_layout_workloads))
work_buffer = g.Buffer(type=g.BufferType.Raw, element_count = g_instance_count * 2)

counter_buffer = g.Buffer(type=g.BufferType.Raw, element_count = 1)
output_counter_buffer = g.Buffer(type=g.BufferType.Raw, element_count = 1)
output_buffer = g.Buffer(type=g.BufferType.Raw, element_count = g_total_output_count * 2)

output_naive_counter_buffer = g.Buffer(type=g.BufferType.Raw, element_count = 1)
output_naive_buffer = g.Buffer(type=g.BufferType.Raw, element_count = g_total_output_count * 2)

debug_counter_buffer = g.Buffer(type=g.BufferType.Raw, element_count = 1)
debug_buffer = g.Buffer(type=g.BufferType.Raw, element_count = 400 * 4)

cmd = g.CommandList()

g.begin_collect_markers()

cmd.upload_resource(
    source = g_layout_workloads,
    destination = work_layout_buffer)

cmd.upload_resource(
    source = [0],
    destination = counter_buffer)

cmd.upload_resource(
    source = [0],
    destination = output_counter_buffer)

cmd.upload_resource(
    source = [0],
    destination = output_naive_counter_buffer)

cmd.dispatch(
    constants = [ int(g_instance_count), int(len(g_layout_workloads)), int(0), int(0)],
    inputs = work_layout_buffer,
    outputs = work_buffer,
    shader = g_init_shader,
    x = int((g_instance_count + 63)/64),
    y = 1,
    z = 1)

cmd.begin_marker("GroupDistribute")
cmd.dispatch(
    constants = [ int(g_instance_count), int(0), int(0), int(0)],
    inputs = work_buffer,
    outputs = [counter_buffer, output_buffer, output_counter_buffer, debug_buffer, debug_counter_buffer],
    shader = g_distribute_shader,
    x = 3500,
    y = 1,
    z = 1)
cmd.end_marker()

cmd.begin_marker("GroupDistributeNaive")
cmd.dispatch(
    constants = [ int(g_instance_count), int(0), int(0), int(0)],
    inputs = work_buffer,
    outputs = [counter_buffer, output_naive_buffer, output_naive_counter_buffer, debug_buffer, debug_counter_buffer],
    shader = g_distribute_naive_shader,
    x = int((g_instance_count + 63)/64),
    y = 1,
    z = 1)
cmd.end_marker()

g.schedule(cmd)

marker_results = g.end_collect_markers()
gpu_timestamps = load_gpu_buffer(marker_results.timestamp_buffer, nm.uint64)
perf_data = [(name, (gpu_timestamps[ets] - gpu_timestamps[bts])/marker_results.timestamp_frequency) for (name, p, bts, ets) in marker_results.markers]
print (perf_data)

# Output
"""
output_counter_buffer_readback = load_gpu_buffer(output_counter_buffer)[0]
output_buffer_readback = load_gpu_buffer(output_buffer)
print("Distributed Received: %d" % output_counter_buffer_readback)
print(test_broadcast_work(g_layout_workloads, g_round_count, output_buffer_readback, output_counter_buffer_readback))

output_counter_buffer_readback = load_gpu_buffer(output_naive_counter_buffer)[0]
output_buffer_readback = load_gpu_buffer(output_naive_buffer)
print("Distributed Naive Received: %d" % output_counter_buffer_readback)
print(test_broadcast_work(g_layout_workloads, g_round_count, output_buffer_readback, output_counter_buffer_readback))
"""
--------------------------------------------------------------------------------
/ComputeUtilsTests.hlsl:
--------------------------------------------------------------------------------
#define GROUP_SIZE 64
#include "ComputeUtils.hlsl"

ByteAddressBuffer g_workLayout : register(t0);
RWByteAddressBuffer g_outputWorkBuffer : register(u0);

cbuffer Constants : register(b0)
{
    uint g_InstanceCount;
    uint g_LayoutWorkloadsCount;
    uint2 g_padding;
}

[numthreads(GROUP_SIZE,1,1)]
void InitInputBufferMain(uint3 dispatchThreadID : SV_DispatchThreadID)
{
    if (dispatchThreadID.x >= g_InstanceCount)
        return;

    uint workLayoutCount = g_workLayout.Load(0);
    uint workID = dispatchThreadID.x;
    uint workCount = g_workLayout.Load((workID % g_LayoutWorkloadsCount) << 2);
    g_outputWorkBuffer.Store2(dispatchThreadID.x << 3, uint2(workID, workCount));
}

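// DistributeMain below uses a persistent-groups scheme: a fixed number of groups keeps
// pulling batches of GROUP_SIZE (instanceID, workCount) pairs by bumping g_CounterBuffer,
// caches them in LDS, and then repeatedly calls ComputeUtils::BroadcastWork so that each
// thread handles one flattened work item per pass. Output space for each pass is reserved
// with a single atomic through ComputeUtils::CalculateGlobalStorageOffset; when a pass
// emits nothing the cached batch is exhausted and the next batch is fetched.
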
ByteAddressBuffer g_InputBuffer : register(t0);
RWByteAddressBuffer g_CounterBuffer : register(u0);
RWByteAddressBuffer g_OutputBuffer : register(u1);
RWByteAddressBuffer g_OutputCounter : register(u2);
RWByteAddressBuffer g_DebugBuffer : register(u3);
RWByteAddressBuffer g_DebugBufferCounter : register(u4);

groupshared bool gs_GroupActive;
groupshared bool gs_LoadInstanceBatch;
groupshared uint gs_GroupInputOffset;
groupshared uint gs_InstanceIDCache[GROUP_SIZE];
groupshared uint gs_InstanceIDWorkLeft[GROUP_SIZE];
groupshared uint gs_InstanceIDWorkOffset[GROUP_SIZE];

[numthreads(GROUP_SIZE, 1, 1)]
void DistributeMain(
    uint3 dispatchThreadID : SV_DispatchThreadID,
    uint groupThreadIndex : SV_GroupIndex)
{
    if (groupThreadIndex == 0)
    {
        gs_GroupActive = true;
        gs_LoadInstanceBatch = true;
    }

    GroupMemoryBarrierWithGroupSync();

    while (gs_GroupActive)
    {
        if (gs_LoadInstanceBatch)
        {
            if (groupThreadIndex == 0)
            {
                uint inputOffset;
                g_CounterBuffer.InterlockedAdd(0, GROUP_SIZE, inputOffset);
                gs_GroupInputOffset = inputOffset;
                gs_GroupActive = inputOffset < g_InstanceCount;
            }

            GroupMemoryBarrierWithGroupSync();

            uint sampleIndex = gs_GroupInputOffset + groupThreadIndex;
            uint2 data = sampleIndex < g_InstanceCount ? g_InputBuffer.Load2(sampleIndex << 3) : uint2(0,0);
            gs_InstanceIDCache[groupThreadIndex] = data.x;
            gs_InstanceIDWorkLeft[groupThreadIndex] = data.y;
            gs_InstanceIDWorkOffset[groupThreadIndex] = 0;
            gs_LoadInstanceBatch = false;

            GroupMemoryBarrierWithGroupSync();
        }

        if (!gs_GroupActive)
            return;

        uint workIndex;
        uint workParentCount;
        uint workParentID;
        bool validWork = ComputeUtils::BroadcastWork(
            groupThreadIndex,
            gs_InstanceIDWorkLeft[groupThreadIndex],
            workIndex, workParentCount, workParentID);

        uint workOffset = workIndex + gs_InstanceIDWorkOffset[workParentID];

        uint outputCount;
        uint outputIndex = ComputeUtils::CalculateGlobalStorageOffset(g_OutputCounter, groupThreadIndex, validWork, outputCount);
        if (validWork)
            g_OutputBuffer.Store2(outputIndex << 3, uint2(gs_InstanceIDCache[workParentID], workOffset));

        if (workIndex == 0)
        {
            gs_InstanceIDWorkLeft[workParentID] -= workParentCount;
            gs_InstanceIDWorkOffset[workParentID] += workParentCount;
        }

        if (outputCount == 0)
            gs_LoadInstanceBatch = true;

        GroupMemoryBarrierWithGroupSync();
    }
}

[numthreads(GROUP_SIZE, 1, 1)]
void DistributeNaiveMain(
    uint3 dispatchThreadID : SV_DispatchThreadID)
{
    if (dispatchThreadID.x >= g_InstanceCount)
        return;

    uint2 instanceJob = g_InputBuffer.Load2(dispatchThreadID.x << 3);
    uint outputOffset;
    g_OutputCounter.InterlockedAdd(0, instanceJob.y, outputOffset);
    for (uint i = 0; i < instanceJob.y; ++i)
        g_OutputBuffer.Store2((outputOffset + i) << 3, uint2(instanceJob.x, i)); // Store2: each record is an 8-byte (instanceID, workIndex) pair
}

--------------------------------------------------------------------------------
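The tests above exercise BroadcastWork and CalculateGlobalStorageOffset; CalculateGlobalValueStorageOffset is not covered. A sketch of how it might be used to append a variable number of values per thread follows; the buffer names, register slots and per-thread counts are assumptions made for illustration.

#define GROUP_SIZE 64
#include "ComputeUtils.hlsl"

ByteAddressBuffer g_ValueCounts : register(t0);       // hypothetical input: one uint count per thread
RWByteAddressBuffer g_AppendCounter : register(u0);   // single uint, cleared to zero before dispatch
RWByteAddressBuffer g_AppendBuffer : register(u1);    // variable-length output

[numthreads(GROUP_SIZE, 1, 1)]
void AppendRunsMain(
    uint3 dispatchThreadID : SV_DispatchThreadID,
    uint groupThreadIndex : SV_GroupIndex)
{
    // Assumes the dispatch exactly covers g_ValueCounts, so no range check is needed.
    uint count = g_ValueCounts.Load(dispatchThreadID.x << 2);

    // The group performs a single InterlockedAdd for all of its values; each thread
    // receives the base index of its own contiguous run inside the reserved range.
    uint base = ComputeUtils::CalculateGlobalValueStorageOffset(g_AppendCounter, groupThreadIndex, count);

    for (uint i = 0; i < count; ++i)
        g_AppendBuffer.Store((base + i) << 2, dispatchThreadID.x); // record the producing thread's id
}
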
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Kleber Garcia

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------