├── README.md
├── async_reduce.cu
├── bulk
│   ├── algorithm.hpp
│   ├── algorithm
│   │   ├── accumulate.hpp
│   │   ├── adjacent_difference.hpp
│   │   ├── copy.hpp
│   │   ├── detail
│   │   │   └── stable_merge_sort.hpp
│   │   ├── for_each.hpp
│   │   ├── gather.hpp
│   │   ├── merge.hpp
│   │   ├── reduce.hpp
│   │   ├── reduce_by_key.hpp
│   │   ├── scan.hpp
│   │   ├── scatter.hpp
│   │   └── sort.hpp
│   ├── async.hpp
│   ├── bulk.hpp
│   ├── choose_sizes.hpp
│   ├── detail
│   │   ├── alignment.hpp
│   │   ├── apply_from_tuple.hpp
│   │   ├── async.inl
│   │   ├── choose_sizes.inl
│   │   ├── closure.hpp
│   │   ├── config.hpp
│   │   ├── cuda_launcher
│   │   │   ├── cuda_launch_config.hpp
│   │   │   ├── cuda_launcher.hpp
│   │   │   ├── parameter_ptr.hpp
│   │   │   ├── runtime_introspection.hpp
│   │   │   ├── runtime_introspection.inl
│   │   │   └── triple_chevron_launcher.hpp
│   │   ├── cuda_task.hpp
│   │   ├── guarded_cuda_runtime_api.hpp
│   │   ├── head_flags.hpp
│   │   ├── is_contiguous_iterator.hpp
│   │   ├── pointer_traits.hpp
│   │   ├── synchronize.hpp
│   │   ├── tail_flags.hpp
│   │   ├── terminate.hpp
│   │   ├── throw_on_error.hpp
│   │   ├── tuple_meta_transform.hpp
│   │   └── tuple_transform.hpp
│   ├── execution_policy.hpp
│   ├── future.hpp
│   ├── iterator.hpp
│   ├── iterator
│   │   └── strided_iterator.hpp
│   ├── malloc.hpp
│   └── uninitialized.hpp
├── decomposition.hpp
├── for_each.cu
├── futures.cu
├── head_flags.hpp
├── hello_world.cu
├── join_iterator.hpp
├── merge.cu
├── merge_sort_by_key.cu
├── ping_pong.cu
├── reduce.cu
├── reduce_by_key.cu
├── reduce_intervals.hpp
├── saxpy.cu
├── scan.cu
├── sum.cu
├── tail_flags.hpp
└── time_invocation_cuda.hpp

/README.md:
--------------------------------------------------------------------------------
1 | bulk
2 | ==========
3 | 
4 | We have a lot of parallel work to do, but all we have are these puny threads. Let's Bulk up!
5 | 
6 | Bulk lets you describe a parallel task as a hierarchical grouping of *execution
7 | agents*. Individually, these agents are like tiny, lightweight threads, but
8 | when grouped together they pack some serious muscle.
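To make "grouped together" concrete before the full examples below, here is a minimal sketch of agents cooperating over a single range. It uses `this_exec.index()` exactly as the examples below do; `g.size()` is assumed to be available on `bulk::parallel_group<>`, mirroring its use on groups elsewhere in the library, and the striding loop is illustrative rather than a library feature:

```
#include <bulk/bulk.hpp>
#include <cstdio>

// each agent strides through the range, so the group as a whole covers it
struct claim_elements
{
  __host__ __device__
  void operator()(bulk::parallel_group<> &g, int n)
  {
    for(int i = g.this_exec.index(); i < n; i += g.size())
    {
      printf("agent %d of %d claims element %d\n",
             (int)g.this_exec.index(), (int)g.size(), i);
    }
  }
};

int main()
{
  // 4 agents cooperatively walk over 10 elements
  bulk::async(bulk::par(4), claim_elements(), bulk::root, 10);

  cudaDeviceSynchronize();

  return 0;
}
```

Each agent visits every `g.size()`-th element, so four agents cover the ten elements between them.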
9 | 
10 | We can launch parallel groups of agents with `bulk::async`:
11 | 
12 | `hello_world.cu`:
13 | 
14 | ```
15 | #include <bulk/bulk.hpp>
16 | #include <cstdio>
17 | #include <iostream>
18 | 
19 | struct hello
20 | {
21 |   __host__ __device__
22 |   void operator()()
23 |   {
24 |     printf("Hello world!\n");
25 |   }
26 | 
27 |   __host__ __device__
28 |   void operator()(bulk::parallel_group<> &g)
29 |   {
30 |     printf("Hello world from thread %d\n", g.this_exec.index());
31 |   }
32 | };
33 | 
34 | int main()
35 | {
36 |   // just launch one agent to say hello
37 |   bulk::async(bulk::par(1), hello());
38 | 
39 |   // launch 32 agents in parallel
40 |   // bulk::root stands in for the root of the agent hierarchy
41 |   // the hello functor uses this to identify each agent within its group
42 |   bulk::async(bulk::par(32), hello(), bulk::root);
43 | 
44 |   cudaDeviceSynchronize();
45 | 
46 |   return 0;
47 | }
48 | ```
49 | 
50 | From here it's a trivial exercise to get to SAXPY:
51 | 
52 | ```
53 | #include <bulk/bulk.hpp>
54 | #include <thrust/device_vector.h>
55 | #include <iostream>
56 | #include <cassert>
57 | 
58 | struct saxpy
59 | {
60 |   __host__ __device__
61 |   void operator()(bulk::agent<> &self, float a, float *x, float *y)
62 |   {
63 |     int i = self.index();
64 |     y[i] = a * x[i] + y[i];
65 |   }
66 | };
67 | 
68 | int main()
69 | {
70 |   size_t n = 1 << 24;
71 |   thrust::device_vector<float> x(n, 1);
72 |   thrust::device_vector<float> y(n, 1);
73 | 
74 |   float a = 13;
75 | 
76 |   // pass bulk::root.this_exec so the saxpy functor receives
77 |   // the current execution agent directly
78 |   bulk::async(bulk::par(n), saxpy(), bulk::root.this_exec, a, thrust::raw_pointer_cast(x.data()), thrust::raw_pointer_cast(y.data()));
79 | 
80 |   assert(y == thrust::device_vector<float>(n, 14));
81 | 
82 |   std::cout << "Nice SAXPY. Do you work out?" << std::endl;
83 | 
84 |   return 0;
85 | }
86 | ```
87 | 
88 | Algorithms built with Bulk are fast.
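Both examples above synchronize with `cudaDeviceSynchronize()`. Note that `bulk::async` also returns a `bulk::future<void>` (declared in `bulk/async.hpp`; see also `futures.cu`), so an individual launch can be waited on directly. A minimal sketch, assuming the future exposes a blocking `wait()` member:

```
#include <bulk/bulk.hpp>
#include <cstdio>

struct hello
{
  __host__ __device__
  void operator()()
  {
    printf("Hello from an awaited launch!\n");
  }
};

int main()
{
  // capture the future instead of synchronizing the whole device
  bulk::future<void> f = bulk::async(bulk::par(1), hello());

  // wait() is assumed here; it blocks until the launched group has finished
  f.wait();

  return 0;
}
```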
89 | 90 | [`reduce`](reduce.cu) Performance 91 | --------------------- 92 | 93 | ![][32b_float_reduce] 94 | ![][64b_float_reduce] 95 | 96 | [32b_float_reduce]: https://docs.google.com/spreadsheet/oimg?key=0Aj9b9uhQ9hZUdGVQazRVcGxIZGt2TjFybFNpR1hJQmc&oid=2&zx=5u68essty3v7 97 | [64b_float_reduce]: https://docs.google.com/spreadsheet/oimg?key=0Aj9b9uhQ9hZUdGVQazRVcGxIZGt2TjFybFNpR1hJQmc&oid=3&zx=kx4rsyamnhnj 98 | 99 | [`inclusive_scan`](scan.cu) Performance 100 | ---------------------------- 101 | 102 | ![][32b_float_scan] 103 | ![][64b_float_scan] 104 | 105 | [32b_float_scan]: https://docs.google.com/spreadsheet/oimg?key=0Aj9b9uhQ9hZUdGR4cXU4ekdPeXFTOTBTUG9NUDh3OWc&oid=2&zx=5ji93q18pi8m 106 | [64b_float_scan]: https://docs.google.com/spreadsheet/oimg?key=0Aj9b9uhQ9hZUdGR4cXU4ekdPeXFTOTBTUG9NUDh3OWc&oid=3&zx=ftlaacipyq13 107 | 108 | [`merge`](merge.cu) Performance 109 | ------------------- 110 | 111 | ![][32b_float_merge] 112 | ![][64b_float_merge] 113 | 114 | [32b_float_merge]: https://docs.google.com/spreadsheet/oimg?key=0Aj9b9uhQ9hZUdDE4cm9tTXJWS0RsOTYtNklZSWcxdFE&oid=4&zx=l6i8z7pk97nu 115 | [64b_float_merge]: https://docs.google.com/spreadsheet/oimg?key=0Aj9b9uhQ9hZUdDE4cm9tTXJWS0RsOTYtNklZSWcxdFE&oid=5&zx=c8b2ujje3wql 116 | 117 | [`reduce_by_key`](reduce_by_key.cu) Performance 118 | --------------------------- 119 | 120 | ![][32b_float_reduce_by_key] 121 | ![][64b_float_reduce_by_key] 122 | 123 | [32b_float_reduce_by_key]: https://docs.google.com/spreadsheet/oimg?key=0Aj9b9uhQ9hZUdDlYWDVhTDZiZXJvYUV6TlF5MUpNSXc&oid=2&zx=4vck6bwpyh52 124 | [64b_float_reduce_by_key]: https://docs.google.com/spreadsheet/oimg?key=0Aj9b9uhQ9hZUdDlYWDVhTDZiZXJvYUV6TlF5MUpNSXc&oid=3&zx=t72yxc8mvorj 125 | -------------------------------------------------------------------------------- /async_reduce.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | struct reduce_kernel 10 | { 11 | template 12 | __device__ void operator()(volatile bool *wait_for_me, Iterator first, Iterator last, Pointer result) 13 | { 14 | while(!*wait_for_me) 15 | { 16 | printf("waiting...\n"); 17 | } 18 | 19 | *result = thrust::reduce(thrust::device, first, last); 20 | } 21 | }; 22 | 23 | 24 | struct greenlight 25 | { 26 | __device__ void operator()(bool *set_me) 27 | { 28 | *set_me = true; 29 | } 30 | }; 31 | 32 | 33 | int main() 34 | { 35 | cudaStream_t s1,s2; 36 | cudaStreamCreate(&s1); 37 | cudaStreamCreate(&s2); 38 | 39 | using bulk::par; 40 | using bulk::async; 41 | 42 | thrust::device_vector vec(1 << 20); 43 | thrust::sequence(vec.begin(), vec.end()); 44 | 45 | thrust::device_vector result(1); 46 | thrust::device_vector flag(1); 47 | 48 | // note we launch the reduction before the greenlight 49 | async(par(s1,1), reduce_kernel(), thrust::raw_pointer_cast(flag.data()), vec.begin(), vec.end(), result.begin()); 50 | 51 | async(par(s2,1), greenlight(), thrust::raw_pointer_cast(flag.data())); 52 | 53 | cudaStreamDestroy(s1); 54 | cudaStreamDestroy(s2); 55 | 56 | std::cout << "result: " << thrust::reduce(vec.begin(), vec.end()) << std::endl; 57 | std::cout << "asynchronous result: " << result[0] << std::endl; 58 | 59 | assert(thrust::reduce(vec.begin(), vec.end()) == result[0]); 60 | 61 | return 0; 62 | } 63 | 64 | -------------------------------------------------------------------------------- /bulk/algorithm.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 
2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | -------------------------------------------------------------------------------- /bulk/algorithm/accumulate.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | BULK_NAMESPACE_PREFIX 26 | namespace bulk 27 | { 28 | 29 | 30 | template 35 | __forceinline__ __device__ 36 | T accumulate(const bounded > &exec, 37 | RandomAccessIterator first, 38 | RandomAccessIterator last, 39 | T init, 40 | BinaryFunction binary_op) 41 | { 42 | typedef typename bounded >::size_type size_type; 43 | 44 | size_type n = last - first; 45 | 46 | for(size_type i = 0; i < exec.bound(); ++i) 47 | { 48 | if(i < n) 49 | { 50 | init = binary_op(init, first[i]); 51 | } // end if 52 | } // end for i 53 | 54 | return init; 55 | } // end accumulate() 56 | 57 | 58 | namespace detail 59 | { 60 | namespace accumulate_detail 61 | { 62 | 63 | 64 | // XXX this implementation is simply an inplace inclusive scan 65 | // we could potentially do better with an implementation which uses Sean's bitfield reverse trick 66 | template 67 | __device__ T destructive_accumulate_n(ConcurrentGroup &g, RandomAccessIterator first, Size n, T init, BinaryFunction binary_op) 68 | { 69 | typedef typename ConcurrentGroup::size_type size_type; 70 | 71 | size_type tid = g.this_exec.index(); 72 | 73 | T x = init; 74 | if(tid < n) 75 | { 76 | x = first[tid]; 77 | } 78 | 79 | g.wait(); 80 | 81 | for(size_type offset = 1; offset < g.size(); offset += offset) 82 | { 83 | if(tid >= offset && tid - offset < n) 84 | { 85 | x = binary_op(first[tid - offset], x); 86 | } 87 | 88 | g.wait(); 89 | 90 | if(tid < n) 91 | { 92 | first[tid] = x; 93 | } 94 | 95 | g.wait(); 96 | } 97 | 98 | T result = binary_op(init, first[n - 1]); 99 | 100 | g.wait(); 101 | 102 | return result; 103 | } 104 | 105 | 106 | template 107 | struct buffer 108 | { 109 | typedef typename thrust::iterator_value::type value_type; 110 | 111 | union 112 | { 113 | uninitialized_array inputs; 114 | 
uninitialized_array sums; 115 | }; // end union 116 | }; // end buffer 117 | 118 | 119 | template 120 | __device__ 121 | T accumulate(bulk::concurrent_group,groupsize> &g, 122 | RandomAccessIterator first, 123 | RandomAccessIterator last, 124 | T init, 125 | BinaryFunction binary_op) 126 | { 127 | typedef typename bulk::concurrent_group,groupsize>::size_type size_type; 128 | 129 | const size_type elements_per_group = groupsize * grainsize; 130 | 131 | size_type tid = g.this_exec.index(); 132 | 133 | T sum = init; 134 | 135 | typename thrust::iterator_difference::type n = last - first; 136 | 137 | typedef detail::accumulate_detail::buffer< 138 | groupsize, 139 | grainsize, 140 | RandomAccessIterator, 141 | T 142 | > buffer_type; 143 | 144 | #if __CUDA_ARCH__ >= 200 145 | buffer_type *buffer = reinterpret_cast(bulk::malloc(g, sizeof(buffer_type))); 146 | #else 147 | __shared__ uninitialized buffer_impl; 148 | buffer_type *buffer = &buffer_impl.get(); 149 | #endif 150 | 151 | for(; first < last; first += elements_per_group) 152 | { 153 | // XXX each iteration is essentially a bounded accumulate 154 | 155 | size_type partition_size = thrust::min(elements_per_group, last - first); 156 | 157 | // copy partition into smem 158 | bulk::copy_n(g, first, partition_size, buffer->inputs.data()); 159 | 160 | T this_sum; 161 | size_type local_offset = grainsize * g.this_exec.index(); 162 | 163 | size_type local_size = thrust::max(0,thrust::min(grainsize, partition_size - grainsize * tid)); 164 | 165 | if(local_size) 166 | { 167 | this_sum = buffer->inputs[local_offset]; 168 | this_sum = bulk::accumulate(bound(g.this_exec), 169 | buffer->inputs.data() + local_offset + 1, 170 | buffer->inputs.data() + local_offset + local_size, 171 | this_sum, 172 | binary_op); 173 | } // end if 174 | 175 | g.wait(); 176 | 177 | if(local_size) 178 | { 179 | buffer->sums[tid] = this_sum; 180 | } // end if 181 | 182 | g.wait(); 183 | 184 | // sum over the group 185 | sum = accumulate_detail::destructive_accumulate_n(g, buffer->sums.data(), thrust::min(groupsize,n), sum, binary_op); 186 | } // end for 187 | 188 | #if __CUDA_ARCH__ >= 200 189 | bulk::free(g, buffer); 190 | #endif 191 | 192 | return sum; 193 | } // end accumulate 194 | } // end accumulate_detail 195 | } // end detail 196 | 197 | 198 | template 199 | __device__ 200 | T accumulate(bulk::concurrent_group, groupsize> &g, 201 | RandomAccessIterator first, 202 | RandomAccessIterator last, 203 | T init, 204 | BinaryFunction binary_op) 205 | { 206 | // use reduce when the operator is commutative 207 | if(thrust::detail::is_commutative::value) 208 | { 209 | init = bulk::reduce(g, first, last, init, binary_op); 210 | } // end if 211 | else 212 | { 213 | init = detail::accumulate_detail::accumulate(g, first, last, init, binary_op); 214 | } // end else 215 | 216 | return init; 217 | } // end accumulate() 218 | 219 | 220 | } // end bulk 221 | BULK_NAMESPACE_SUFFIX 222 | 223 | -------------------------------------------------------------------------------- /bulk/algorithm/adjacent_difference.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | BULK_NAMESPACE_PREFIX 23 | namespace bulk 24 | { 25 | 26 | 27 | template 32 | __device__ 33 | RandomAccessIterator2 adjacent_difference(bulk::agent &exec, 34 | RandomAccessIterator1 first, RandomAccessIterator1 last, 35 | RandomAccessIterator2 result, 36 | T init, 37 | BinaryOperation binary_op) 38 | { 39 | for(; first != last; ++first, ++result) 40 | { 41 | T temp = *first; 42 | *result = binary_op(temp, init); 43 | init = temp; 44 | } // end result 45 | 46 | return result; 47 | } // end adjacent_difference() 48 | 49 | 50 | template 56 | __device__ 57 | RandomAccessIterator2 adjacent_difference(bulk::concurrent_group,groupsize> &g, 58 | RandomAccessIterator1 first, RandomAccessIterator1 last, 59 | RandomAccessIterator2 result, 60 | T init, 61 | BinaryOperation binary_op) 62 | { 63 | // XXX this implementation allows first to be equal to result 64 | // when the input and output do not overlap, we can avoid the need for next_init 65 | // and the barriers 66 | 67 | typedef typename bulk::concurrent_group,groupsize>::size_type size_type; 68 | 69 | RandomAccessIterator2 return_me = result + (last - first); 70 | 71 | const size_type grainsize = g.this_exec.grainsize(); 72 | const size_type tile_size = g.size() * grainsize; 73 | 74 | // set the first iteration's init 75 | RandomAccessIterator1 first_init = first + grainsize * g.this_exec.index() - 1; 76 | if(first <= first_init && first_init < last) 77 | { 78 | init = *first_init; 79 | } 80 | 81 | g.wait(); 82 | 83 | for(; first < last; first += tile_size, result += tile_size) 84 | { 85 | size_type local_offset = grainsize * g.this_exec.index(); 86 | size_type local_size = thrust::max(0, thrust::min(grainsize, last - (first + local_offset))); 87 | 88 | // get the init for the next iteration 89 | T next_init = (first + local_offset + tile_size - 1 < last) ? 
first[tile_size-1] : init; 90 | 91 | g.wait(); 92 | 93 | // consume grainsize elements 94 | bulk::adjacent_difference(g.this_exec, 95 | first + local_offset, 96 | first + local_offset + local_size, 97 | result + local_offset, 98 | init, 99 | binary_op); 100 | 101 | init = next_init; 102 | } 103 | 104 | g.wait(); 105 | 106 | return return_me; 107 | } // end adjacent_difference() 108 | 109 | 110 | template 115 | __device__ 116 | RandomAccessIterator2 adjacent_difference(bulk::concurrent_group,groupsize> &g, 117 | RandomAccessIterator1 first, RandomAccessIterator1 last, 118 | RandomAccessIterator2 result, 119 | BinaryOperation binary_op) 120 | { 121 | if(first < last) 122 | { 123 | typename thrust::iterator_value::type init = *first; 124 | 125 | // we need to wait because first may be the same as result 126 | g.wait(); 127 | 128 | if(g.this_exec.index() == 0) 129 | { 130 | *result = init; 131 | } 132 | 133 | result = bulk::adjacent_difference(g, first + 1, last, result + 1, init, binary_op); 134 | } // end if 135 | 136 | return result; 137 | } // end adjacent_difference() 138 | 139 | 140 | } // end bulk 141 | BULK_NAMESPACE_SUFFIX 142 | 143 | -------------------------------------------------------------------------------- /bulk/algorithm/copy.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | 26 | BULK_NAMESPACE_PREFIX 27 | namespace bulk 28 | { 29 | 30 | 31 | template 36 | __forceinline__ __device__ 37 | RandomAccessIterator2 copy_n(const bounded > &b, 38 | RandomAccessIterator1 first, 39 | Size n, 40 | RandomAccessIterator2 result) 41 | { 42 | typedef typename bounded >::size_type size_type; 43 | 44 | if(bound <= n) 45 | { 46 | for(size_type i = 0; i < b.bound(); ++i, ++result, ++first) 47 | { 48 | *result = *first; 49 | } // end for i 50 | } // end if 51 | else 52 | { 53 | for(size_type i = 0; i < b.bound(); ++i, ++first) 54 | { 55 | if(i < n) 56 | { 57 | *result = *first; 58 | ++result; 59 | } // end if 60 | } // end for i 61 | } // end else 62 | 63 | return result; 64 | } // end copy_n() 65 | 66 | 67 | 68 | namespace detail 69 | { 70 | 71 | 72 | template 76 | __forceinline__ __device__ 77 | RandomAccessIterator2 simple_copy_n(ConcurrentGroup &g, RandomAccessIterator1 first, Size n, RandomAccessIterator2 result) 78 | { 79 | for(Size i = g.this_exec.index(); 80 | i < n; 81 | i += g.size()) 82 | { 83 | result[i] = first[i]; 84 | } // end for i 85 | 86 | g.wait(); 87 | 88 | return result + n; 89 | } // end simple_copy_n() 90 | 91 | 92 | template 97 | __forceinline__ __device__ 98 | typename thrust::detail::enable_if< 99 | (size * grainsize > 0), 100 | RandomAccessIterator2 101 | >::type 102 | simple_copy_n(bulk::concurrent_group< 103 | agent, 104 | size 105 | > &g, 106 | RandomAccessIterator1 first, Size n, 107 | RandomAccessIterator2 result) 108 | { 109 | typedef bulk::concurrent_group< 110 | agent, 111 | size 112 | > group_type; 113 | 114 | RandomAccessIterator2 return_me = result + n; 115 | 116 | typedef typename group_type::size_type size_type; 117 | size_type chunk_size = size * grainsize; 118 | 119 | size_type tid = g.this_exec.index(); 120 | 121 | // important special case which avoids the expensive for loop below 122 | if(chunk_size == n) 123 | { 124 | // offset iterators by tid before loop 125 | first += tid; 126 | result += tid; 127 | 128 | for(size_type i = 0; i < grainsize; ++i, first += size, result += size) 129 | { 130 | *result = *first; 131 | } // end for 132 | } // end if 133 | else 134 | { 135 | // XXX i have a feeling the indexing could be rewritten to require less arithmetic 136 | for(RandomAccessIterator1 last = first + n; 137 | first < last; 138 | first += chunk_size, result += chunk_size) 139 | { 140 | // avoid conditional accesses when possible 141 | if((last - first) >= chunk_size) 142 | { 143 | for(size_type i = 0; i < grainsize; ++i) 144 | { 145 | size_type idx = size * i + tid; 146 | result[idx] = first[idx]; 147 | } // end for 148 | } // end if 149 | else 150 | { 151 | for(size_type i = 0; i < grainsize; ++i) 152 | { 153 | size_type idx = size * i + tid; 154 | if(idx < (last - first)) 155 | { 156 | result[idx] = first[idx]; 157 | } // end if 158 | } // end for 159 | } // end else 160 | } // end for 161 | } // end else 162 | 163 | g.wait(); 164 | 165 | return return_me; 166 | } // end simple_copy_n() 167 | 168 | 169 | template 174 | __forceinline__ __device__ 175 | RandomAccessIterator2 copy_n(concurrent_group< 176 | agent, 177 | size 178 | > &g, 179 | RandomAccessIterator1 first, 180 | Size n, 181 | RandomAccessIterator2 result) 182 | { 183 | return detail::simple_copy_n(g, first, n, result); 184 | } // end copy_n() 185 | 186 | 187 | } // end detail 188 | 189 | 190 | template 195 | __forceinline__ __device__ 196 | 
RandomAccessIterator2 197 | copy_n(bulk::concurrent_group &g, RandomAccessIterator1 first, Size n, RandomAccessIterator2 result) 198 | { 199 | return detail::copy_n(g, first, n, result); 200 | } // end copy_n() 201 | 202 | 203 | template 204 | __device__ 205 | typename thrust::detail::enable_if< 206 | (bound <= groupsize * grainsize), 207 | RandomAccessIterator2 208 | >::type 209 | copy_n(bulk::bounded< 210 | bound, 211 | concurrent_group< 212 | agent, 213 | groupsize 214 | > 215 | > &g, 216 | RandomAccessIterator1 first, 217 | Size n, 218 | RandomAccessIterator2 result) 219 | { 220 | typedef bounded< 221 | bound, 222 | concurrent_group< 223 | agent, 224 | groupsize 225 | > 226 | > group_type; 227 | 228 | typedef typename group_type::size_type size_type; 229 | 230 | size_type tid = g.this_exec.index(); 231 | 232 | typedef typename thrust::iterator_value::type value_type; 233 | 234 | // XXX make this an uninitialized array 235 | value_type stage[grainsize]; 236 | 237 | // avoid conditional accesses when possible 238 | if(groupsize * grainsize <= n) 239 | { 240 | for(size_type i = 0; i < grainsize; ++i) 241 | { 242 | size_type src_idx = g.size() * i + tid; 243 | stage[i] = first[src_idx]; 244 | } // end for i 245 | 246 | for(size_type i = 0; i < grainsize; ++i) 247 | { 248 | size_type dst_idx = g.size() * i + tid; 249 | result[dst_idx] = stage[i]; 250 | } // end for i 251 | } // end if 252 | else 253 | { 254 | for(size_type i = 0; i < grainsize; ++i) 255 | { 256 | size_type src_idx = g.size() * i + tid; 257 | if(src_idx < n) 258 | { 259 | stage[i] = first[src_idx]; 260 | } // end if 261 | } // end for 262 | 263 | for(size_type i = 0; i < grainsize; ++i) 264 | { 265 | size_type dst_idx = g.size() * i + tid; 266 | if(dst_idx < n) 267 | { 268 | result[dst_idx] = stage[i]; 269 | } // end if 270 | } // end for 271 | } // end else 272 | 273 | g.wait(); 274 | 275 | return result + thrust::min(g.size() * grainsize, n); 276 | } // end copy_n() 277 | 278 | 279 | } // end bulk 280 | BULK_NAMESPACE_SUFFIX 281 | 282 | -------------------------------------------------------------------------------- /bulk/algorithm/detail/stable_merge_sort.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | BULK_NAMESPACE_PREFIX 29 | namespace bulk 30 | { 31 | 32 | 33 | // XXX forward declaration for inplace_merge_adjacent_partitions below 34 | template 39 | __forceinline__ __device__ 40 | void stable_sort_by_key(const bounded > &exec, 41 | RandomAccessIterator1 keys_first, RandomAccessIterator1 keys_last, 42 | RandomAccessIterator2 values_first, 43 | Compare comp); 44 | 45 | 46 | namespace detail 47 | { 48 | namespace stable_merge_sort_detail 49 | { 50 | 51 | 52 | template 53 | __device__ 54 | typename thrust::detail::enable_if< 55 | bound <= groupsize * grainsize 56 | >::type 57 | inplace_merge_adjacent_partitions(bulk::bounded, groupsize> > &g, 58 | KeyType local_keys[grainsize], ValType local_values[grainsize], void* stage_ptr, int count, int local_size, Compare comp) 59 | { 60 | union stage_t 61 | { 62 | KeyType *keys; 63 | ValType *vals; 64 | }; 65 | 66 | stage_t stage; 67 | stage.keys = reinterpret_cast(stage_ptr); 68 | 69 | typedef typename bulk::agent::size_type size_type; 70 | 71 | size_type local_offset = grainsize * g.this_exec.index(); 72 | 73 | // XXX this loop seems to assume that groupsize is a power of two 74 | // NPOT groupsize crashes merge sort 75 | for(size_type num_agents_per_merge = 2; num_agents_per_merge <= groupsize; num_agents_per_merge *= 2) 76 | { 77 | // copy keys into the stage so we can dynamically index them 78 | bulk::copy_n(bulk::bound(g.this_exec), local_keys, local_size, stage.keys + local_offset); 79 | 80 | g.wait(); 81 | 82 | // find the index of the first array this agent will merge 83 | size_type list = ~(num_agents_per_merge - 1) & g.this_exec.index(); 84 | size_type diag = thrust::min(count, grainsize * ((num_agents_per_merge - 1) & g.this_exec.index())); 85 | size_type start = grainsize * list; 86 | 87 | // the size of each of the two input arrays we're merging 88 | size_type input_size = grainsize * (num_agents_per_merge / 2); 89 | 90 | size_type partition_first1 = thrust::min(count, start); 91 | size_type partition_first2 = thrust::min(count, partition_first1 + input_size); 92 | size_type partition_last2 = thrust::min(count, partition_first2 + input_size); 93 | 94 | size_type n1 = partition_first2 - partition_first1; 95 | size_type n2 = partition_last2 - partition_first2; 96 | 97 | size_type mp = bulk::merge_path(stage.keys + partition_first1, n1, stage.keys + partition_first2, n2, diag, comp); 98 | 99 | // each agent merges sequentially locally 100 | // note the source index of each merged value so that we can gather values into merged order later 101 | size_type gather_indices[grainsize]; 102 | bulk::merge_by_key(bulk::bound(g.this_exec), 103 | stage.keys + partition_first1 + mp, stage.keys + partition_first2, 104 | stage.keys + partition_first2 + diag - mp, stage.keys + partition_last2, 105 | thrust::make_counting_iterator(partition_first1 + mp), 106 | thrust::make_counting_iterator(partition_first2 + diag - mp), 107 | local_keys, 108 | gather_indices, 109 | comp); 110 | 111 | // move values into the stage so we can index them 112 | bulk::copy_n(bulk::bound(g.this_exec), local_values, local_size, stage.vals + local_offset); 113 | 114 | // gather values into registers 115 | bulk::gather(bulk::bound(g.this_exec), gather_indices, gather_indices + local_size, stage.vals, local_values); 116 | 117 | g.wait(); 118 | } // end for 119 | } // end inplace_merge_adjacent_partitions() 120 | 121 | 122 | 
} // end stable_merge_sort_detail 123 | 124 | 125 | template 129 | __device__ 130 | typename thrust::detail::enable_if< 131 | bound <= groupsize * grainsize 132 | >::type 133 | stable_merge_sort_by_key(bulk::bounded,groupsize> > &g, 134 | RandomAccessIterator1 keys_first, RandomAccessIterator1 keys_last, 135 | RandomAccessIterator2 values_first, 136 | Compare comp) 137 | { 138 | typedef typename thrust::iterator_value::type key_type; 139 | typedef typename thrust::iterator_value::type value_type; 140 | 141 | typedef typename bulk::agent::size_type size_type; 142 | 143 | size_type n = keys_last - keys_first; 144 | const size_type tile_size = groupsize * grainsize; 145 | 146 | size_type local_offset = grainsize * g.this_exec.index(); 147 | size_type local_size = thrust::max(0, thrust::min(grainsize, n - local_offset)); 148 | 149 | #if __CUDA_ARCH__ >= 200 150 | union 151 | { 152 | key_type *keys; 153 | value_type *values; 154 | } stage; 155 | 156 | stage.keys = static_cast(bulk::malloc(g, tile_size * thrust::max(sizeof(key_type), sizeof(value_type)))); 157 | #else 158 | __shared__ union 159 | { 160 | key_type keys[tile_size]; 161 | value_type values[tile_size]; 162 | } stage; 163 | #endif 164 | 165 | // load each agent's keys into registers 166 | bulk::copy_n(bulk::bound(g), keys_first, n, stage.keys); 167 | 168 | key_type local_keys[grainsize]; 169 | bulk::copy_n(bulk::bound(g.this_exec), stage.keys + local_offset, local_size, local_keys); 170 | 171 | // load each agent's values into registers 172 | bulk::copy_n(bulk::bound(g), values_first, n, stage.values); 173 | 174 | value_type local_values[grainsize]; 175 | bulk::copy_n(bulk::bound(g.this_exec), stage.values + local_offset, local_size, local_values); 176 | 177 | // each agent sorts its local partition of the array 178 | bulk::stable_sort_by_key(bulk::bound(g.this_exec), local_keys, local_keys + local_size, local_values, comp); 179 | 180 | // merge adjacent partitions together 181 | // avoid dynamic sizes when possible 182 | if(n == tile_size) 183 | { 184 | stable_merge_sort_detail::inplace_merge_adjacent_partitions(g, local_keys, local_values, stage.keys, tile_size, grainsize, comp); 185 | } // end if 186 | else 187 | { 188 | stable_merge_sort_detail::inplace_merge_adjacent_partitions(g, local_keys, local_values, stage.keys, n, local_size, comp); 189 | } // end else 190 | 191 | // store the sorted keys back to the input 192 | bulk::copy_n(bulk::bound(g.this_exec), local_keys, local_size, stage.keys + local_offset); 193 | g.wait(); 194 | 195 | bulk::copy_n(bulk::bound(g), stage.keys, n, keys_first); 196 | 197 | // store the sorted values back to the input 198 | bulk::copy_n(bulk::bound(g.this_exec), local_values, local_size, stage.values + local_offset); 199 | g.wait(); 200 | 201 | bulk::copy_n(bulk::bound(g), stage.values, n, values_first); 202 | 203 | #if __CUDA_ARCH__ >= 200 204 | bulk::free(g, stage.keys); 205 | #endif 206 | } // end stable_merge_sort_by_key() 207 | 208 | 209 | } // end detail 210 | } // end bulk 211 | BULK_NAMESPACE_SUFFIX 212 | 213 | -------------------------------------------------------------------------------- /bulk/algorithm/for_each.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | 23 | BULK_NAMESPACE_PREFIX 24 | namespace bulk 25 | { 26 | 27 | 28 | template 32 | __device__ 33 | RandomAccessIterator for_each_n(ExecutionGroup &g, RandomAccessIterator first, Size n, Function f) 34 | { 35 | for(Size i = g.this_thread.index(); 36 | i < n; 37 | i += g.size()) 38 | { 39 | f(first[i]); 40 | } // end for i 41 | 42 | g.wait(); 43 | 44 | return first + n; 45 | } // end for_each() 46 | 47 | 48 | template 53 | __device__ 54 | RandomAccessIterator for_each_n(bounded > &b, 55 | RandomAccessIterator first, 56 | Size n, 57 | Function f) 58 | { 59 | typedef typename bounded >::size_type size_type; 60 | 61 | for(size_type i = 0; i < bound; ++i) 62 | { 63 | if(i < n) 64 | { 65 | f(first[i]); 66 | } // end if 67 | } // end for i 68 | 69 | return first + n; 70 | } // end for_each_n() 71 | 72 | 73 | } // end bulk 74 | BULK_NAMESPACE_SUFFIX 75 | 76 | -------------------------------------------------------------------------------- /bulk/algorithm/gather.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | 25 | BULK_NAMESPACE_PREFIX 26 | namespace bulk 27 | { 28 | 29 | 30 | // XXX eliminate me! 
31 | template 36 | __forceinline__ __device__ 37 | RandomAccessIterator3 gather(const bounded > &, 38 | RandomAccessIterator1 map_first, 39 | RandomAccessIterator1 map_last, 40 | RandomAccessIterator2 input_first, 41 | RandomAccessIterator3 result) 42 | { 43 | typedef typename bulk::bounded >::size_type size_type; 44 | 45 | size_type n = map_last - map_first; 46 | 47 | if(bound <= n) 48 | { 49 | for(size_type i = 0; i < bound; ++i) 50 | { 51 | result[i] = input_first[map_first[i]]; 52 | } 53 | } 54 | else 55 | { 56 | for(size_type i = 0; i < bound; ++i) 57 | { 58 | if(i < n) 59 | { 60 | result[i] = input_first[map_first[i]]; 61 | } 62 | } 63 | } 64 | 65 | return result + n; 66 | } // end scatter_if() 67 | 68 | 69 | template 70 | __forceinline__ __device__ 71 | RandomAccessIterator3 gather(ExecutionGroup &g, 72 | RandomAccessIterator1 map_first, 73 | RandomAccessIterator1 map_last, 74 | RandomAccessIterator2 input_first, 75 | RandomAccessIterator3 result) 76 | { 77 | return bulk::copy_n(g, 78 | thrust::make_permutation_iterator(input_first, map_first), 79 | map_last - map_first, 80 | result); 81 | } // end gather() 82 | 83 | 84 | } // end bulk 85 | BULK_NAMESPACE_SUFFIX 86 | 87 | -------------------------------------------------------------------------------- /bulk/algorithm/reduce.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | 28 | BULK_NAMESPACE_PREFIX 29 | namespace bulk 30 | { 31 | 32 | 33 | template 38 | __forceinline__ __device__ 39 | T reduce(const bulk::bounded > &exec, 40 | RandomAccessIterator first, 41 | RandomAccessIterator last, 42 | T init, 43 | BinaryFunction binary_op) 44 | { 45 | typedef typename bulk::bounded >::size_type size_type; 46 | 47 | size_type n = last - first; 48 | 49 | for(size_type i = 0; i < exec.bound(); ++i) 50 | { 51 | if(i < n) 52 | { 53 | init = binary_op(init, first[i]); 54 | } // end if 55 | } // end for i 56 | 57 | return init; 58 | } // end reduce() 59 | 60 | 61 | namespace detail 62 | { 63 | namespace reduce_detail 64 | { 65 | 66 | 67 | template 68 | __device__ T destructive_reduce_n(ConcurrentGroup &g, RandomAccessIterator first, Size n, T init, BinaryFunction binary_op) 69 | { 70 | typedef int size_type; 71 | 72 | size_type tid = g.this_exec.index(); 73 | 74 | Size m = n; 75 | 76 | while(m > 1) 77 | { 78 | Size half_m = m >> 1; 79 | 80 | if(tid < half_m) 81 | { 82 | T old_val = first[tid]; 83 | 84 | first[tid] = binary_op(old_val, first[m - tid - 1]); 85 | } // end if 86 | 87 | g.wait(); 88 | 89 | m -= half_m; 90 | } // end while 91 | 92 | g.wait(); 93 | 94 | T result = init; 95 | if(n > 0) 96 | { 97 | result = binary_op(result,first[0]); 98 | } // end if 99 | 100 | g.wait(); 101 | 102 | return result; 103 | } // end destructive_reduce_n() 104 | 105 | 106 | } // end reduce_detail 107 | } // end detail 108 | 109 | 110 | template 111 | __device__ 112 | T reduce(bulk::concurrent_group,groupsize> &g, 113 | RandomAccessIterator first, 114 | RandomAccessIterator last, 115 | T init, 116 | BinaryFunction binary_op) 117 | { 118 | typedef int size_type; 119 | 120 | const size_type elements_per_group = groupsize * grainsize; 121 | 122 | size_type tid = g.this_exec.index(); 123 | 124 | T this_sum; 125 | 126 | bool this_sum_defined = false; 127 | 128 | size_type n = last - first; 129 | 130 | // XXX we use offset as the loop counter variable instead of first 131 | // because elements_per_group can actually overflow some kinds of iterators 132 | // with small difference_types 133 | for(size_type offset = 0; offset < n; first += elements_per_group, offset += elements_per_group) 134 | { 135 | size_type partition_size = thrust::min(elements_per_group, last - first); 136 | 137 | typedef typename thrust::iterator_value::type input_type; 138 | 139 | // load input into register 140 | input_type local_inputs[grainsize]; 141 | 142 | // each agent strides through the input range 143 | // and copies into a local array 144 | strided_iterator local_first = make_strided_iterator(first + tid, static_cast(groupsize)); 145 | 146 | // XXX if we could precompute local_size for the else branch, 147 | // we could just call copy_n here 148 | // we can't precompute it (without a divide afaik), so we compute local_size in the else branch 149 | size_type local_size = 0; 150 | if(partition_size < elements_per_group) 151 | { 152 | // XXX i guess nvcc miscompiles this loop for counting_iterators 153 | // size_type index = tid; 154 | // for(size_type i = 0; i < grainsize; ++i, ++local_first, index += groupsize) 155 | // { 156 | // if(index < partition_size) 157 | // { 158 | // local_inputs[i] = *local_first; 159 | // ++local_size; 160 | // } // end if 161 | // } // end for 162 | // 163 | RandomAccessIterator iter = local_first.base(); 164 | size_type index = tid; 165 | 
for(size_type i = 0; i < grainsize; ++i, index += groupsize, iter += groupsize) 166 | { 167 | if(index < partition_size) 168 | { 169 | local_inputs[i] = *iter; 170 | ++local_size; 171 | } // end if 172 | } // end for 173 | } // end if 174 | else 175 | { 176 | local_size = grainsize; 177 | // XXX nvcc 6.5 RC miscompiles this loop when RandomAccessIterator is a counting_iterator 178 | // bulk::copy_n(bulk::bound(g.this_exec), 179 | // local_first, 180 | // local_size, 181 | // local_inputs); 182 | RandomAccessIterator iter = local_first.base(); 183 | for(size_type i = 0; i < grainsize; ++i, iter += groupsize) 184 | { 185 | local_inputs[i] = *iter; 186 | } // end for 187 | } // end else 188 | 189 | // reduce local_inputs sequentially 190 | this_sum = this_sum_defined ? 191 | bulk::reduce(bulk::bound(g.this_exec), local_inputs, local_inputs + local_size, this_sum, binary_op) : 192 | bulk::reduce(bulk::bound(g.this_exec), local_inputs + 1, local_inputs + local_size, T(local_inputs[0]), binary_op); 193 | 194 | this_sum_defined = true; 195 | } // end for 196 | 197 | #if __CUDA_ARCH__ >= 200 198 | T *buffer = reinterpret_cast(bulk::malloc(g, groupsize * sizeof(T))); 199 | #else 200 | __shared__ bulk::uninitialized_array buffer_impl; 201 | T *buffer = buffer_impl.data(); 202 | #endif 203 | 204 | if(this_sum_defined) 205 | { 206 | buffer[tid] = this_sum; 207 | } // end if 208 | 209 | g.wait(); 210 | 211 | // reduce across the group 212 | T result = bulk::detail::reduce_detail::destructive_reduce_n(g, buffer, thrust::min(groupsize,n), init, binary_op); 213 | 214 | #if __CUDA_ARCH__ >= 200 215 | bulk::free(g,buffer); 216 | #endif 217 | 218 | return result; 219 | } // end reduce 220 | 221 | 222 | template 223 | __device__ 224 | T reduce(bulk::concurrent_group<> &g, 225 | RandomAccessIterator first, 226 | RandomAccessIterator last, 227 | T init, 228 | BinaryFunction binary_op) 229 | { 230 | typedef int size_type; 231 | 232 | size_type tid = g.this_exec.index(); 233 | 234 | T this_sum; 235 | 236 | bool this_sum_defined = false; 237 | 238 | typename thrust::iterator_difference::type n = last - first; 239 | 240 | T *buffer = reinterpret_cast(bulk::malloc(g, g.size() * sizeof(T))); 241 | 242 | for(size_type i = tid; i < n; i += g.size()) 243 | { 244 | typedef typename thrust::iterator_value::type input_type; 245 | input_type x = first[i]; 246 | this_sum = this_sum_defined ? binary_op(this_sum, x) : x; 247 | 248 | this_sum_defined = true; 249 | } 250 | 251 | if(this_sum_defined) 252 | { 253 | buffer[tid] = this_sum; 254 | } // end if 255 | 256 | g.wait(); 257 | 258 | // reduce across the block 259 | T result = detail::reduce_detail::destructive_reduce_n(g, buffer, thrust::min(g.size(),n), init, binary_op); 260 | 261 | bulk::free(g,buffer); 262 | 263 | return result; 264 | } // end reduce 265 | 266 | 267 | } // end bulk 268 | BULK_NAMESPACE_SUFFIX 269 | 270 | -------------------------------------------------------------------------------- /bulk/algorithm/reduce_by_key.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | 32 | BULK_NAMESPACE_PREFIX 33 | namespace bulk 34 | { 35 | namespace detail 36 | { 37 | namespace reduce_by_key_detail 38 | { 39 | 40 | 41 | template 42 | struct scan_head_flags_functor 43 | { 44 | BinaryFunction binary_op; 45 | 46 | typedef thrust::tuple result_type; 47 | typedef result_type first_argument_type; 48 | typedef result_type second_argument_type; 49 | 50 | __host__ __device__ 51 | scan_head_flags_functor(BinaryFunction binary_op) 52 | : binary_op(binary_op) 53 | {} 54 | 55 | __host__ __device__ 56 | result_type operator()(const first_argument_type &a, const second_argument_type &b) 57 | { 58 | ValueType val = thrust::get<0>(b) ? thrust::get<1>(b) : binary_op(thrust::get<1>(a), thrust::get<1>(b)); 59 | FlagType flag = thrust::get<0>(a) + thrust::get<0>(b); 60 | return result_type(flag, val); 61 | } 62 | }; 63 | 64 | 65 | template 72 | __device__ 73 | void scatter_tails_n(ConcurrentGroup &group, 74 | InputIterator1 flags_first, 75 | Size n, 76 | InputIterator2 keys_first, 77 | InputIterator3 values_first, 78 | OutputIterator1 keys_result, 79 | OutputIterator2 values_result) 80 | { 81 | // for each tail element in [flags_first, flags_first + n) 82 | // scatter the key and value to that element's corresponding flag element - 1 83 | 84 | // the zip_iterators in this scatter_if can confuse nvcc's pointer space tracking for __CUDA_ARCH__ < 200 85 | // separate the scatters for __CUDA_ARCH__ < 200 86 | #if __CUDA_ARCH__ >= 200 87 | bulk::scatter_if(group, 88 | thrust::make_zip_iterator(thrust::make_tuple(values_first, keys_first)), 89 | thrust::make_zip_iterator(thrust::make_tuple(values_first + n - 1, keys_first)), 90 | thrust::make_transform_iterator(flags_first, thrust::placeholders::_1 - 1), 91 | bulk::detail::make_tail_flags(flags_first, flags_first + n).begin(), 92 | thrust::make_zip_iterator(thrust::make_tuple(values_result, keys_result))); 93 | #else 94 | bulk::scatter_if(group, 95 | values_first, 96 | values_first + n - 1, 97 | thrust::make_transform_iterator(flags_first, thrust::placeholders::_1 - 1), 98 | bulk::detail::make_tail_flags(flags_first, flags_first + n).begin(), 99 | values_result); 100 | 101 | bulk::scatter_if(group, 102 | keys_first, 103 | keys_first + n - 1, 104 | thrust::make_transform_iterator(flags_first, thrust::placeholders::_1 - 1), 105 | bulk::detail::make_tail_flags(flags_first, flags_first + n).begin(), 106 | keys_result); 107 | #endif 108 | } // end scatter_tails_n() 109 | 110 | 111 | } // end reduce_by_key_detail 112 | } // end detail 113 | 114 | 115 | template 125 | thrust::tuple< 126 | OutputIterator1, 127 | OutputIterator2, 128 | typename thrust::iterator_value::type, 129 | typename thrust::iterator_value::type 130 | > 131 | __device__ 132 | reduce_by_key(bulk::concurrent_group,groupsize> &g, 133 | InputIterator1 keys_first, InputIterator1 keys_last, 134 | InputIterator2 values_first, 135 | OutputIterator1 
keys_result, 136 | OutputIterator2 values_result, 137 | T1 init_key, 138 | T2 init_value, 139 | BinaryPredicate pred, 140 | BinaryFunction binary_op) 141 | { 142 | typedef typename thrust::iterator_value::type value_type; // XXX this should be the type returned by BinaryFunction 143 | 144 | typedef typename bulk::concurrent_group,groupsize>::size_type size_type; 145 | 146 | const size_type interval_size = groupsize * grainsize; 147 | 148 | #if __CUDA_ARCH__ >= 200 149 | size_type *s_flags = reinterpret_cast(bulk::malloc(g, interval_size * sizeof(int))); 150 | value_type *s_values = reinterpret_cast(bulk::malloc(g, interval_size * sizeof(value_type))); 151 | #else 152 | __shared__ uninitialized_array s_flags_impl; 153 | size_type *s_flags = s_flags_impl.data(); 154 | 155 | __shared__ uninitialized_array s_values_impl; 156 | value_type *s_values = s_values_impl.data(); 157 | #endif 158 | 159 | for(; keys_first < keys_last; keys_first += interval_size, values_first += interval_size) 160 | { 161 | // upper bound on n is interval_size 162 | size_type n = thrust::min(interval_size, keys_last - keys_first); 163 | 164 | bulk::detail::head_flags_with_init< 165 | InputIterator1, 166 | BinaryPredicate, 167 | size_type 168 | > flags(keys_first, keys_first + n, init_key, pred); 169 | 170 | detail::reduce_by_key_detail::scan_head_flags_functor f(binary_op); 171 | 172 | // load input into smem 173 | bulk::copy_n(bulk::bound(g), 174 | thrust::make_zip_iterator(thrust::make_tuple(flags.begin(), values_first)), 175 | n, 176 | thrust::make_zip_iterator(thrust::make_tuple(s_flags, s_values))); 177 | 178 | // scan in smem 179 | bulk::inclusive_scan(bulk::bound(g), 180 | thrust::make_zip_iterator(thrust::make_tuple(s_flags, s_values)), 181 | thrust::make_zip_iterator(thrust::make_tuple(s_flags + n, s_values)), 182 | thrust::make_zip_iterator(thrust::make_tuple(s_flags, s_values)), 183 | thrust::make_tuple(1, init_value), 184 | f); 185 | 186 | // scatter tail results to the output 187 | detail::reduce_by_key_detail::scatter_tails_n(bulk::bound(g), 188 | s_flags, n, 189 | keys_first, s_values, 190 | keys_result, values_result); 191 | 192 | 193 | // if the init was not a carry, we need to insert it at the beginning of the result 194 | if(g.this_exec.index() == 0 && s_flags[0] > 1) 195 | { 196 | keys_result[0] = init_key; 197 | values_result[0] = init_value; 198 | } 199 | 200 | size_type result_size = s_flags[n - 1] - 1; 201 | 202 | keys_result += result_size; 203 | values_result += result_size; 204 | init_key = keys_first[n-1]; 205 | init_value = s_values[n - 1]; 206 | 207 | g.wait(); 208 | } // end for 209 | 210 | #if __CUDA_ARCH__ >= 200 211 | bulk::free(g, s_flags); 212 | bulk::free(g, s_values); 213 | #endif 214 | 215 | return thrust::make_tuple(keys_result, values_result, init_key, init_value); 216 | } // end reduce_by_key() 217 | 218 | 219 | } // end bulk 220 | BULK_NAMESPACE_SUFFIX 221 | 222 | -------------------------------------------------------------------------------- /bulk/algorithm/scatter.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | BULK_NAMESPACE_PREFIX 23 | namespace bulk 24 | { 25 | 26 | 27 | template 33 | __forceinline__ __device__ 34 | void scatter_if(const bounded > &exec, 35 | RandomAccessIterator1 first, 36 | RandomAccessIterator1 last, 37 | RandomAccessIterator2 map, 38 | RandomAccessIterator3 stencil, 39 | RandomAccessIterator4 result) 40 | { 41 | typedef int size_type; 42 | 43 | size_type n = last - first; 44 | 45 | for(size_type i = 0; i < bound; ++i) 46 | { 47 | if(i < n && stencil[i]) 48 | { 49 | result[map[i]] = first[i]; 50 | } // end if 51 | } // end for 52 | } // end scatter_if() 53 | 54 | 55 | template 62 | __device__ 63 | typename thrust::detail::enable_if< 64 | bound <= groupsize * grainsize 65 | >::type 66 | scatter_if(bulk::bounded< 67 | bound, 68 | bulk::concurrent_group,groupsize> 69 | > &g, 70 | RandomAccessIterator1 first, 71 | RandomAccessIterator1 last, 72 | RandomAccessIterator2 map, 73 | RandomAccessIterator3 stencil, 74 | RandomAccessIterator4 result) 75 | { 76 | typedef typename bulk::bounded< 77 | bound, 78 | bulk::concurrent_group,groupsize> 79 | >::size_type size_type; 80 | 81 | size_type n = last - first; 82 | 83 | size_type tid = g.this_exec.index(); 84 | 85 | // avoid branches when possible 86 | if(n == bound) 87 | { 88 | for(size_type i = 0; i < g.this_exec.grainsize(); ++i) 89 | { 90 | size_type idx = g.size() * i + tid; 91 | 92 | if(stencil[idx]) 93 | { 94 | result[map[idx]] = first[idx]; 95 | } // end if 96 | } // end for 97 | } // end if 98 | else if(n < bound) 99 | { 100 | for(size_type i = 0; i < g.this_exec.grainsize(); ++i) 101 | { 102 | size_type idx = g.size() * i + tid; 103 | 104 | if(idx < (last - first) && stencil[idx]) 105 | { 106 | result[map[idx]] = first[idx]; 107 | } // end if 108 | } // end for 109 | } // end if 110 | 111 | g.wait(); 112 | } // end scatter_if() 113 | 114 | 115 | template 121 | __device__ 122 | void scatter_if(bulk::concurrent_group,groupsize> &g, 123 | RandomAccessIterator1 first, 124 | RandomAccessIterator1 last, 125 | RandomAccessIterator2 map, 126 | RandomAccessIterator3 stencil, 127 | RandomAccessIterator4 result) 128 | { 129 | typedef typename bulk::concurrent_group,groupsize>::size_type size_type; 130 | 131 | size_type chunk_size = g.size() * grainsize; 132 | 133 | size_type n = last - first; 134 | 135 | size_type tid = g.this_exec.index(); 136 | 137 | // important special case which avoids the expensive for loop below 138 | if(chunk_size == n) 139 | { 140 | for(size_type i = 0; i < grainsize; ++i) 141 | { 142 | size_type idx = g.size() * i + tid; 143 | 144 | if(stencil[idx]) 145 | { 146 | result[map[idx]] = first[idx]; 147 | } // end if 148 | } // end for 149 | } // end if 150 | else if(n < chunk_size) 151 | { 152 | for(size_type i = 0; i < grainsize; ++i) 153 | { 154 | size_type idx = g.size() * i + tid; 155 | 156 | if(idx < (last - first) && stencil[idx]) 157 | { 158 | result[map[idx]] = first[idx]; 159 | } // end if 160 | } // end for 161 | } // end if 162 | else 163 | { 164 | for(; 165 | first < last; 166 | first += 
chunk_size, map += chunk_size, stencil += chunk_size) 167 | { 168 | if((last - first) >= chunk_size) 169 | { 170 | // avoid conditional accesses when possible 171 | for(size_type i = 0; i < grainsize; ++i) 172 | { 173 | size_type idx = g.size() * i + tid; 174 | 175 | if(stencil[idx]) 176 | { 177 | result[map[idx]] = first[idx]; 178 | } // end if 179 | } // end for 180 | } // end if 181 | else 182 | { 183 | for(size_type i = 0; i < grainsize; ++i) 184 | { 185 | size_type idx = g.size() * i + tid; 186 | 187 | if(idx < (last - first) && stencil[idx]) 188 | { 189 | result[map[idx]] = first[idx]; 190 | } // end if 191 | } // end for 192 | } // end else 193 | } // end for 194 | } // end else 195 | 196 | g.wait(); 197 | } // end scatter_if 198 | 199 | 200 | } // end bulk 201 | BULK_NAMESPACE_SUFFIX 202 | 203 | -------------------------------------------------------------------------------- /bulk/algorithm/sort.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | BULK_NAMESPACE_PREFIX 25 | namespace bulk 26 | { 27 | namespace detail 28 | { 29 | namespace sort_detail 30 | { 31 | 32 | 33 | template 34 | struct stable_odd_even_transpose_sort_by_key_impl 35 | { 36 | template 37 | static __device__ 38 | void sort(RandomAccessIterator1 keys, RandomAccessIterator2 values, int n, Compare comp) 39 | { 40 | for(int j = 1 & i; j < bound - 1; j += 2) 41 | { 42 | if(j + 1 < n && comp(keys[j + 1], keys[j])) 43 | { 44 | using thrust::swap; 45 | 46 | swap(keys[j], keys[j + 1]); 47 | swap(values[j], values[j + 1]); 48 | } 49 | } 50 | 51 | stable_odd_even_transpose_sort_by_key_impl::sort(keys, values, n, comp); 52 | } 53 | }; 54 | 55 | 56 | template struct stable_odd_even_transpose_sort_by_key_impl 57 | { 58 | template 59 | static __device__ void sort(RandomAccessIterator1, RandomAccessIterator2, int, Compare) { } 60 | }; 61 | 62 | 63 | template 68 | __forceinline__ __device__ 69 | void stable_odd_even_transpose_sort_by_key(const bounded > &, 70 | RandomAccessIterator1 keys_first, RandomAccessIterator1 keys_last, 71 | RandomAccessIterator2 values_first, 72 | Compare comp) 73 | { 74 | stable_odd_even_transpose_sort_by_key_impl<0, bound>::sort(keys_first, values_first, keys_last - keys_first, comp); 75 | } // end stable_odd_even_transpose_sort_by_key() 76 | 77 | 78 | template 79 | struct stable_odd_even_transpose_sort_impl 80 | { 81 | template 82 | static __device__ 83 | void sort(RandomAccessIterator keys, int n, Compare comp) 84 | { 85 | for(int j = 1 & i; j < bound - 1; j += 2) 86 | { 87 | if(j + 1 < n && comp(keys[j + 1], keys[j])) 88 | { 89 | using thrust::swap; 90 | 91 | swap(keys[j], keys[j + 1]); 92 | } 93 | } 94 | 95 | stable_odd_even_transpose_sort_impl::sort(keys, n, comp); 96 | } 97 | }; 98 | 99 | 100 | template struct 
stable_odd_even_transpose_sort_impl 101 | { 102 | template 103 | static __device__ void sort(RandomAccessIterator, int, Compare) { } 104 | }; 105 | 106 | 107 | template 111 | __forceinline__ __device__ 112 | void stable_odd_even_transpose_sort(const bounded > &, 113 | RandomAccessIterator first, RandomAccessIterator last, 114 | Compare comp) 115 | { 116 | stable_odd_even_transpose_sort_impl<0, bound>::sort(first, last - first, comp); 117 | } // end stable_odd_even_transpose_sort() 118 | 119 | 120 | } // end sort_detail 121 | } // end detail 122 | 123 | 124 | template 129 | __forceinline__ __device__ 130 | void stable_sort_by_key(const bounded > &exec, 131 | RandomAccessIterator1 keys_first, RandomAccessIterator1 keys_last, 132 | RandomAccessIterator2 values_first, 133 | Compare comp) 134 | { 135 | bulk::detail::sort_detail::stable_odd_even_transpose_sort_by_key(exec, keys_first, keys_last, values_first, comp); 136 | } // end stable_sort_by_key() 137 | 138 | 139 | template 143 | __forceinline__ __device__ 144 | void stable_sort(const bounded > &exec, 145 | RandomAccessIterator first, RandomAccessIterator last, 146 | Compare comp) 147 | { 148 | bulk::detail::sort_detail::stable_odd_even_transpose_sort(exec, first, last, comp); 149 | } // end stable_sort() 150 | 151 | 152 | template 156 | __device__ 157 | typename thrust::detail::enable_if< 158 | bound <= groupsize * grainsize 159 | >::type 160 | stable_sort_by_key(bulk::bounded,groupsize> > &g, 161 | RandomAccessIterator1 keys_first, RandomAccessIterator1 keys_last, 162 | RandomAccessIterator2 values_first, 163 | Compare comp) 164 | { 165 | bulk::detail::stable_merge_sort_by_key(g, keys_first, keys_last, values_first, comp); 166 | } // end stable_sort_by_key() 167 | 168 | 169 | } // end bulk 170 | BULK_NAMESPACE_SUFFIX 171 | 172 | -------------------------------------------------------------------------------- /bulk/async.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | 26 | BULK_NAMESPACE_PREFIX 27 | namespace bulk 28 | { 29 | 30 | 31 | template 32 | __host__ __device__ 33 | future async(ExecutionGroup g, Function f); 34 | 35 | 36 | template 37 | __host__ __device__ 38 | future async(ExecutionGroup g, Function f, Arg1 arg1); 39 | 40 | 41 | template 42 | __host__ __device__ 43 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2); 44 | 45 | 46 | template 47 | __host__ __device__ 48 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3); 49 | 50 | 51 | template 52 | __host__ __device__ 53 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4); 54 | 55 | 56 | template 57 | __host__ __device__ 58 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5); 59 | 60 | 61 | template 62 | __host__ __device__ 63 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6); 64 | 65 | 66 | template 67 | __host__ __device__ 68 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7); 69 | 70 | 71 | template 72 | __host__ __device__ 73 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8); 74 | 75 | 76 | template 77 | __host__ __device__ 78 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9); 79 | 80 | 81 | template 82 | __host__ __device__ 83 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9, Arg10 arg10); 84 | 85 | 86 | } // end bulk 87 | BULK_NAMESPACE_SUFFIX 88 | 89 | #include 90 | 91 | -------------------------------------------------------------------------------- /bulk/bulk.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | -------------------------------------------------------------------------------- /bulk/choose_sizes.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | 24 | BULK_NAMESPACE_PREFIX 25 | namespace bulk 26 | { 27 | 28 | 29 | template 30 | __host__ __device__ 31 | thrust::pair >::size_type, 32 | typename concurrent_group<>::size_type> 33 | choose_sizes(parallel_group > g, Function f); 34 | 35 | 36 | template 37 | __host__ __device__ 38 | thrust::pair >::size_type, 39 | typename concurrent_group<>::size_type> 40 | choose_sizes(parallel_group > g, Function f, Arg1 arg1); 41 | 42 | 43 | template 44 | __host__ __device__ 45 | thrust::pair >::size_type, 46 | typename concurrent_group<>::size_type> 47 | choose_sizes(parallel_group > g, Function f, Arg1 arg1, Arg2 arg2); 48 | 49 | 50 | template 51 | __host__ __device__ 52 | thrust::pair >::size_type, 53 | typename concurrent_group<>::size_type> 54 | choose_sizes(parallel_group > g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3); 55 | 56 | 57 | template 58 | __host__ __device__ 59 | thrust::pair >::size_type, 60 | typename concurrent_group<>::size_type> 61 | choose_sizes(parallel_group > g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4); 62 | 63 | 64 | template 65 | __host__ __device__ 66 | thrust::pair >::size_type, 67 | typename concurrent_group<>::size_type> 68 | choose_sizes(parallel_group > g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5); 69 | 70 | 71 | template 72 | __host__ __device__ 73 | thrust::pair >::size_type, 74 | typename concurrent_group<>::size_type> 75 | choose_sizes(parallel_group > g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6); 76 | 77 | 78 | } // end bulk 79 | BULK_NAMESPACE_SUFFIX 80 | 81 | #include 82 | 83 | -------------------------------------------------------------------------------- /bulk/detail/alignment.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | BULK_NAMESPACE_PREFIX 22 | namespace bulk 23 | { 24 | namespace detail 25 | { 26 | namespace alignment_of_detail 27 | { 28 | 29 | 30 | template class alignment_of_impl; 31 | 32 | template 33 | struct helper 34 | { 35 | static const std::size_t value = size_diff; 36 | }; 37 | 38 | template 39 | class helper 40 | { 41 | public: 42 | static const std::size_t value = alignment_of_impl::value; 43 | }; 44 | 45 | template 46 | class alignment_of_impl 47 | { 48 | private: 49 | struct big { T x; char c; }; 50 | 51 | public: 52 | static const std::size_t value = helper::value; 53 | }; 54 | 55 | 56 | } // end alignment_of_detail 57 | 58 | 59 | template 60 | struct alignment_of 61 | : alignment_of_detail::alignment_of_impl 62 | {}; 63 | 64 | 65 | template struct aligned_type; 66 | 67 | // __align__ is CUDA-specific, so guard it 68 | #if defined(__CUDACC__) 69 | 70 | // implementing aligned_type portably is tricky: 71 | 72 | # if defined(_MSC_VER) 73 | // implement aligned_type with specialization because MSVC 74 | // requires literals as arguments to declspec(align(n)) 75 | template<> struct aligned_type<1> 76 | { 77 | struct __align__(1) type { }; 78 | }; 79 | 80 | template<> struct aligned_type<2> 81 | { 82 | struct __align__(2) type { }; 83 | }; 84 | 85 | template<> struct aligned_type<4> 86 | { 87 | struct __align__(4) type { }; 88 | }; 89 | 90 | template<> struct aligned_type<8> 91 | { 92 | struct __align__(8) type { }; 93 | }; 94 | 95 | template<> struct aligned_type<16> 96 | { 97 | struct __align__(16) type { }; 98 | }; 99 | 100 | template<> struct aligned_type<32> 101 | { 102 | struct __align__(32) type { }; 103 | }; 104 | 105 | template<> struct aligned_type<64> 106 | { 107 | struct __align__(64) type { }; 108 | }; 109 | 110 | template<> struct aligned_type<128> 111 | { 112 | struct __align__(128) type { }; 113 | }; 114 | 115 | template<> struct aligned_type<256> 116 | { 117 | struct __align__(256) type { }; 118 | }; 119 | 120 | template<> struct aligned_type<512> 121 | { 122 | struct __align__(512) type { }; 123 | }; 124 | 125 | template<> struct aligned_type<1024> 126 | { 127 | struct __align__(1024) type { }; 128 | }; 129 | 130 | template<> struct aligned_type<2048> 131 | { 132 | struct __align__(2048) type { }; 133 | }; 134 | 135 | template<> struct aligned_type<4096> 136 | { 137 | struct __align__(4096) type { }; 138 | }; 139 | 140 | template<> struct aligned_type<8192> 141 | { 142 | struct __align__(8192) type { }; 143 | }; 144 | # elif defined(__GNUC__) && ((10000 * __GNUC__ + 100 * __GNUC_MINOR__ + __GNUC_PATCHLEVEL__) < 40600) 145 | // implement aligned_type with specialization because older gcc 146 | // requires literals as arguments to __attribute__(aligned(n)) 147 | template<> struct aligned_type<1> 148 | { 149 | struct __align__(1) type { }; 150 | }; 151 | 152 | template<> struct aligned_type<2> 153 | { 154 | struct __align__(2) type { }; 155 | }; 156 | 157 | template<> struct aligned_type<4> 158 | { 159 | struct __align__(4) type { }; 160 | }; 161 | 162 | template<> struct aligned_type<8> 163 | { 164 | struct __align__(8) type { }; 165 | }; 166 | 167 | template<> struct aligned_type<16> 168 | { 169 | struct __align__(16) type { }; 170 | }; 171 | 172 | template<> struct aligned_type<32> 173 | { 174 | struct __align__(32) type { }; 175 | }; 176 | 177 | template<> struct aligned_type<64> 178 | { 179 | struct __align__(64) type { }; 180 | }; 181 | 182 | template<> struct aligned_type<128> 183 | { 184 | struct __align__(128) 
type { }; 185 | }; 186 | 187 | # else 188 | // assume the compiler allows template parameters as 189 | // arguments to __align__ 190 | template struct aligned_type 191 | { 192 | struct __align__(Align) type { }; 193 | }; 194 | # endif // THRUST_HOST_COMPILER 195 | #else 196 | template struct aligned_type 197 | { 198 | struct type { }; 199 | }; 200 | #endif // THRUST_DEVICE_COMPILER 201 | 202 | 203 | template 204 | struct aligned_storage 205 | { 206 | union type 207 | { 208 | unsigned char data[Len]; 209 | 210 | typename aligned_type::type align; 211 | }; 212 | }; 213 | 214 | 215 | } // end detail 216 | } // end bulk 217 | BULK_NAMESPACE_SUFFIX 218 | 219 | -------------------------------------------------------------------------------- /bulk/detail/apply_from_tuple.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | BULK_NAMESPACE_PREFIX 23 | namespace bulk 24 | { 25 | namespace detail 26 | { 27 | 28 | 29 | template 30 | __host__ __device__ 31 | void apply_from_tuple(Function f, const thrust::tuple<> &) 32 | { 33 | f(); 34 | } 35 | 36 | 37 | template 38 | __host__ __device__ 39 | void apply_from_tuple(Function f, const thrust::tuple &args) 40 | { 41 | f(thrust::get<0>(args)); 42 | } 43 | 44 | 45 | template 46 | __host__ __device__ 47 | void apply_from_tuple(Function f, const thrust::tuple &args) 48 | { 49 | f(thrust::get<0>(args), 50 | thrust::get<1>(args)); 51 | } 52 | 53 | 54 | template 55 | __host__ __device__ 56 | void apply_from_tuple(Function f, const thrust::tuple &args) 57 | { 58 | f(thrust::get<0>(args), 59 | thrust::get<1>(args), 60 | thrust::get<2>(args)); 61 | } 62 | 63 | 64 | template 65 | __host__ __device__ 66 | void apply_from_tuple(Function f, const thrust::tuple &args) 67 | { 68 | f(thrust::get<0>(args), 69 | thrust::get<1>(args), 70 | thrust::get<2>(args), 71 | thrust::get<3>(args)); 72 | } 73 | 74 | 75 | template 76 | __host__ __device__ 77 | void apply_from_tuple(Function f, const thrust::tuple &args) 78 | { 79 | f(thrust::get<0>(args), 80 | thrust::get<1>(args), 81 | thrust::get<2>(args), 82 | thrust::get<3>(args), 83 | thrust::get<4>(args)); 84 | } 85 | 86 | 87 | template 88 | __host__ __device__ 89 | void apply_from_tuple(Function f, const thrust::tuple &args) 90 | { 91 | f(thrust::get<0>(args), 92 | thrust::get<1>(args), 93 | thrust::get<2>(args), 94 | thrust::get<3>(args), 95 | thrust::get<4>(args), 96 | thrust::get<5>(args)); 97 | } 98 | 99 | 100 | template 101 | __host__ __device__ 102 | void apply_from_tuple(Function f, const thrust::tuple &args) 103 | { 104 | f(thrust::get<0>(args), 105 | thrust::get<1>(args), 106 | thrust::get<2>(args), 107 | thrust::get<3>(args), 108 | thrust::get<4>(args), 109 | thrust::get<5>(args), 110 | thrust::get<6>(args)); 111 | } 112 | 113 | 114 | template 115 | __host__ __device__ 116 | void 
apply_from_tuple(Function f, const thrust::tuple &args) 117 | { 118 | f(thrust::get<0>(args), 119 | thrust::get<1>(args), 120 | thrust::get<2>(args), 121 | thrust::get<3>(args), 122 | thrust::get<4>(args), 123 | thrust::get<5>(args), 124 | thrust::get<6>(args), 125 | thrust::get<7>(args)); 126 | } 127 | 128 | 129 | template 130 | __host__ __device__ 131 | void apply_from_tuple(Function f, const thrust::tuple &args) 132 | { 133 | f(thrust::get<0>(args), 134 | thrust::get<1>(args), 135 | thrust::get<2>(args), 136 | thrust::get<3>(args), 137 | thrust::get<4>(args), 138 | thrust::get<5>(args), 139 | thrust::get<6>(args), 140 | thrust::get<7>(args), 141 | thrust::get<8>(args)); 142 | } 143 | 144 | 145 | template 146 | __host__ __device__ 147 | void apply_from_tuple(Function f, const thrust::tuple &args) 148 | { 149 | f(thrust::get<0>(args), 150 | thrust::get<1>(args), 151 | thrust::get<2>(args), 152 | thrust::get<3>(args), 153 | thrust::get<4>(args), 154 | thrust::get<5>(args), 155 | thrust::get<6>(args), 156 | thrust::get<7>(args), 157 | thrust::get<8>(args), 158 | thrust::get<9>(args)); 159 | } 160 | 161 | 162 | } // end detail 163 | } // end bulk 164 | BULK_NAMESPACE_SUFFIX 165 | 166 | -------------------------------------------------------------------------------- /bulk/detail/async.inl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | 25 | BULK_NAMESPACE_PREFIX 26 | namespace bulk 27 | { 28 | namespace detail 29 | { 30 | 31 | 32 | template 33 | __host__ __device__ 34 | future async_in_stream(ExecutionGroup g, Closure c, cudaStream_t s, cudaEvent_t before_event) 35 | { 36 | #if __BULK_HAS_CUDART__ 37 | if(before_event != 0) 38 | { 39 | bulk::detail::throw_on_error(cudaStreamWaitEvent(s, before_event, 0), "cudaStreamWaitEvent in async_in_stream"); 40 | } 41 | #else 42 | bulk::detail::terminate_with_message("async_in_stream(): cudaStreamWaitEvent requires CUDART"); 43 | #endif 44 | 45 | bulk::detail::cuda_launcher launcher; 46 | launcher.launch(g, c, s); 47 | 48 | return future_core_access::create(s, false); 49 | } // end async_in_stream() 50 | 51 | 52 | template 53 | __host__ __device__ 54 | future async(ExecutionGroup g, Closure c, cudaEvent_t before_event) 55 | { 56 | cudaStream_t s; 57 | 58 | // XXX cudaStreamCreate is __host__-only 59 | // figure out a way to support this that does not require creating a new stream 60 | #if (__BULK_HAS_CUDART__ && !defined(__CUDA_ARCH__)) 61 | bulk::detail::throw_on_error(cudaStreamCreate(&s), "cudaStreamCreate in bulk::detail::async"); 62 | #else 63 | s = 0; 64 | bulk::detail::terminate_with_message("bulk::async(): cudaStreamCreate() is unsupported in __device__ code."); 65 | #endif 66 | 67 | #if __BULK_HAS_CUDART__ 68 | if(before_event != 0) 69 | { 70 | bulk::detail::throw_on_error(cudaStreamWaitEvent(s, before_event, 0), "cudaStreamWaitEvent in bulk::detail::async"); 71 | } 72 | #else 73 | bulk::detail::terminate_with_message("async_in_stream(): cudaStreamWaitEvent requires CUDART"); 74 | #endif 75 | 76 | bulk::detail::cuda_launcher launcher; 77 | launcher.launch(g, c, s); 78 | 79 | // note we pass true here, unlike false above 80 | return future_core_access::create(s, true); 81 | } // end async() 82 | 83 | 84 | template 85 | __host__ __device__ 86 | future async(ExecutionGroup g, Closure c) 87 | { 88 | return bulk::detail::async_in_stream(g, c, 0, 0); 89 | } // end async() 90 | 91 | 92 | template 93 | __host__ __device__ 94 | future async(async_launch launch, Closure c) 95 | { 96 | return launch.is_stream_valid() ? 
97 | bulk::detail::async_in_stream(launch.exec(), c, launch.stream(), launch.before_event()) : 98 | bulk::detail::async(launch.exec(), c, launch.before_event()); 99 | } // end async() 100 | 101 | 102 | } // end detail 103 | 104 | 105 | template 106 | __host__ __device__ 107 | future async(ExecutionGroup g, Function f) 108 | { 109 | return bulk::detail::async(g, detail::make_closure(f)); 110 | } // end async() 111 | 112 | 113 | template 114 | __host__ __device__ 115 | future async(ExecutionGroup g, Function f, Arg1 arg1) 116 | { 117 | return bulk::detail::async(g, detail::make_closure(f,arg1)); 118 | } // end async() 119 | 120 | 121 | template 122 | __host__ __device__ 123 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2) 124 | { 125 | return bulk::detail::async(g, detail::make_closure(f,arg1,arg2)); 126 | } // end async() 127 | 128 | 129 | template 130 | __host__ __device__ 131 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3) 132 | { 133 | return bulk::detail::async(g, detail::make_closure(f,arg1,arg2,arg3)); 134 | } // end async() 135 | 136 | 137 | template 138 | __host__ __device__ 139 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4) 140 | { 141 | return bulk::detail::async(g, detail::make_closure(f,arg1,arg2,arg3,arg4)); 142 | } // end async() 143 | 144 | 145 | template 146 | __host__ __device__ 147 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) 148 | { 149 | return bulk::detail::async(g, detail::make_closure(f,arg1,arg2,arg3,arg4,arg5)); 150 | } // end async() 151 | 152 | 153 | template 154 | __host__ __device__ 155 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) 156 | { 157 | return bulk::detail::async(g, detail::make_closure(f,arg1,arg2,arg3,arg4,arg5,arg6)); 158 | } // end async() 159 | 160 | 161 | template 162 | __host__ __device__ 163 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7) 164 | { 165 | return bulk::detail::async(g, detail::make_closure(f,arg1,arg2,arg3,arg4,arg5,arg6,arg7)); 166 | } // end async() 167 | 168 | 169 | template 170 | __host__ __device__ 171 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8) 172 | { 173 | return bulk::detail::async(g, detail::make_closure(f,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8)); 174 | } // end async() 175 | 176 | 177 | template 178 | __host__ __device__ 179 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9) 180 | { 181 | return bulk::detail::async(g, detail::make_closure(f,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9)); 182 | } // end async() 183 | 184 | 185 | template 186 | __host__ __device__ 187 | future async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9, Arg10 arg10) 188 | { 189 | return bulk::detail::async(g, detail::make_closure(f,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10)); 190 | } // end async() 191 | 192 | 193 | } // end bulk 194 | BULK_NAMESPACE_SUFFIX 195 | 196 | -------------------------------------------------------------------------------- /bulk/detail/choose_sizes.inl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA 
Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | 25 | BULK_NAMESPACE_PREFIX 26 | namespace bulk 27 | { 28 | namespace detail 29 | { 30 | 31 | 32 | template 33 | __host__ __device__ 34 | thrust::pair >::size_type, 35 | typename concurrent_group<>::size_type> 36 | choose_sizes(parallel_group > g, Closure) 37 | { 38 | bulk::detail::cuda_launcher< 39 | parallel_group >, 40 | Closure 41 | > launcher; 42 | 43 | return launcher.choose_sizes(g.size(), g.this_exec.size()); 44 | } // end choose_sizes() 45 | 46 | 47 | } // end detail 48 | 49 | 50 | template 51 | __host__ __device__ 52 | thrust::pair >::size_type, 53 | typename concurrent_group<>::size_type> 54 | choose_sizes(parallel_group > g, Function f) 55 | { 56 | return bulk::detail::choose_sizes(g, detail::make_closure(f)); 57 | } 58 | 59 | 60 | template 61 | __host__ __device__ 62 | thrust::pair >::size_type, 63 | typename concurrent_group<>::size_type> 64 | choose_sizes(parallel_group > g, Function f, Arg1 arg1) 65 | { 66 | return bulk::detail::choose_sizes(g, detail::make_closure(f,arg1)); 67 | } 68 | 69 | 70 | template 71 | __host__ __device__ 72 | thrust::pair >::size_type, 73 | typename concurrent_group<>::size_type> 74 | choose_sizes(parallel_group > g, Function f, Arg1 arg1, Arg2 arg2) 75 | { 76 | return bulk::detail::choose_sizes(g, detail::make_closure(f,arg1,arg2)); 77 | } 78 | 79 | 80 | template 81 | __host__ __device__ 82 | thrust::pair >::size_type, 83 | typename concurrent_group<>::size_type> 84 | choose_sizes(parallel_group > g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3) 85 | { 86 | return bulk::detail::choose_sizes(g, detail::make_closure(f,arg1,arg2,arg3)); 87 | } 88 | 89 | 90 | template 91 | __host__ __device__ 92 | thrust::pair >::size_type, 93 | typename concurrent_group<>::size_type> 94 | choose_sizes(parallel_group > g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4) 95 | { 96 | return bulk::detail::choose_sizes(g, detail::make_closure(f,arg1,arg2,arg3,arg4)); 97 | } 98 | 99 | 100 | template 101 | __host__ __device__ 102 | thrust::pair >::size_type, 103 | typename concurrent_group<>::size_type> 104 | choose_sizes(parallel_group > g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) 105 | { 106 | return bulk::detail::choose_sizes(g, detail::make_closure(f,arg1,arg2,arg3,arg4,arg5)); 107 | } 108 | 109 | 110 | template 111 | __host__ __device__ 112 | thrust::pair >::size_type, 113 | typename concurrent_group<>::size_type> 114 | choose_sizes(parallel_group > g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) 115 | { 116 | return bulk::detail::choose_sizes(g, detail::make_closure(f,arg1,arg2,arg3,arg4,arg5,arg6)); 117 | } 118 | 119 | 120 | } // end bulk 121 | BULK_NAMESPACE_SUFFIX 122 | 123 | -------------------------------------------------------------------------------- /bulk/detail/closure.hpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | #include 23 | #include 24 | 25 | BULK_NAMESPACE_PREFIX 26 | namespace bulk 27 | { 28 | namespace detail 29 | { 30 | 31 | 32 | template 33 | class closure 34 | { 35 | public: 36 | typedef Function function_type; 37 | 38 | typedef Tuple arguments_type; 39 | 40 | __host__ __device__ 41 | closure(function_type f, const arguments_type &args) 42 | :f(f), 43 | args(args) 44 | {} 45 | 46 | 47 | __host__ __device__ 48 | void operator()() 49 | { 50 | apply_from_tuple(f,args); 51 | } 52 | 53 | 54 | __host__ __device__ 55 | function_type function() const 56 | { 57 | return f; 58 | } 59 | 60 | 61 | __host__ __device__ 62 | arguments_type arguments() const 63 | { 64 | return args; 65 | } 66 | 67 | 68 | private: 69 | function_type f; 70 | arguments_type args; 71 | }; // end closure 72 | 73 | 74 | template 75 | __host__ __device__ 76 | const closure &make_closure(const closure &c) 77 | { 78 | return c; 79 | } 80 | 81 | 82 | template 83 | __host__ __device__ 84 | closure > make_closure(Function f) 85 | { 86 | return closure >(f, thrust::tuple<>()); 87 | } 88 | 89 | 90 | template 91 | __host__ __device__ 92 | closure > make_closure(Function f, const Arg1 &a1) 93 | { 94 | return closure >(f, thrust::make_tuple(a1)); 95 | } 96 | 97 | 98 | template 99 | __host__ __device__ 100 | closure< 101 | Function, 102 | thrust::tuple 103 | > 104 | make_closure(Function f, const Arg1 &a1, const Arg2 &a2) 105 | { 106 | return closure >(f, thrust::make_tuple(a1,a2)); 107 | } 108 | 109 | 110 | template 111 | __host__ __device__ 112 | closure< 113 | Function, 114 | thrust::tuple 115 | > 116 | make_closure(Function f, const Arg1 &a1, const Arg2 &a2, const Arg3 &a3) 117 | { 118 | return closure >(f, thrust::make_tuple(a1,a2,a3)); 119 | } 120 | 121 | 122 | template 123 | __host__ __device__ 124 | closure< 125 | Function, 126 | thrust::tuple 127 | > 128 | make_closure(Function f, const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4) 129 | { 130 | return closure >(f, thrust::make_tuple(a1,a2,a3,a4)); 131 | } 132 | 133 | 134 | template 135 | __host__ __device__ 136 | closure< 137 | Function, 138 | thrust::tuple 139 | > 140 | make_closure(Function f, const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5) 141 | { 142 | return closure >(f, thrust::make_tuple(a1,a2,a3,a4,a5)); 143 | } 144 | 145 | 146 | template 147 | __host__ __device__ 148 | closure< 149 | Function, 150 | thrust::tuple 151 | > 152 | make_closure(Function f, const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6) 153 | { 154 | return closure >(f, thrust::make_tuple(a1,a2,a3,a4,a5,a6)); 155 | } 156 | 157 | 158 | template 159 | __host__ __device__ 160 | closure< 161 | Function, 162 | thrust::tuple 163 | > 164 | 
make_closure(Function f, const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7) 165 | { 166 | return closure >(f, thrust::make_tuple(a1,a2,a3,a4,a5,a6,a7)); 167 | } 168 | 169 | 170 | template 171 | __host__ __device__ 172 | closure< 173 | Function, 174 | thrust::tuple 175 | > 176 | make_closure(Function f, const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8) 177 | { 178 | return closure >(f, thrust::make_tuple(a1,a2,a3,a4,a5,a6,a7,a8)); 179 | } 180 | 181 | 182 | template 183 | __host__ __device__ 184 | closure< 185 | Function, 186 | thrust::tuple 187 | > 188 | make_closure(Function f, const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8, const Arg9 &a9) 189 | { 190 | return closure >(f, thrust::make_tuple(a1,a2,a3,a4,a5,a6,a7,a8,a9)); 191 | } 192 | 193 | 194 | template 195 | __host__ __device__ 196 | closure< 197 | Function, 198 | thrust::tuple 199 | > 200 | make_closure(Function f, const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8, const Arg9 &a9, const Arg10 &a10) 201 | { 202 | return closure >(f, thrust::make_tuple(a1,a2,a3,a4,a5,a6,a7,a8,a9,a10)); 203 | } 204 | 205 | 206 | } // end detail 207 | } // end bulk 208 | BULK_NAMESPACE_SUFFIX 209 | 210 | -------------------------------------------------------------------------------- /bulk/detail/config.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #ifndef BULK_NAMESPACE_PREFIX 20 | #define BULK_NAMESPACE_PREFIX 21 | #endif 22 | 23 | #ifndef BULK_NAMESPACE_SUFFIX 24 | #define BULK_NAMESPACE_SUFFIX 25 | #endif 26 | 27 | #if defined(__CUDACC__) && !(defined(__CUDA__) && defined(__clang__)) 28 | # ifndef __bulk_exec_check_disable__ 29 | # define __bulk_exec_check_disable__ pragma nv_exec_check_disable 30 | # endif // __bulk_exec_check_disable__ 31 | #else 32 | # define __bulk_exec_check_disable__ 33 | #endif // __bulk_exec_check_disable__ 34 | 35 | #include 36 | 37 | #if THRUST_VERSION < 100800 38 | #error "Bulk requires Thrust v1.8 (http://thrust.github.io) or better." 
39 | #endif 40 | 41 | 42 | #if defined(__CUDACC__) 43 | # if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) 44 | # define __BULK_HAS_CUDART__ 1 45 | # else 46 | # define __BULK_HAS_CUDART__ 0 47 | # endif 48 | #else 49 | # define __BULK_HAS_CUDART__ 0 50 | #endif 51 | 52 | #if defined(__CUDACC__) 53 | # if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 200) 54 | # define __BULK_HAS_PRINTF__ 1 55 | # else 56 | # define __BULK_HAS_PRINTF__ 0 57 | # endif 58 | #else 59 | # define __BULK_HAS_PRINTF__ 1 60 | #endif 61 | 62 | -------------------------------------------------------------------------------- /bulk/detail/cuda_launcher/parameter_ptr.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | 27 | BULK_NAMESPACE_PREFIX 28 | namespace bulk 29 | { 30 | namespace detail 31 | { 32 | 33 | 34 | // this thing has ownership semantics like unique_ptr, so copy and assign are more like moves 35 | template 36 | class parameter_ptr 37 | { 38 | public: 39 | typedef T element_type; 40 | 41 | __host__ __device__ 42 | explicit parameter_ptr(element_type *ptr) 43 | : m_ptr(ptr) 44 | {} 45 | 46 | // XXX copy emulates a move 47 | __host__ __device__ 48 | parameter_ptr(const parameter_ptr& other_) 49 | { 50 | parameter_ptr& other = const_cast(other_); 51 | thrust::swap(m_ptr, other.m_ptr); 52 | } 53 | 54 | __host__ __device__ 55 | ~parameter_ptr() 56 | { 57 | #if __BULK_HAS_CUDART__ 58 | if(m_ptr) 59 | { 60 | bulk::detail::terminate_on_error(cudaFree(m_ptr), "in parameter_ptr dtor"); 61 | } 62 | #else 63 | bulk::detail::terminate_with_message("parameter_ptr dtor: cudaFree requires CUDART"); 64 | #endif 65 | } 66 | 67 | // XXX assign emulates a move 68 | __host__ __device__ 69 | parameter_ptr& operator=(const parameter_ptr& other_) 70 | { 71 | parameter_ptr& other = const_cast(other_); 72 | thrust::swap(m_ptr, other.m_ptr); 73 | return *this; 74 | } 75 | 76 | __host__ __device__ 77 | T* get() const 78 | { 79 | return m_ptr; 80 | } 81 | 82 | private: 83 | T *m_ptr; 84 | }; 85 | 86 | 87 | template 88 | __host__ __device__ 89 | parameter_ptr make_parameter(const T& x) 90 | { 91 | T* raw_ptr = 0; 92 | 93 | // allocate 94 | #if __BULK_HAS_CUDART__ 95 | bulk::detail::throw_on_error(cudaMalloc(&raw_ptr, sizeof(T)), "make_parameter(): after cudaMalloc"); 96 | #else 97 | bulk::detail::terminate_with_message("make_parameter(): cudaMalloc requires CUDART\n"); 98 | #endif 99 | 100 | // do a trivial copy 101 | #ifndef __CUDA_ARCH__ 102 | bulk::detail::throw_on_error(cudaMemcpy(raw_ptr, &x, sizeof(T), cudaMemcpyHostToDevice), 103 | "make_parameter(): after cudaMemcpy"); 104 | #else 105 | std::memcpy(raw_ptr, &x, sizeof(T)); 106 | #endif 107 | 108 | return parameter_ptr(raw_ptr); 109 | } 110 | 111 | 112 | 
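/*
 * A minimal usage sketch, assuming only the parameter_ptr and make_parameter
 * shown above: make_parameter() copies a host-side value into device memory
 * allocated with cudaMalloc and returns an owning parameter_ptr, whose
 * destructor releases that allocation with cudaFree.
 *
 *   int host_value = 42;                       // hypothetical host value
 *   bulk::detail::parameter_ptr<int> p = bulk::detail::make_parameter(host_value);
 *   int *device_value = p.get();               // valid until the last owner is destroyed
 *
 * Because copy construction and copy assignment swap the stored pointer
 * (move-like ownership transfer), returning a parameter_ptr by value neither
 * duplicates nor double-frees the device allocation.
 */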
} // end detail 113 | } // end bulk 114 | BULK_NAMESPACE_SUFFIX 115 | 116 | -------------------------------------------------------------------------------- /bulk/detail/cuda_launcher/runtime_introspection.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | // #include this for device_properties_t and function_attributes_t 22 | #include 23 | 24 | // #include this for size_t 25 | #include 26 | 27 | 28 | // runtime introspection isn't possible without CUDART 29 | #if __BULK_HAS_CUDART__ 30 | 31 | 32 | BULK_NAMESPACE_PREFIX 33 | namespace bulk 34 | { 35 | namespace detail 36 | { 37 | 38 | 39 | /*! Returns the current device ordinal. 40 | */ 41 | __host__ __device__ 42 | inline int current_device(); 43 | 44 | /*! Returns a copy of the device_properties_t structure 45 | * that is associated with a given device. 46 | */ 47 | __host__ __device__ 48 | inline device_properties_t device_properties(int device_id); 49 | 50 | /*! Returns a copy of the device_properties_t structure 51 | * that is associated with the current device. 52 | */ 53 | __host__ __device__ 54 | inline device_properties_t device_properties(); 55 | 56 | /*! Returns a copy of the function_attributes_t structure 57 | * that is associated with a given __global__ function 58 | */ 59 | template 60 | __host__ __device__ 61 | inline function_attributes_t function_attributes(KernelFunction kernel); 62 | 63 | /*! Returns the compute capability of a device in integer format. 64 | * For example, returns 10 for sm_10 and 21 for sm_21 65 | * \return The compute capability as an integer 66 | */ 67 | __host__ __device__ 68 | inline size_t compute_capability(const device_properties_t &properties); 69 | 70 | __host__ __device__ 71 | inline size_t compute_capability(); 72 | 73 | 74 | } // end namespace detail 75 | } // end namespace bulk 76 | BULK_NAMESPACE_SUFFIX 77 | 78 | 79 | #endif // __BULK_HAS_CUDART__ 80 | 81 | #include 82 | 83 | -------------------------------------------------------------------------------- /bulk/detail/cuda_launcher/runtime_introspection.inl: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | 27 | BULK_NAMESPACE_PREFIX 28 | namespace bulk 29 | { 30 | namespace detail 31 | { 32 | 33 | 34 | __host__ __device__ 35 | inline device_properties_t device_properties_uncached(int device_id) 36 | { 37 | device_properties_t prop = {0,{0,0,0},0,0,0,0,0,0,0}; 38 | 39 | cudaError_t error = cudaErrorNoDevice; 40 | 41 | #if __BULK_HAS_CUDART__ 42 | error = cudaDeviceGetAttribute(&prop.major, cudaDevAttrComputeCapabilityMajor, device_id); 43 | error = cudaDeviceGetAttribute(&prop.maxGridSize[0], cudaDevAttrMaxGridDimX, device_id); 44 | error = cudaDeviceGetAttribute(&prop.maxGridSize[1], cudaDevAttrMaxGridDimY, device_id); 45 | error = cudaDeviceGetAttribute(&prop.maxGridSize[2], cudaDevAttrMaxGridDimZ, device_id); 46 | error = cudaDeviceGetAttribute(&prop.maxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, device_id); 47 | error = cudaDeviceGetAttribute(&prop.maxThreadsPerMultiProcessor, cudaDevAttrMaxThreadsPerMultiProcessor, device_id); 48 | error = cudaDeviceGetAttribute(&prop.minor, cudaDevAttrComputeCapabilityMinor, device_id); 49 | error = cudaDeviceGetAttribute(&prop.multiProcessorCount, cudaDevAttrMultiProcessorCount, device_id); 50 | error = cudaDeviceGetAttribute(&prop.regsPerBlock, cudaDevAttrMaxRegistersPerBlock, device_id); 51 | int temp; 52 | error = cudaDeviceGetAttribute(&temp, cudaDevAttrMaxSharedMemoryPerBlock, device_id); 53 | prop.sharedMemPerBlock = temp; 54 | error = cudaDeviceGetAttribute(&prop.warpSize, cudaDevAttrWarpSize, device_id); 55 | #else 56 | (void) device_id; // Suppress unused parameter warnings 57 | #endif 58 | 59 | throw_on_error(error, "cudaDeviceGetProperty in get_device_properties"); 60 | 61 | return prop; 62 | } 63 | 64 | 65 | inline device_properties_t device_properties_cached(int device_id) 66 | { 67 | // cache the result of get_device_properties, because it is slow 68 | // only cache the first few devices 69 | static const int max_num_devices = 16; 70 | 71 | static bool properties_exist[max_num_devices] = {0}; 72 | static device_properties_t device_properties[max_num_devices] = {}; 73 | 74 | if(device_id >= max_num_devices) 75 | { 76 | return device_properties_uncached(device_id); 77 | } 78 | 79 | if(!properties_exist[device_id]) 80 | { 81 | device_properties[device_id] = device_properties_uncached(device_id); 82 | 83 | // disallow the compiler to move the write to properties_exist[device_id] 84 | // before the initialization of device_properties[device_id] 85 | __thrust_compiler_fence(); 86 | 87 | properties_exist[device_id] = true; 88 | } 89 | 90 | return device_properties[device_id]; 91 | } 92 | 93 | 94 | __host__ __device__ 95 | inline device_properties_t device_properties(int device_id) 96 | { 97 | #ifndef __CUDA_ARCH__ 98 | return device_properties_cached(device_id); 99 | #else 100 | return device_properties_uncached(device_id); 101 | #endif 102 | } 103 | 104 | 105 | __host__ __device__ 106 | inline int current_device() 107 | { 108 | int result = -1; 109 | 110 | #if __BULK_HAS_CUDART__ 111 | bulk::detail::throw_on_error(cudaGetDevice(&result), "current_device(): after cudaGetDevice"); 112 | #endif 113 | 114 | if(result < 0) 115 | { 116 | bulk::detail::throw_on_error(cudaErrorNoDevice, "current_device(): after cudaGetDevice"); 117 | } 118 | 119 | return result; 120 | } 121 | 122 | 123 | __host__ __device__ 124 | inline device_properties_t device_properties() 125 | { 126 | return device_properties(current_device()); 
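  // A small usage sketch, assuming the helpers declared in runtime_introspection.hpp:
  //
  //   device_properties_t props = bulk::detail::device_properties();
  //   size_t cc = bulk::detail::compute_capability(props); // 10*major + minor, e.g. 35 for sm_35
  //
  // On the host this call returns the cached properties; in __device__ code it
  // falls back to the uncached query above.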
127 | } 128 | 129 | 130 | template 131 | __host__ __device__ 132 | inline function_attributes_t function_attributes(KernelFunction kernel) 133 | { 134 | #if __BULK_HAS_CUDART__ 135 | typedef void (*fun_ptr_type)(); 136 | 137 | fun_ptr_type fun_ptr = reinterpret_cast(kernel); 138 | 139 | cudaFuncAttributes attributes; 140 | 141 | bulk::detail::throw_on_error(cudaFuncGetAttributes(&attributes, fun_ptr), "function_attributes(): after cudaFuncGetAttributes"); 142 | 143 | // be careful about how this is initialized! 144 | function_attributes_t result = { 145 | attributes.constSizeBytes, 146 | attributes.localSizeBytes, 147 | attributes.maxThreadsPerBlock, 148 | attributes.numRegs, 149 | attributes.ptxVersion, 150 | attributes.sharedSizeBytes 151 | }; 152 | 153 | return result; 154 | #else 155 | return function_attributes_t(); 156 | #endif // __CUDACC__ 157 | } 158 | 159 | __host__ __device__ 160 | inline size_t compute_capability(const device_properties_t &properties) 161 | { 162 | return 10 * properties.major + properties.minor; 163 | } 164 | 165 | 166 | __host__ __device__ 167 | inline size_t compute_capability() 168 | { 169 | return compute_capability(device_properties()); 170 | } 171 | 172 | 173 | } // end namespace detail 174 | } // end namespace bulk 175 | BULK_NAMESPACE_SUFFIX 176 | 177 | -------------------------------------------------------------------------------- /bulk/detail/guarded_cuda_runtime_api.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | #include 19 | 20 | // the purpose of this header is to #include without causing 21 | // warnings from redefinitions of __host__ and __device__. 
22 | // we only do this if host_defines.h has not been included yet 23 | // we carefully save the definitions of __host__ & __device__ and restore them 24 | // if the compiler does not have push_macro & pop_macro, just undef __host__ & __device__ and hope for the best 25 | 26 | // can't tell exactly when push_macro & pop_macro were introduced to gcc; assume 4.5.0 27 | #if !defined(__HOST_DEFINES_H__) 28 | # if !defined(__GNUC__) || ((10000 * __GNUC__ + 100 * __GNUC_MINOR__ + __GNUC_PATCHLEVEL__) >= 40500) || defined(__clang__) 29 | # ifdef __host__ 30 | # pragma push_macro("__host__") 31 | # undef __host__ 32 | # define BULK_HOST_NEEDS_RESTORATION 33 | # endif 34 | # ifdef __device__ 35 | # pragma push_macro("__device__") 36 | # undef __device__ 37 | # define BULK_DEVICE_NEEDS_RESTORATION 38 | # endif 39 | # else // GNUC pre 4.5.0 40 | # ifdef __host__ 41 | # undef __host__ 42 | # endif 43 | # ifdef __device__ 44 | # undef __device__ 45 | # endif 46 | # endif // has push/pop_macro 47 | #endif // __HOST_DEFINES_H__ 48 | 49 | 50 | #include 51 | 52 | 53 | #if !defined(__GNUC__) || ((10000 * __GNUC__ + 100 * __GNUC_MINOR__ + __GNUC_PATCHLEVEL__) >= 40500) || defined(__clang__) 54 | # ifdef BULK_HOST_NEEDS_RESTORATION 55 | # pragma pop_macro("__host__") 56 | # undef BULK_HOST_NEEDS_RESTORATION 57 | # endif 58 | # ifdef BULK_DEVICE_NEEDS_RESTORATION 59 | # pragma pop_macro("__device__") 60 | # undef BULK_DEVICE_NEEDS_RESTORATION 61 | # endif 62 | #endif // __GNUC__ 63 | 64 | -------------------------------------------------------------------------------- /bulk/detail/head_flags.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | 27 | BULK_NAMESPACE_PREFIX 28 | namespace bulk 29 | { 30 | namespace detail 31 | { 32 | 33 | 34 | template::type>, 36 | typename ValueType = bool, 37 | typename IndexType = typename thrust::iterator_difference::type> 38 | class head_flags_with_init 39 | { 40 | typedef typename thrust::iterator_value::type init_type; 41 | 42 | // XXX WAR cudafe issue 43 | //private: 44 | public: 45 | struct head_flag_functor 46 | { 47 | BinaryPredicate binary_pred; // this must be the first member for performance reasons 48 | init_type init; 49 | IndexType n; 50 | 51 | typedef ValueType result_type; 52 | 53 | __host__ __device__ 54 | head_flag_functor(init_type init, IndexType n) 55 | : binary_pred(), init(init), n(n) 56 | {} 57 | 58 | __host__ __device__ 59 | head_flag_functor(init_type init, IndexType n, BinaryPredicate binary_pred) 60 | : binary_pred(binary_pred), init(init), n(n) 61 | {} 62 | 63 | template 64 | __host__ __device__ __thrust_forceinline__ 65 | result_type operator()(const Tuple &t) 66 | { 67 | const IndexType i = thrust::get<0>(t); 68 | 69 | if(i == 0) 70 | { 71 | return !binary_pred(init, thrust::get<1>(t)); 72 | } 73 | 74 | return !binary_pred(thrust::get<1>(t), thrust::get<2>(t)); 75 | } 76 | }; 77 | 78 | typedef thrust::counting_iterator counting_iterator; 79 | 80 | public: 81 | typedef thrust::transform_iterator< 82 | head_flag_functor, 83 | thrust::zip_iterator > 84 | > iterator; 85 | 86 | __bulk_exec_check_disable__ 87 | __host__ __device__ 88 | head_flags_with_init(RandomAccessIterator first, RandomAccessIterator last, init_type init) 89 | : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator(0), first, first - 1)), 90 | head_flag_functor(init, last - first))), 91 | m_end(m_begin + (last - first)) 92 | {} 93 | 94 | __host__ __device__ 95 | head_flags_with_init(RandomAccessIterator first, RandomAccessIterator last, init_type init, BinaryPredicate binary_pred) 96 | : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator(0), first, first - 1)), 97 | head_flag_functor(init, last - first, binary_pred))), 98 | m_end(m_begin + (last - first)) 99 | {} 100 | 101 | __host__ __device__ 102 | iterator begin() const 103 | { 104 | return m_begin; 105 | } 106 | 107 | __host__ __device__ 108 | iterator end() const 109 | { 110 | return m_end; 111 | } 112 | 113 | template 114 | __host__ __device__ 115 | typename iterator::reference operator[](OtherIndex i) 116 | { 117 | return *(begin() + i); 118 | } 119 | 120 | private: 121 | iterator m_begin, m_end; 122 | }; 123 | 124 | 125 | 126 | template::type>, 128 | typename ValueType = bool, 129 | typename IndexType = typename thrust::iterator_difference::type> 130 | // class head_flags 131 | class head_flags_ 132 | { 133 | // XXX WAR cudafe issue 134 | //private: 135 | public: 136 | struct head_flag_functor 137 | { 138 | BinaryPredicate binary_pred; // this must be the first member for performance reasons 139 | IndexType n; 140 | 141 | typedef ValueType result_type; 142 | 143 | __host__ __device__ 144 | head_flag_functor(IndexType n) 145 | : binary_pred(), n(n) 146 | {} 147 | 148 | __host__ __device__ 149 | head_flag_functor(IndexType n, BinaryPredicate binary_pred) 150 | : binary_pred(binary_pred), n(n) 151 | {} 152 | 153 | template 154 | __host__ __device__ __thrust_forceinline__ 155 | result_type 
operator()(const Tuple &t) 156 | { 157 | const IndexType i = thrust::get<0>(t); 158 | 159 | // note that we do not dereference the tuple's 2nd element when i <= 0 160 | // and therefore do not dereference a bad location at the boundary 161 | return (i == 0 || !binary_pred(thrust::get<1>(t), thrust::get<2>(t))); 162 | } 163 | }; 164 | 165 | typedef thrust::counting_iterator counting_iterator; 166 | 167 | public: 168 | typedef thrust::transform_iterator< 169 | head_flag_functor, 170 | thrust::zip_iterator > 171 | > iterator; 172 | 173 | __host__ __device__ 174 | //head_flags(RandomAccessIterator first, RandomAccessIterator last) 175 | head_flags_(RandomAccessIterator first, RandomAccessIterator last) 176 | : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator(0), first, first - 1)), 177 | head_flag_functor(last - first))), 178 | m_end(m_begin + (last - first)) 179 | {} 180 | 181 | __host__ __device__ 182 | //head_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred) 183 | head_flags_(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred) 184 | : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator(0), first, first - 1)), 185 | head_flag_functor(last - first, binary_pred))), 186 | m_end(m_begin + (last - first)) 187 | {} 188 | 189 | __host__ __device__ 190 | iterator begin() const 191 | { 192 | return m_begin; 193 | } 194 | 195 | __host__ __device__ 196 | iterator end() const 197 | { 198 | return m_end; 199 | } 200 | 201 | template 202 | __host__ __device__ 203 | typename iterator::reference operator[](OtherIndex i) 204 | { 205 | return *(begin() + i); 206 | } 207 | 208 | private: 209 | iterator m_begin, m_end; 210 | }; 211 | 212 | 213 | template 214 | __host__ __device__ 215 | //head_flags_ 216 | head_flags_ 217 | make_head_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred) 218 | { 219 | //return head_flags(first, last, binary_pred); 220 | return head_flags_(first, last, binary_pred); 221 | } 222 | 223 | 224 | template 225 | __host__ __device__ 226 | //head_flags 227 | head_flags_ 228 | make_head_flags(RandomAccessIterator first, RandomAccessIterator last) 229 | { 230 | //return head_flags(first, last); 231 | return head_flags_(first, last); 232 | } 233 | 234 | 235 | } // end detail 236 | } // end bulk 237 | BULK_NAMESPACE_SUFFIX 238 | 239 | -------------------------------------------------------------------------------- /bulk/detail/is_contiguous_iterator.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | BULK_NAMESPACE_PREFIX 23 | namespace bulk 24 | { 25 | namespace detail 26 | { 27 | 28 | 29 | template 30 | struct is_contiguous_iterator 31 | : thrust::detail::is_trivial_iterator 32 | {}; 33 | 34 | 35 | } // end detail 36 | } // end bulk 37 | BULK_NAMESPACE_SUFFIX 38 | 39 | -------------------------------------------------------------------------------- /bulk/detail/pointer_traits.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | BULK_NAMESPACE_PREFIX 22 | namespace bulk 23 | { 24 | namespace detail 25 | { 26 | 27 | 28 | inline __device__ unsigned int __isShared(const void *ptr) 29 | { 30 | // XXX WAR unused variable warning 31 | (void) ptr; 32 | 33 | unsigned int ret; 34 | 35 | #if __CUDA_ARCH__ >= 200 36 | asm volatile ("{ \n\t" 37 | " .reg .pred p; \n\t" 38 | " isspacep.shared p, %1; \n\t" 39 | " selp.u32 %0, 1, 0, p; \n\t" 40 | # if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) 41 | "} \n\t" : "=r"(ret) : "l"(ptr)); 42 | # else 43 | "} \n\t" : "=r"(ret) : "r"(ptr)); 44 | # endif 45 | #else 46 | ret = 0; 47 | #endif 48 | 49 | return ret; 50 | } // end __isShared() 51 | 52 | 53 | inline __device__ bool is_shared(const void *ptr) 54 | { 55 | return __isShared(ptr); 56 | } // end is_shared() 57 | 58 | 59 | inline __device__ bool is_global(const void *ptr) 60 | { 61 | // XXX WAR unused variable warning 62 | (void) ptr; 63 | 64 | #if __CUDA_ARCH__ >= 200 65 | return __isGlobal(ptr); 66 | #else 67 | return false; 68 | #endif 69 | } // end is_global() 70 | 71 | 72 | } // end detail 73 | } // end bulk 74 | BULK_NAMESPACE_SUFFIX 75 | 76 | -------------------------------------------------------------------------------- /bulk/detail/synchronize.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | BULK_NAMESPACE_PREFIX 26 | namespace bulk 27 | { 28 | namespace detail 29 | { 30 | 31 | 32 | inline __host__ __device__ 33 | void synchronize(const char* message = "") 34 | { 35 | #if __BULK_HAS_CUDART__ 36 | bulk::detail::throw_on_error(cudaDeviceSynchronize(), message); 37 | #else 38 | bulk::detail::terminate_with_message("cudaDeviceSynchronize() requires CUDART"); 39 | (void)message; // Avoid unused parameter warnings 40 | #endif 41 | } // end terminate() 42 | 43 | 44 | inline __host__ __device__ 45 | void synchronize_if_enabled(const char* message = "") 46 | { 47 | // XXX we rely on __THRUST_SYNCHRONOUS here 48 | // note we always have to synchronize in __device__ code 49 | #if __THRUST_SYNCHRONOUS || defined(__CUDA_ARCH__) 50 | synchronize(message); 51 | #else 52 | // WAR "unused parameter" warning 53 | (void) message; 54 | #endif 55 | } 56 | 57 | 58 | } // end detail 59 | } // end bulk 60 | BULK_NAMESPACE_SUFFIX 61 | 62 | -------------------------------------------------------------------------------- /bulk/detail/tail_flags.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | 27 | BULK_NAMESPACE_PREFIX 28 | namespace bulk 29 | { 30 | namespace detail 31 | { 32 | 33 | 34 | template::type>, 36 | typename ValueType = bool, 37 | typename IndexType = typename thrust::iterator_difference::type> 38 | class tail_flags_ 39 | { 40 | // XXX WAR cudafe bug 41 | //private: 42 | public: 43 | struct tail_flag_functor 44 | { 45 | BinaryPredicate binary_pred; // this must be the first member for performance reasons 46 | RandomAccessIterator iter; 47 | IndexType n; 48 | 49 | typedef ValueType result_type; 50 | 51 | __host__ __device__ 52 | tail_flag_functor(RandomAccessIterator first, RandomAccessIterator last) 53 | : binary_pred(), iter(first), n(last - first) 54 | {} 55 | 56 | __host__ __device__ 57 | tail_flag_functor(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred) 58 | : binary_pred(binary_pred), iter(first), n(last - first) 59 | {} 60 | 61 | __host__ __device__ __thrust_forceinline__ 62 | result_type operator()(const IndexType &i) 63 | { 64 | return (i == (n - 1) || !binary_pred(iter[i], iter[i+1])); 65 | } 66 | }; 67 | 68 | typedef thrust::counting_iterator counting_iterator; 69 | 70 | public: 71 | typedef thrust::transform_iterator< 72 | tail_flag_functor, 73 | counting_iterator 74 | > iterator; 75 | 76 | __bulk_exec_check_disable__ 77 | __host__ __device__ 78 | tail_flags_(RandomAccessIterator first, RandomAccessIterator last) 79 | : m_begin(thrust::make_transform_iterator(thrust::counting_iterator(0), 80 | tail_flag_functor(first, last))), 81 | m_end(m_begin + (last - first)) 82 | {} 83 | 84 | __bulk_exec_check_disable__ 85 | __host__ __device__ 86 | tail_flags_(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred) 87 | : m_begin(thrust::make_transform_iterator(thrust::counting_iterator(0), 88 | tail_flag_functor(first, last, binary_pred))), 89 | m_end(m_begin + (last - first)) 90 | {} 91 | 92 | __host__ __device__ 93 | iterator begin() const 94 | { 95 | return m_begin; 96 | } 97 | 98 | __host__ __device__ 99 | iterator end() const 100 | { 101 | return m_end; 102 | } 103 | 104 | template 105 | __host__ __device__ 106 | typename iterator::reference operator[](OtherIndex i) 107 | { 108 | return *(begin() + i); 109 | } 110 | 111 | private: 112 | iterator m_begin, m_end; 113 | }; 114 | 115 | 116 | template 117 | __host__ __device__ 118 | //tail_flags 119 | tail_flags_ 120 | make_tail_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred) 121 | { 122 | // return tail_flags(first, last, binary_pred); 123 | return tail_flags_(first, last, binary_pred); 124 | } 125 | 126 | 127 | template 128 | __host__ __device__ 129 | //tail_flags 130 | tail_flags_ 131 | make_tail_flags(RandomAccessIterator first, RandomAccessIterator last) 132 | { 133 | // return tail_flags(first, last); 134 | return tail_flags_(first, last); 135 | } 136 | 137 | 138 | } // end detail 139 | } // end bulk 140 | BULK_NAMESPACE_SUFFIX 141 | 142 | -------------------------------------------------------------------------------- /bulk/detail/terminate.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
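tail_flags_ above is the mirror image of head_flags_: element i is flagged when it is the last element or when it differs from its successor under binary_pred. A worked example for the keys {1, 1, 2, 2, 3}:

```
// keys:       1 1 2 2 3
// tail flags: 0 1 0 1 1    (flag[i] = (i == n-1) || keys[i] != keys[i+1])
```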
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | BULK_NAMESPACE_PREFIX 24 | namespace bulk 25 | { 26 | namespace detail 27 | { 28 | 29 | 30 | __host__ __device__ 31 | inline void terminate() 32 | { 33 | #ifdef __CUDA_ARCH__ 34 | asm("trap;"); 35 | #else 36 | std::terminate(); 37 | #endif 38 | } // end terminate() 39 | 40 | 41 | __host__ __device__ 42 | inline void terminate_with_message(const char* message) 43 | { 44 | #if __BULK_HAS_PRINTF__ 45 | std::printf("%s\n", message); 46 | #endif 47 | 48 | bulk::detail::terminate(); 49 | } 50 | 51 | 52 | __host__ __device__ 53 | inline void terminate_on_error(cudaError_t e, const char* message) 54 | { 55 | if(e) 56 | { 57 | #if (__BULK_HAS_PRINTF__ && __BULK_HAS_CUDART__) 58 | printf("Error after: %s: %s\n", message, cudaGetErrorString(e)); 59 | #elif __BULK_HAS_PRINTF__ 60 | printf("Error: %s\n", message); 61 | #endif 62 | bulk::detail::terminate(); 63 | } 64 | } 65 | 66 | 67 | } // end detail 68 | } // end bulk 69 | BULK_NAMESPACE_SUFFIX 70 | 71 | -------------------------------------------------------------------------------- /bulk/detail/throw_on_error.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | 26 | BULK_NAMESPACE_PREFIX 27 | namespace bulk 28 | { 29 | namespace detail 30 | { 31 | 32 | 33 | inline __host__ __device__ 34 | void throw_on_error(cudaError_t e, const char *message) 35 | { 36 | if(e) 37 | { 38 | #ifndef __CUDA_ARCH__ 39 | throw thrust::system_error(e, thrust::cuda_category(), message); 40 | #else 41 | # if (__BULK_HAS_PRINTF__ && __BULK_HAS_CUDART__) 42 | printf("Error after %s: %s\n", message, cudaGetErrorString(e)); 43 | # elif __BULK_HAS_PRINTF__ 44 | printf("Error: %s\n", message); 45 | # endif 46 | bulk::detail::terminate(); 47 | #endif 48 | } // end if 49 | } // end throw_on_error() 50 | 51 | 52 | } // end detail 53 | } // end bulk 54 | BULK_NAMESPACE_SUFFIX 55 | 56 | -------------------------------------------------------------------------------- /bulk/detail/tuple_meta_transform.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | BULK_NAMESPACE_PREFIX 23 | namespace bulk 24 | { 25 | namespace detail 26 | { 27 | 28 | 29 | template class UnaryMetaFunction, 31 | unsigned int sz = thrust::tuple_size::value> 32 | struct tuple_meta_transform; 33 | 34 | template class UnaryMetaFunction> 36 | struct tuple_meta_transform 37 | { 38 | typedef thrust::tuple<> type; 39 | }; 40 | 41 | template class UnaryMetaFunction> 43 | struct tuple_meta_transform 44 | { 45 | typedef thrust::tuple< 46 | typename UnaryMetaFunction::type>::type 47 | > type; 48 | }; 49 | 50 | template class UnaryMetaFunction> 52 | struct tuple_meta_transform 53 | { 54 | typedef thrust::tuple< 55 | typename UnaryMetaFunction::type>::type, 56 | typename UnaryMetaFunction::type>::type 57 | > type; 58 | }; 59 | 60 | template class UnaryMetaFunction> 62 | struct tuple_meta_transform 63 | { 64 | typedef thrust::tuple< 65 | typename UnaryMetaFunction::type>::type, 66 | typename UnaryMetaFunction::type>::type, 67 | typename UnaryMetaFunction::type>::type 68 | > type; 69 | }; 70 | 71 | template class UnaryMetaFunction> 73 | struct tuple_meta_transform 74 | { 75 | typedef thrust::tuple< 76 | typename UnaryMetaFunction::type>::type, 77 | typename UnaryMetaFunction::type>::type, 78 | typename UnaryMetaFunction::type>::type, 79 | typename UnaryMetaFunction::type>::type 80 | > type; 81 | }; 82 | 83 | template class UnaryMetaFunction> 85 | struct tuple_meta_transform 86 | { 87 | typedef thrust::tuple< 88 | typename UnaryMetaFunction::type>::type, 89 | typename UnaryMetaFunction::type>::type, 90 | typename UnaryMetaFunction::type>::type, 91 | typename UnaryMetaFunction::type>::type, 92 | typename UnaryMetaFunction::type>::type 93 | > type; 94 | }; 95 | 96 | template class UnaryMetaFunction> 98 | struct tuple_meta_transform 99 | { 100 | typedef thrust::tuple< 101 | 
typename UnaryMetaFunction::type>::type, 102 | typename UnaryMetaFunction::type>::type, 103 | typename UnaryMetaFunction::type>::type, 104 | typename UnaryMetaFunction::type>::type, 105 | typename UnaryMetaFunction::type>::type, 106 | typename UnaryMetaFunction::type>::type 107 | > type; 108 | }; 109 | 110 | template class UnaryMetaFunction> 112 | struct tuple_meta_transform 113 | { 114 | typedef thrust::tuple< 115 | typename UnaryMetaFunction::type>::type, 116 | typename UnaryMetaFunction::type>::type, 117 | typename UnaryMetaFunction::type>::type, 118 | typename UnaryMetaFunction::type>::type, 119 | typename UnaryMetaFunction::type>::type, 120 | typename UnaryMetaFunction::type>::type, 121 | typename UnaryMetaFunction::type>::type 122 | > type; 123 | }; 124 | 125 | template class UnaryMetaFunction> 127 | struct tuple_meta_transform 128 | { 129 | typedef thrust::tuple< 130 | typename UnaryMetaFunction::type>::type, 131 | typename UnaryMetaFunction::type>::type, 132 | typename UnaryMetaFunction::type>::type, 133 | typename UnaryMetaFunction::type>::type, 134 | typename UnaryMetaFunction::type>::type, 135 | typename UnaryMetaFunction::type>::type, 136 | typename UnaryMetaFunction::type>::type, 137 | typename UnaryMetaFunction::type>::type 138 | > type; 139 | }; 140 | 141 | template class UnaryMetaFunction> 143 | struct tuple_meta_transform 144 | { 145 | typedef thrust::tuple< 146 | typename UnaryMetaFunction::type>::type, 147 | typename UnaryMetaFunction::type>::type, 148 | typename UnaryMetaFunction::type>::type, 149 | typename UnaryMetaFunction::type>::type, 150 | typename UnaryMetaFunction::type>::type, 151 | typename UnaryMetaFunction::type>::type, 152 | typename UnaryMetaFunction::type>::type, 153 | typename UnaryMetaFunction::type>::type, 154 | typename UnaryMetaFunction::type>::type 155 | > type; 156 | }; 157 | 158 | template class UnaryMetaFunction> 160 | struct tuple_meta_transform 161 | { 162 | typedef thrust::tuple< 163 | typename UnaryMetaFunction::type>::type, 164 | typename UnaryMetaFunction::type>::type, 165 | typename UnaryMetaFunction::type>::type, 166 | typename UnaryMetaFunction::type>::type, 167 | typename UnaryMetaFunction::type>::type, 168 | typename UnaryMetaFunction::type>::type, 169 | typename UnaryMetaFunction::type>::type, 170 | typename UnaryMetaFunction::type>::type, 171 | typename UnaryMetaFunction::type>::type, 172 | typename UnaryMetaFunction::type>::type 173 | > type; 174 | }; 175 | 176 | 177 | } // end detail 178 | } // end bulk 179 | BULK_NAMESPACE_SUFFIX 180 | 181 | -------------------------------------------------------------------------------- /bulk/future.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
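Although the template parameter lists are stripped in this listing, the specializations above all follow one pattern: tuple_meta_transform<Tuple, UnaryMetaFunction>::type applies the metafunction to each element type of a thrust::tuple of the corresponding arity. A hypothetical illustration (add_pointer below is a stand-in metafunction, not part of Bulk):

```
// stand-in unary metafunction
template<typename T> struct add_pointer { typedef T* type; };

// bulk::detail::tuple_meta_transform<thrust::tuple<int, float>, add_pointer>::type
//   is thrust::tuple<int*, float*>
```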
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | BULK_NAMESPACE_PREFIX 29 | namespace bulk 30 | { 31 | namespace detail 32 | { 33 | 34 | 35 | struct future_core_access; 36 | 37 | 38 | } // end detail 39 | 40 | 41 | template class future; 42 | 43 | 44 | template<> 45 | class future 46 | { 47 | public: 48 | __host__ __device__ 49 | ~future() 50 | { 51 | if(valid()) 52 | { 53 | #if __BULK_HAS_CUDART__ 54 | // swallow errors 55 | cudaError_t e = cudaEventDestroy(m_event); 56 | 57 | #if __BULK_HAS_PRINTF__ 58 | if(e) 59 | { 60 | printf("CUDA error after cudaEventDestroy in future dtor: %s", cudaGetErrorString(e)); 61 | } // end if 62 | #endif // __BULK_HAS_PRINTF__ 63 | 64 | if(m_owns_stream) 65 | { 66 | e = cudaStreamDestroy(m_stream); 67 | 68 | #if __BULK_HAS_PRINTF__ 69 | if(e) 70 | { 71 | printf("CUDA error after cudaStreamDestroy in future dtor: %s", cudaGetErrorString(e)); 72 | } // end if 73 | #endif // __BULK_HAS_PRINTF__ 74 | } // end if 75 | #endif 76 | } // end if 77 | } // end ~future() 78 | 79 | __host__ __device__ 80 | void wait() const 81 | { 82 | // XXX should probably check for valid() here 83 | 84 | #if __BULK_HAS_CUDART__ 85 | 86 | #ifndef __CUDA_ARCH__ 87 | // XXX need to capture the error as an exception and then throw it in .get() 88 | bulk::detail::throw_on_error(cudaEventSynchronize(m_event), "cudaEventSynchronize in future::wait"); 89 | #else 90 | // XXX need to capture the error as an exception and then throw it in .get() 91 | bulk::detail::throw_on_error(cudaDeviceSynchronize(), "cudaDeviceSynchronize in future::wait"); 92 | #endif // __CUDA_ARCH__ 93 | 94 | #else 95 | // XXX should terminate with a message 96 | bulk::detail::terminate(); 97 | #endif // __BULK_HAS_CUDART__ 98 | } // end wait() 99 | 100 | __host__ __device__ 101 | bool valid() const 102 | { 103 | return m_event != 0; 104 | } // end valid() 105 | 106 | __host__ __device__ 107 | future() 108 | : m_stream(0), m_event(0), m_owns_stream(false) 109 | {} 110 | 111 | // simulate a move 112 | // XXX need to add rval_ref or something 113 | __host__ __device__ 114 | future(const future &other) 115 | : m_stream(0), m_event(0), m_owns_stream(false) 116 | { 117 | thrust::swap(m_stream, const_cast(other).m_stream); 118 | thrust::swap(m_event, const_cast(other).m_event); 119 | thrust::swap(m_owns_stream, const_cast(other).m_owns_stream); 120 | } // end future() 121 | 122 | // simulate a move 123 | // XXX need to add rval_ref or something 124 | __host__ __device__ 125 | future &operator=(const future &other) 126 | { 127 | thrust::swap(m_stream, const_cast(other).m_stream); 128 | thrust::swap(m_event, const_cast(other).m_event); 129 | thrust::swap(m_owns_stream, const_cast(other).m_owns_stream); 130 | return *this; 131 | } // end operator=() 132 | 133 | private: 134 | friend struct detail::future_core_access; 135 | 136 | __host__ __device__ 137 | future(cudaStream_t s, bool owns_stream) 138 | : m_stream(s),m_owns_stream(owns_stream) 139 | { 140 | #if __BULK_HAS_CUDART__ 141 | bulk::detail::throw_on_error(cudaEventCreateWithFlags(&m_event, create_flags), "cudaEventCreateWithFlags in future ctor"); 142 | bulk::detail::throw_on_error(cudaEventRecord(m_event, m_stream), "cudaEventRecord in future ctor"); 143 | #endif 144 | } // end future() 145 | 146 | // XXX this combination makes the constructor expensive 147 | //static const int create_flags = cudaEventDisableTiming | cudaEventBlockingSync; 148 | static const int 
create_flags = cudaEventDisableTiming; 149 | 150 | cudaStream_t m_stream; 151 | cudaEvent_t m_event; 152 | bool m_owns_stream; 153 | }; // end future 154 | 155 | 156 | namespace detail 157 | { 158 | 159 | 160 | struct future_core_access 161 | { 162 | __host__ __device__ 163 | inline static future create(cudaStream_t s, bool owns_stream) 164 | { 165 | return future(s, owns_stream); 166 | } // end create_in_stream() 167 | 168 | __host__ __device__ 169 | inline static cudaEvent_t event(const future &f) 170 | { 171 | return f.m_event; 172 | } // end event() 173 | }; // end future_core_access 174 | 175 | 176 | } // end detail 177 | 178 | 179 | } // end namespace bulk 180 | BULK_NAMESPACE_SUFFIX 181 | 182 | -------------------------------------------------------------------------------- /bulk/iterator.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | -------------------------------------------------------------------------------- /bulk/iterator/strided_iterator.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
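Note that future<void> above only simulates move semantics: its copy constructor and copy assignment swap state out of their source, so "copying" a future transfers ownership of the stream and event. A small sketch of the consequence, assuming <bulk/bulk.hpp> pulls in async, par, and future; the noop functor is made up for illustration:

```
#include <bulk/bulk.hpp>
#include <cassert>

struct noop
{
  __host__ __device__ void operator()() {}
};

int main()
{
  bulk::future<void> f1 = bulk::async(bulk::par(1), noop());
  bulk::future<void> f2 = f1;   // "copy" actually steals f1's stream and event

  assert(!f1.valid());          // f1 no longer refers to the task
  f2.wait();                    // f2 does; wait on it instead

  return 0;
}
```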
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | 22 | 23 | BULK_NAMESPACE_PREFIX 24 | namespace bulk 25 | { 26 | 27 | 28 | template::type> 30 | class strided_iterator 31 | : public thrust::iterator_adaptor< 32 | strided_iterator, 33 | Iterator 34 | > 35 | { 36 | private: 37 | typedef thrust::iterator_adaptor,Iterator> super_t; 38 | 39 | public: 40 | typedef Size stride_type; 41 | 42 | inline __host__ __device__ 43 | strided_iterator() 44 | : super_t(), m_stride(1) 45 | {} 46 | 47 | inline __host__ __device__ 48 | strided_iterator(const strided_iterator& other) 49 | : super_t(other), m_stride(other.m_stride) 50 | {} 51 | 52 | inline __host__ __device__ 53 | strided_iterator(const Iterator &base, stride_type stride) 54 | : super_t(base), m_stride(stride) 55 | {} 56 | 57 | inline __host__ __device__ 58 | stride_type stride() const 59 | { 60 | return m_stride; 61 | } 62 | 63 | private: 64 | friend class thrust::iterator_core_access; 65 | 66 | __host__ __device__ 67 | void increment() 68 | { 69 | super_t::base_reference() += stride(); 70 | } 71 | 72 | __host__ __device__ 73 | void decrement() 74 | { 75 | super_t::base_reference() -= stride(); 76 | } 77 | 78 | __host__ __device__ 79 | void advance(typename super_t::difference_type n) 80 | { 81 | super_t::base_reference() += n * stride(); 82 | } 83 | 84 | template 85 | __host__ __device__ 86 | typename super_t::difference_type distance_to(const strided_iterator &other) const 87 | { 88 | if(other.base() >= this->base()) 89 | { 90 | return (other.base() - this->base() + (stride() - 1)) / stride(); 91 | } 92 | 93 | return (other.base() - this->base() - (stride() - 1)) / stride(); 94 | } 95 | 96 | stride_type m_stride; 97 | }; 98 | 99 | 100 | template 101 | __host__ __device__ 102 | strided_iterator make_strided_iterator(Iterator iter, Size stride) 103 | { 104 | return strided_iterator(iter, stride); 105 | } 106 | 107 | 108 | } // end bulk 109 | BULK_NAMESPACE_SUFFIX 110 | 111 | -------------------------------------------------------------------------------- /bulk/uninitialized.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
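make_strided_iterator above builds an iterator that advances by stride elements of the underlying range per increment. A short usage sketch (hypothetical, not taken from the example programs in this tree), assuming the header is included as <bulk/iterator/strided_iterator.hpp>:

```
#include <bulk/iterator/strided_iterator.hpp>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <iostream>

int main()
{
  thrust::device_vector<int> v(8);
  thrust::sequence(v.begin(), v.end());   // 0 1 2 3 4 5 6 7

  // stride of 2 starting at v.begin() refers to elements 0, 2, 4, 6
  bulk::strided_iterator<thrust::device_vector<int>::iterator, int> it
    = bulk::make_strided_iterator(v.begin(), 2);

  for(int i = 0; i < 4; ++i, ++it)
  {
    std::cout << *it << " ";              // prints "0 2 4 6 "
  }
  std::cout << std::endl;

  return 0;
}
```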
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | 25 | BULK_NAMESPACE_PREFIX 26 | namespace bulk 27 | { 28 | 29 | 30 | template 31 | class uninitialized 32 | { 33 | private: 34 | typename bulk::detail::aligned_storage< 35 | sizeof(T), 36 | bulk::detail::alignment_of::value 37 | >::type storage; 38 | 39 | __host__ __device__ __thrust_forceinline__ 40 | const T* ptr() const 41 | { 42 | const void *result = storage.data; 43 | return reinterpret_cast(result); 44 | } 45 | 46 | __host__ __device__ __thrust_forceinline__ 47 | T* ptr() 48 | { 49 | void *result = storage.data; 50 | return reinterpret_cast(result); 51 | } 52 | 53 | public: 54 | // copy assignment 55 | __host__ __device__ __thrust_forceinline__ 56 | uninitialized &operator=(const T &other) 57 | { 58 | T& self = *this; 59 | self = other; 60 | return *this; 61 | } 62 | 63 | __host__ __device__ __thrust_forceinline__ 64 | T& get() 65 | { 66 | return *ptr(); 67 | } 68 | 69 | __host__ __device__ __thrust_forceinline__ 70 | const T& get() const 71 | { 72 | return *ptr(); 73 | } 74 | 75 | __host__ __device__ __thrust_forceinline__ 76 | operator T& () 77 | { 78 | return get(); 79 | } 80 | 81 | __host__ __device__ __thrust_forceinline__ 82 | operator const T&() const 83 | { 84 | return get(); 85 | } 86 | 87 | __bulk_exec_check_disable__ 88 | __host__ __device__ __thrust_forceinline__ 89 | void construct() 90 | { 91 | ::new(ptr()) T(); 92 | } 93 | 94 | __bulk_exec_check_disable__ 95 | template 96 | __host__ __device__ __thrust_forceinline__ 97 | void construct(const Arg &a) 98 | { 99 | ::new(ptr()) T(a); 100 | } 101 | 102 | __bulk_exec_check_disable__ 103 | template 104 | __host__ __device__ __thrust_forceinline__ 105 | void construct(const Arg1 &a1, const Arg2 &a2) 106 | { 107 | ::new(ptr()) T(a1,a2); 108 | } 109 | 110 | __bulk_exec_check_disable__ 111 | template 112 | __host__ __device__ __thrust_forceinline__ 113 | void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3) 114 | { 115 | ::new(ptr()) T(a1,a2,a3); 116 | } 117 | 118 | __bulk_exec_check_disable__ 119 | template 120 | __host__ __device__ __thrust_forceinline__ 121 | void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4) 122 | { 123 | ::new(ptr()) T(a1,a2,a3,a4); 124 | } 125 | 126 | __bulk_exec_check_disable__ 127 | template 128 | __host__ __device__ __thrust_forceinline__ 129 | void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5) 130 | { 131 | ::new(ptr()) T(a1,a2,a3,a4,a5); 132 | } 133 | 134 | __bulk_exec_check_disable__ 135 | template 136 | __host__ __device__ __thrust_forceinline__ 137 | void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6) 138 | { 139 | ::new(ptr()) T(a1,a2,a3,a4,a5,a6); 140 | } 141 | 142 | __bulk_exec_check_disable__ 143 | template 144 | __host__ __device__ __thrust_forceinline__ 145 | void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7) 146 | { 147 | ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7); 148 | } 149 | 150 | __bulk_exec_check_disable__ 151 | template 152 | __host__ __device__ __thrust_forceinline__ 153 | void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8) 154 | { 155 | ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7,a8); 156 | } 157 | 158 | __bulk_exec_check_disable__ 159 | template 160 | __host__ __device__ __thrust_forceinline__ 
161 | void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8, const Arg9 &a9) 162 | { 163 | ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7,a8,a9); 164 | } 165 | 166 | __bulk_exec_check_disable__ 167 | template 168 | __host__ __device__ __thrust_forceinline__ 169 | void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8, const Arg9 &a9, const Arg10 &a10) 170 | { 171 | ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7,a8,a9,a10); 172 | } 173 | 174 | __bulk_exec_check_disable__ 175 | __host__ __device__ __thrust_forceinline__ 176 | void destroy() 177 | { 178 | T& self = *this; 179 | self.~T(); 180 | } 181 | }; 182 | 183 | 184 | template 185 | class uninitialized_array 186 | { 187 | public: 188 | typedef T value_type; 189 | typedef T& reference; 190 | typedef const T& const_reference; 191 | typedef T* pointer; 192 | typedef const T* const_pointer; 193 | typedef pointer iterator; 194 | typedef const_pointer const_iterator; 195 | typedef std::size_t size_type; 196 | 197 | __thrust_forceinline__ __host__ __device__ 198 | iterator begin() 199 | { 200 | return data(); 201 | } 202 | 203 | __thrust_forceinline__ __host__ __device__ 204 | const_iterator begin() const 205 | { 206 | return data(); 207 | } 208 | 209 | __thrust_forceinline__ __host__ __device__ 210 | iterator end() 211 | { 212 | return begin() + size(); 213 | } 214 | 215 | __thrust_forceinline__ __host__ __device__ 216 | const_iterator end() const 217 | { 218 | return begin() + size(); 219 | } 220 | 221 | __thrust_forceinline__ __host__ __device__ 222 | const_iterator cbegin() const 223 | { 224 | return begin(); 225 | } 226 | 227 | __thrust_forceinline__ __host__ __device__ 228 | const_iterator cend() const 229 | { 230 | return end(); 231 | } 232 | 233 | __thrust_forceinline__ __host__ __device__ 234 | size_type size() const 235 | { 236 | return N; 237 | } 238 | 239 | __thrust_forceinline__ __host__ __device__ 240 | bool empty() const 241 | { 242 | return false; 243 | } 244 | 245 | __thrust_forceinline__ __host__ __device__ 246 | T* data() 247 | { 248 | return impl.get(); 249 | } 250 | 251 | __thrust_forceinline__ __host__ __device__ 252 | const T* data() const 253 | { 254 | return impl.get(); 255 | } 256 | 257 | // element access 258 | __thrust_forceinline__ __host__ __device__ 259 | reference operator[](size_type n) 260 | { 261 | return data()[n]; 262 | } 263 | 264 | __thrust_forceinline__ __host__ __device__ 265 | const_reference operator[](size_type n) const 266 | { 267 | return data()[n]; 268 | } 269 | 270 | __thrust_forceinline__ __host__ __device__ 271 | reference front() 272 | { 273 | return *data(); 274 | } 275 | 276 | __thrust_forceinline__ __host__ __device__ 277 | const_reference front() const 278 | { 279 | return *data(); 280 | } 281 | 282 | __thrust_forceinline__ __host__ __device__ 283 | reference back() 284 | { 285 | return data()[size() - size_type(1)]; 286 | } 287 | 288 | __thrust_forceinline__ __host__ __device__ 289 | const_reference back() const 290 | { 291 | return data()[size() - size_type(1)]; 292 | } 293 | 294 | private: 295 | uninitialized impl; 296 | }; 297 | 298 | 299 | } // end bulk 300 | BULK_NAMESPACE_SUFFIX 301 | 302 | -------------------------------------------------------------------------------- /decomposition.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | template 6 | class 
trivial_decomposition 7 | { 8 | public: 9 | typedef Size size_type; 10 | 11 | typedef thrust::pair range; 12 | 13 | __host__ __device__ 14 | trivial_decomposition() 15 | : m_n(0) 16 | {} 17 | 18 | __host__ __device__ 19 | trivial_decomposition(size_type n) 20 | : m_n(n) 21 | {} 22 | 23 | __host__ __device__ 24 | range operator[](size_type) const 25 | { 26 | return range(0, n()); 27 | } 28 | 29 | __host__ __device__ 30 | size_type size() const 31 | { 32 | return 1; 33 | } 34 | 35 | // XXX think of a better name for this 36 | __host__ __device__ 37 | size_type n() const 38 | { 39 | return m_n; 40 | } 41 | 42 | private: 43 | Size m_n; 44 | }; 45 | 46 | 47 | template 48 | __host__ __device__ 49 | trivial_decomposition make_trivial_decomposition(Size n) 50 | { 51 | return trivial_decomposition(n); 52 | } 53 | 54 | 55 | template 56 | class blocked_decomposition 57 | { 58 | public: 59 | typedef Size size_type; 60 | 61 | typedef thrust::pair range; 62 | 63 | __host__ __device__ 64 | blocked_decomposition() 65 | : m_n(0), 66 | m_block_size(0), 67 | m_num_partitions(0) 68 | {} 69 | 70 | __host__ __device__ 71 | blocked_decomposition(size_type n, Size block_size) 72 | : m_n(n), 73 | m_block_size(block_size), 74 | m_num_partitions((n + block_size - 1) / block_size) 75 | {} 76 | 77 | __host__ __device__ 78 | range operator[](size_type i) const 79 | { 80 | size_type first = i * m_block_size; 81 | size_type last = thrust::min(m_n, first + m_block_size); 82 | 83 | return range(first, last); 84 | } 85 | 86 | __host__ __device__ 87 | size_type size() const 88 | { 89 | return m_num_partitions; 90 | } 91 | 92 | // XXX think of a better name for this 93 | __host__ __device__ 94 | size_type n() const 95 | { 96 | return m_n; 97 | } 98 | 99 | private: 100 | Size m_n; 101 | Size m_block_size; 102 | Size m_num_partitions; 103 | }; 104 | 105 | 106 | template 107 | __host__ __device__ 108 | blocked_decomposition make_blocked_decomposition(Size n, Size block_size) 109 | { 110 | return blocked_decomposition(n,block_size); 111 | } 112 | 113 | 114 | template 115 | class uniform_decomposition 116 | : public blocked_decomposition 117 | { 118 | private: 119 | typedef blocked_decomposition super_t; 120 | 121 | public: 122 | __host__ __device__ 123 | uniform_decomposition() 124 | : super_t() 125 | {} 126 | 127 | __host__ __device__ 128 | uniform_decomposition(Size n, Size num_partitions) 129 | : super_t(n, n / num_partitions) 130 | {} 131 | }; 132 | 133 | 134 | template 135 | __host__ __device__ 136 | uniform_decomposition make_uniform_decomposition(Size n, Size num_partitions) 137 | { 138 | return uniform_decomposition(n,num_partitions); 139 | } 140 | 141 | 142 | template 143 | class aligned_decomposition 144 | { 145 | public: 146 | typedef Size size_type; 147 | 148 | typedef thrust::pair range; 149 | 150 | __host__ __device__ 151 | aligned_decomposition() 152 | : m_n(0), 153 | m_num_partitions(0), 154 | m_tile_size(0) 155 | {} 156 | 157 | __host__ __device__ 158 | aligned_decomposition(Size n, Size num_partitions, Size aligned_size) 159 | : m_n(n), 160 | m_num_partitions(num_partitions), 161 | m_tile_size(aligned_size) 162 | { 163 | size_type num_tiles = (n + m_tile_size - 1) / m_tile_size; 164 | 165 | m_num_tiles_per_partition = num_tiles / size(); 166 | m_last_partial_tile_size = num_tiles % size(); 167 | } 168 | 169 | __host__ __device__ 170 | range operator[](Size i) const 171 | { 172 | range result = range_in_tiles(i); 173 | result.first *= m_tile_size; 174 | result.second = thrust::min(m_n, result.second * 
m_tile_size); 175 | return result; 176 | } 177 | 178 | __host__ __device__ 179 | size_type size() const 180 | { 181 | return m_num_partitions; 182 | } 183 | 184 | // XXX think of a better name for this 185 | __host__ __device__ 186 | size_type n() const 187 | { 188 | return m_n; 189 | } 190 | 191 | private: 192 | __host__ __device__ 193 | range range_in_tiles(size_type i) const 194 | { 195 | range result; 196 | 197 | result.first = m_num_tiles_per_partition * i; 198 | result.first += thrust::min(i, m_last_partial_tile_size); 199 | 200 | result.second = result.first + m_num_tiles_per_partition + (i < m_last_partial_tile_size); 201 | 202 | return result; 203 | } 204 | 205 | size_type m_n; 206 | size_type m_num_partitions; 207 | size_type m_num_tiles_per_partition; 208 | size_type m_tile_size; 209 | size_type m_last_partial_tile_size; 210 | }; 211 | 212 | 213 | template 214 | __host__ __device__ 215 | aligned_decomposition make_aligned_decomposition(Size n, Size num_partitions, Size aligned_size) 216 | { 217 | return aligned_decomposition(n,num_partitions,aligned_size); 218 | } 219 | 220 | -------------------------------------------------------------------------------- /for_each.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | struct for_each_kernel 7 | { 8 | template 9 | __host__ __device__ 10 | void operator()(bulk::agent<> &self, Iterator first, Function f) 11 | { 12 | f(first[self.index()]); 13 | } 14 | }; 15 | 16 | struct print_functor 17 | { 18 | __host__ __device__ 19 | void operator()(int x) 20 | { 21 | printf("%d\n", x); 22 | } 23 | }; 24 | 25 | int main() 26 | { 27 | size_t n = 32; 28 | 29 | thrust::device_vector vec(n); 30 | thrust::sequence(vec.begin(), vec.end()); 31 | 32 | bulk::async(bulk::par(n), for_each_kernel(), bulk::root.this_exec, vec.begin(), print_functor()).wait(); 33 | 34 | return 0; 35 | } 36 | 37 | -------------------------------------------------------------------------------- /futures.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | struct task1 6 | { 7 | __device__ 8 | void operator()() 9 | { 10 | printf("Hello world from task1\n"); 11 | } 12 | }; 13 | 14 | struct task2 15 | { 16 | __device__ 17 | void operator()() 18 | { 19 | printf("Hello world from task2\n"); 20 | } 21 | }; 22 | 23 | void task3() 24 | { 25 | printf("Hello world from task3\n"); 26 | }; 27 | 28 | int main() 29 | { 30 | cudaStream_t s1; 31 | cudaStreamCreate(&s1); 32 | 33 | using bulk::par; 34 | using bulk::async; 35 | 36 | // we can insert a task into a stream directly 37 | bulk::future t1 = async(par(s1, 1), task1()); 38 | 39 | // or we can make a new task depend on a previous future 40 | bulk::future t2 = async(par(t1, 1), task2()); 41 | 42 | // task3 is independent of both task1 & task2 and executes in this thread 43 | task3(); 44 | 45 | t1.wait(); 46 | t2.wait(); 47 | 48 | cudaStreamDestroy(s1); 49 | 50 | return 0; 51 | } 52 | 53 | -------------------------------------------------------------------------------- /head_flags.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
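Each decomposition above answers the same question: which contiguous index range [first, second) does partition i own? A worked example for blocked_decomposition, assuming decomposition.hpp's stripped #include lines are restored and the file is compiled with nvcc like the rest of the tree:

```
#include "decomposition.hpp"
#include <cstdio>

int main()
{
  // 10 elements in blocks of 4 -> 3 partitions: [0,4) [4,8) [8,10)
  blocked_decomposition<int> d = make_blocked_decomposition(10, 4);

  for(int i = 0; i < d.size(); ++i)
  {
    std::printf("partition %d: [%d, %d)\n", i, (int)d[i].first, (int)d[i].second);
  }

  return 0;
}
```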
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | 26 | template::type>, 28 | typename ValueType = bool, 29 | typename IndexType = typename thrust::iterator_difference::type> 30 | class head_flags_with_init 31 | { 32 | typedef typename thrust::iterator_value::type init_type; 33 | 34 | // XXX WAR cudafe issue 35 | //private: 36 | public: 37 | struct head_flag_functor 38 | { 39 | BinaryPredicate binary_pred; // this must be the first member for performance reasons 40 | init_type init; 41 | IndexType n; 42 | 43 | typedef ValueType result_type; 44 | 45 | __host__ __device__ 46 | head_flag_functor(init_type init, IndexType n) 47 | : binary_pred(), init(init), n(n) 48 | {} 49 | 50 | __host__ __device__ 51 | head_flag_functor(init_type init, IndexType n, BinaryPredicate binary_pred) 52 | : binary_pred(binary_pred), init(init), n(n) 53 | {} 54 | 55 | template 56 | __host__ __device__ __thrust_forceinline__ 57 | result_type operator()(const Tuple &t) 58 | { 59 | const IndexType i = thrust::get<0>(t); 60 | 61 | if(i == 0) 62 | { 63 | return !binary_pred(init, thrust::get<1>(t)); 64 | } 65 | 66 | return !binary_pred(thrust::get<1>(t), thrust::get<2>(t)); 67 | } 68 | }; 69 | 70 | typedef thrust::counting_iterator counting_iterator; 71 | 72 | public: 73 | typedef thrust::transform_iterator< 74 | head_flag_functor, 75 | thrust::zip_iterator > 76 | > iterator; 77 | 78 | #pragma hd_warning_disable 79 | __host__ __device__ 80 | head_flags_with_init(RandomAccessIterator first, RandomAccessIterator last, init_type init) 81 | : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator(0), first, first - 1)), 82 | head_flag_functor(init, last - first))), 83 | m_end(m_begin + (last - first)) 84 | {} 85 | 86 | __host__ __device__ 87 | head_flags_with_init(RandomAccessIterator first, RandomAccessIterator last, init_type init, BinaryPredicate binary_pred) 88 | : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator(0), first, first - 1)), 89 | head_flag_functor(init, last - first, binary_pred))), 90 | m_end(m_begin + (last - first)) 91 | {} 92 | 93 | __host__ __device__ 94 | iterator begin() const 95 | { 96 | return m_begin; 97 | } 98 | 99 | __host__ __device__ 100 | iterator end() const 101 | { 102 | return m_end; 103 | } 104 | 105 | template 106 | __host__ __device__ 107 | typename iterator::reference operator[](OtherIndex i) 108 | { 109 | return *(begin() + i); 110 | } 111 | 112 | private: 113 | iterator m_begin, m_end; 114 | }; 115 | 116 | 117 | 118 | template::type>, 120 | typename ValueType = bool, 121 | typename IndexType = typename thrust::iterator_difference::type> 122 | class head_flags 123 | { 124 | // XXX WAR cudafe issue 125 | //private: 126 | public: 127 | struct head_flag_functor 128 | { 129 | BinaryPredicate binary_pred; // this must be the first member for performance reasons 130 | IndexType n; 131 | 132 | typedef ValueType result_type; 133 | 134 | __host__ __device__ 135 | 
head_flag_functor(IndexType n) 136 | : binary_pred(), n(n) 137 | {} 138 | 139 | __host__ __device__ 140 | head_flag_functor(IndexType n, BinaryPredicate binary_pred) 141 | : binary_pred(binary_pred), n(n) 142 | {} 143 | 144 | template 145 | __host__ __device__ __thrust_forceinline__ 146 | result_type operator()(const Tuple &t) 147 | { 148 | const IndexType i = thrust::get<0>(t); 149 | 150 | // note that we do not dereference the tuple's 2nd element when i <= 0 151 | // and therefore do not dereference a bad location at the boundary 152 | return (i == 0 || !binary_pred(thrust::get<1>(t), thrust::get<2>(t))); 153 | } 154 | }; 155 | 156 | typedef thrust::counting_iterator counting_iterator; 157 | 158 | public: 159 | typedef thrust::transform_iterator< 160 | head_flag_functor, 161 | thrust::zip_iterator > 162 | > iterator; 163 | 164 | __host__ __device__ 165 | head_flags(RandomAccessIterator first, RandomAccessIterator last) 166 | : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator(0), first, first - 1)), 167 | head_flag_functor(last - first))), 168 | m_end(m_begin + (last - first)) 169 | {} 170 | 171 | __host__ __device__ 172 | head_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred) 173 | : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator(0), first, first - 1)), 174 | head_flag_functor(last - first, binary_pred))), 175 | m_end(m_begin + (last - first)) 176 | {} 177 | 178 | __host__ __device__ 179 | iterator begin() const 180 | { 181 | return m_begin; 182 | } 183 | 184 | __host__ __device__ 185 | iterator end() const 186 | { 187 | return m_end; 188 | } 189 | 190 | template 191 | __host__ __device__ 192 | typename iterator::reference operator[](OtherIndex i) 193 | { 194 | return *(begin() + i); 195 | } 196 | 197 | private: 198 | iterator m_begin, m_end; 199 | }; 200 | 201 | 202 | template 203 | __host__ __device__ 204 | head_flags 205 | make_head_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred) 206 | { 207 | return head_flags(first, last, binary_pred); 208 | } 209 | 210 | 211 | template 212 | __host__ __device__ 213 | head_flags 214 | make_head_flags(RandomAccessIterator first, RandomAccessIterator last) 215 | { 216 | return head_flags(first, last); 217 | } 218 | 219 | 220 | -------------------------------------------------------------------------------- /hello_world.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | struct hello 5 | { 6 | __host__ __device__ 7 | void operator()() 8 | { 9 | printf("Hello world!\n"); 10 | } 11 | 12 | __host__ __device__ 13 | void operator()(bulk::parallel_group<> &g) 14 | { 15 | printf("Hello world from agent %d\n", g.this_exec.index()); 16 | } 17 | }; 18 | 19 | int main() 20 | { 21 | bulk::async(bulk::par(1), hello()); 22 | 23 | // wait for this async to finish before exiting the program 24 | bulk::async(bulk::par(32), hello(), bulk::root).wait(); 25 | 26 | return 0; 27 | } 28 | 29 | -------------------------------------------------------------------------------- /join_iterator.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | template 13 | class join_iterator; 14 | 15 | 16 | namespace detail 17 | { 18 | 19 | 20 | template 24 | struct join_iterator_base 25 | { 26 | typedef 
typename thrust::detail::remove_reference::type value_type; 27 | 28 | typedef typename thrust::iterator_system::type system1; 29 | typedef typename thrust::iterator_system::type system2; 30 | typedef typename thrust::detail::minimum_system::type system; 31 | 32 | typedef thrust::iterator_adaptor< 33 | join_iterator, 34 | thrust::counting_iterator, 35 | value_type, 36 | system, 37 | thrust::random_access_traversal_tag, 38 | Reference, 39 | Difference 40 | > type; 41 | }; // end join_iterator_base 42 | 43 | 44 | } // end detail 45 | 46 | 47 | template::type, 50 | typename Reference = typename thrust::iterator_value::type> 51 | class join_iterator 52 | : public detail::join_iterator_base::type 53 | { 54 | private: 55 | typedef typename detail::join_iterator_base::type super_t; 56 | typedef typename super_t::difference_type size_type; 57 | 58 | public: 59 | inline __host__ __device__ 60 | join_iterator(RandomAccessIterator1 first1, size_type n, RandomAccessIterator2 first2) 61 | : super_t(thrust::counting_iterator(0)), 62 | m_n1(n), 63 | m_iter1(first1), 64 | m_iter2(first2 - m_n1) 65 | {} 66 | 67 | 68 | inline __host__ __device__ 69 | join_iterator(const join_iterator &other) 70 | : super_t(other), 71 | m_n1(other.m_n1), 72 | m_iter1(other.m_iter1), 73 | m_iter2(other.m_iter2) 74 | {} 75 | 76 | 77 | private: 78 | friend class thrust::iterator_core_access; 79 | 80 | 81 | __host__ __device__ 82 | typename super_t::reference dereference() const 83 | { 84 | size_type i = *super_t::base(); 85 | return (i < m_n1) ? m_iter1[i] : m_iter2[i]; 86 | } // end dereference() 87 | 88 | 89 | size_type m_n1; 90 | RandomAccessIterator1 m_iter1; 91 | RandomAccessIterator2 m_iter2; 92 | }; // end join_iterator 93 | 94 | 95 | template 96 | __host__ __device__ 97 | join_iterator make_join_iterator(RandomAccessIterator1 first1, Size n1, RandomAccessIterator2 first2) 98 | { 99 | return join_iterator(first1, n1, first2); 100 | } // end make_join_iterator() 101 | 102 | -------------------------------------------------------------------------------- /ping_pong.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | struct ping 6 | { 7 | __device__ 8 | void operator()(volatile int *ball) 9 | { 10 | *ball = 1; 11 | 12 | for(unsigned int next_state = 2; 13 | next_state < 25; 14 | next_state += 2) 15 | { 16 | while(*ball != next_state) 17 | { 18 | printf("ping waiting for return\n"); 19 | } 20 | 21 | *ball += 1; 22 | 23 | printf("ping! ball is now %d\n", next_state + 1); 24 | } 25 | } 26 | }; 27 | 28 | struct pong 29 | { 30 | __device__ 31 | void operator()(volatile int *ball) 32 | { 33 | for(unsigned int next_state = 1; 34 | next_state < 25; 35 | next_state += 2) 36 | { 37 | while(*ball != next_state) 38 | { 39 | printf("pong waiting for return\n"); 40 | } 41 | 42 | *ball += 1; 43 | 44 | printf("pong! 
ball is now %d\n", next_state + 1); 45 | } 46 | } 47 | }; 48 | 49 | int main() 50 | { 51 | cudaStream_t s1, s2; 52 | cudaStreamCreate(&s1); 53 | cudaStreamCreate(&s2); 54 | 55 | using bulk::par; 56 | using bulk::async; 57 | 58 | thrust::device_vector ball(1); 59 | 60 | bulk::future t1 = async(par(s1, 1), ping(), thrust::raw_pointer_cast(&*ball.data())); 61 | bulk::future t2 = async(par(s2, 1), pong(), thrust::raw_pointer_cast(&*ball.data())); 62 | 63 | t1.wait(); 64 | t2.wait(); 65 | 66 | cudaStreamDestroy(s1); 67 | cudaStreamDestroy(s2); 68 | 69 | return 0; 70 | } 71 | 72 | -------------------------------------------------------------------------------- /reduce.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "time_invocation_cuda.hpp" 10 | #include "decomposition.hpp" 11 | 12 | 13 | struct reduce_partitions 14 | { 15 | template 16 | __device__ 17 | void operator()(ConcurrentGroup &this_group, Iterator1 first, Iterator1 last, Iterator2 result, T init, BinaryOperation binary_op) 18 | { 19 | T sum = bulk::reduce(this_group, first, last, init, binary_op); 20 | 21 | if(this_group.this_exec.index() == 0) 22 | { 23 | *result = sum; 24 | } 25 | } 26 | 27 | template 28 | __device__ 29 | void operator()(ConcurrentGroup &this_group, Iterator1 first, Iterator1 last, Iterator2 result, BinaryOperation binary_op) 30 | { 31 | // noticeably faster to pass the last element as the init 32 | typename thrust::iterator_value::type init = last[-1]; 33 | (*this)(this_group, first, last - 1, result, init, binary_op); 34 | } 35 | 36 | 37 | template 38 | __device__ 39 | void operator()(ConcurrentGroup &this_group, Iterator1 first, Decomposition decomp, Iterator2 result, T init, BinaryFunction binary_op) 40 | { 41 | typename Decomposition::range range = decomp[this_group.index()]; 42 | 43 | Iterator1 last = first + range.second; 44 | first += range.first; 45 | 46 | if(this_group.index() != 0) 47 | { 48 | // noticeably faster to pass the last element as the init 49 | init = last[-1]; 50 | --last; 51 | } // end if 52 | 53 | (*this)(this_group, first, last, result + this_group.index(), init, binary_op); 54 | } 55 | }; 56 | 57 | 58 | template 61 | T my_reduce(RandomAccessIterator first, RandomAccessIterator last, T init, BinaryOperation binary_op) 62 | { 63 | typedef typename thrust::iterator_difference::type size_type; 64 | 65 | const size_type n = last - first; 66 | 67 | if(n <= 0) return init; 68 | 69 | const size_type groupsize = 128; 70 | const size_type grainsize = 7; 71 | const size_type tile_size = groupsize * grainsize; 72 | const size_type num_tiles = (n + tile_size - 1) / tile_size; 73 | const size_type subscription = 10; 74 | 75 | bulk::concurrent_group< 76 | bulk::agent, 77 | groupsize 78 | > g; 79 | 80 | const size_type num_groups = thrust::min(subscription * g.hardware_concurrency(), num_tiles); 81 | 82 | aligned_decomposition decomp(n, num_groups, tile_size); 83 | 84 | thrust::cuda::tag t; 85 | thrust::detail::temporary_array partial_sums(t, decomp.size()); 86 | 87 | // reduce into partial sums 88 | bulk::async(bulk::par(g, decomp.size()), reduce_partitions(), bulk::root.this_exec, first, decomp, partial_sums.begin(), init, binary_op); 89 | 90 | if(partial_sums.size() > 1) 91 | { 92 | // reduce the partial sums 93 | bulk::async(g, reduce_partitions(), bulk::root, partial_sums.begin(), partial_sums.end(), partial_sums.begin(), binary_op); 94 | } // end 
while 95 | 96 | return partial_sums[0]; 97 | } // end my_reduce() 98 | 99 | 100 | template 101 | T my_reduce(const thrust::device_vector *vec) 102 | { 103 | return my_reduce(vec->begin(), vec->end(), T(0), thrust::plus()); 104 | } 105 | 106 | 107 | template 108 | T thrust_reduce(const thrust::device_vector *vec) 109 | { 110 | return thrust::reduce(vec->begin(), vec->end(), T(0), thrust::plus()); 111 | } 112 | 113 | 114 | template 115 | void compare() 116 | { 117 | thrust::device_vector vec(1 << 28); 118 | 119 | thrust_reduce(&vec); 120 | double thrust_msecs = time_invocation_cuda(50, thrust_reduce, &vec); 121 | 122 | my_reduce(&vec); 123 | double my_msecs = time_invocation_cuda(50, my_reduce, &vec); 124 | 125 | std::cout << "Thrust's time: " << thrust_msecs << " ms" << std::endl; 126 | std::cout << "My time: " << my_msecs << " ms" << std::endl; 127 | 128 | std::cout << "Performance relative to Thrust: " << thrust_msecs / my_msecs << std::endl; 129 | } 130 | 131 | 132 | int main() 133 | { 134 | size_t n = 123456789; 135 | 136 | thrust::device_vector vec(n); 137 | 138 | thrust::sequence(vec.begin(), vec.end()); 139 | 140 | int my_result = my_reduce(vec.begin(), vec.end(), 13, thrust::plus()); 141 | 142 | std::cout << "my_result: " << my_result << std::endl; 143 | 144 | int thrust_result = thrust::reduce(vec.begin(), vec.end(), 13, thrust::plus()); 145 | 146 | std::cout << "thrust_result: " << thrust_result << std::endl; 147 | 148 | assert(thrust_result == my_result); 149 | 150 | std::cout << "int: " << std::endl; 151 | compare(); 152 | 153 | std::cout << "long int: " << std::endl; 154 | compare(); 155 | 156 | std::cout << "float: " << std::endl; 157 | compare(); 158 | 159 | std::cout << "double: " << std::endl; 160 | compare(); 161 | } 162 | 163 | -------------------------------------------------------------------------------- /reduce_intervals.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "decomposition.hpp" 5 | 6 | struct reduce_intervals_kernel 7 | { 8 | template 9 | __device__ void operator()(bulk::concurrent_group,groupsize> &this_group, 10 | RandomAccessIterator1 first, 11 | Decomposition decomp, 12 | RandomAccessIterator2 result, 13 | BinaryFunction binary_op) 14 | { 15 | typedef typename thrust::iterator_value::type value_type; 16 | 17 | typename Decomposition::range rng = decomp[this_group.index()]; 18 | 19 | value_type init = first[rng.second-1]; 20 | 21 | value_type sum = bulk::reduce(this_group, first + rng.first, first + rng.second - 1, init, binary_op); 22 | 23 | if(this_group.this_exec.index() == 0) 24 | { 25 | result[this_group.index()] = sum; 26 | } // end if 27 | } // end operator() 28 | }; // end reduce_intervals_kernel 29 | 30 | 31 | template 32 | RandomAccessIterator2 reduce_intervals(RandomAccessIterator1 first, Decomposition decomp, RandomAccessIterator2 result, BinaryFunction binary_op) 33 | { 34 | typedef typename thrust::iterator_value::type result_type; 35 | const size_t groupsize = 128; 36 | size_t heap_size = groupsize * sizeof(result_type); 37 | bulk::async(bulk::grid(decomp.size(),heap_size), reduce_intervals_kernel(), bulk::root.this_exec, first, decomp, result, binary_op); 38 | 39 | return result + decomp.size(); 40 | } // end reduce_intervals() 41 | 42 | 43 | template 44 | RandomAccessIterator2 reduce_intervals(RandomAccessIterator1 first, RandomAccessIterator1 last, Size interval_size, RandomAccessIterator2 result, BinaryFunction binary_op) 45 | { 46 | return 
reduce_intervals(first, make_blocked_decomposition(last - first,interval_size), result, binary_op); 47 | } // end reduce_intervals() 48 | 49 | -------------------------------------------------------------------------------- /saxpy.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | struct saxpy 9 | { 10 | __host__ __device__ 11 | void operator()(bulk::agent<> &self, float a, float *x, float *y) 12 | { 13 | int i = self.index(); 14 | y[i] = a * x[i] + y[i]; 15 | } 16 | }; 17 | 18 | int main() 19 | { 20 | size_t n = 1 << 24; 21 | thrust::device_vector x(n, 1); 22 | thrust::device_vector y(n, 1); 23 | 24 | float a = 13; 25 | 26 | bulk::async(bulk::par(n), saxpy(), bulk::root.this_exec, a, thrust::raw_pointer_cast(x.data()), thrust::raw_pointer_cast(y.data())); 27 | 28 | assert(thrust::all_of(y.begin(), y.end(), thrust::placeholders::_1 == 14)); 29 | 30 | std::cout << "It worked!" << std::endl; 31 | 32 | return 0; 33 | } 34 | 35 | -------------------------------------------------------------------------------- /scan.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "time_invocation_cuda.hpp" 9 | #include 10 | #include 11 | #include 12 | #include "decomposition.hpp" 13 | 14 | 15 | struct inclusive_scan_n 16 | { 17 | template 18 | __device__ void operator()(ConcurrentGroup &this_group, InputIterator first, Size n, OutputIterator result, T init, BinaryFunction binary_op) 19 | { 20 | bulk::inclusive_scan(this_group, first, first + n, result, init, binary_op); 21 | } 22 | }; 23 | 24 | struct exclusive_scan_n 25 | { 26 | template 27 | __device__ void operator()(ConcurrentGroup &this_group, InputIterator first, Size n, OutputIterator result, T init, BinaryFunction binary_op) 28 | { 29 | bulk::exclusive_scan(this_group, first, first + n, result, init, binary_op); 30 | } 31 | }; 32 | 33 | 34 | struct inclusive_downsweep 35 | { 36 | template 37 | __device__ void operator()(ConcurrentGroup &this_group, 38 | RandomAccessIterator1 first, 39 | Decomposition decomp, 40 | RandomAccessIterator2 carries_first, 41 | RandomAccessIterator3 result, 42 | BinaryFunction binary_op) 43 | { 44 | typename Decomposition::range range = decomp[this_group.index()]; 45 | 46 | RandomAccessIterator1 last = first + range.second; 47 | first += range.first; 48 | result += range.first; 49 | 50 | typename thrust::iterator_value::type carry = carries_first[this_group.index()]; 51 | 52 | bulk::inclusive_scan(this_group, first, last, result, carry, binary_op); 53 | } 54 | }; 55 | 56 | 57 | struct accumulate_tiles 58 | { 59 | template 60 | __device__ void operator()(ConcurrentGroup &this_group, 61 | RandomAccessIterator1 first, 62 | Decomposition decomp, 63 | RandomAccessIterator2 result, 64 | BinaryFunction binary_op) 65 | { 66 | typedef typename thrust::iterator_value::type value_type; 67 | 68 | typename Decomposition::range range = decomp[this_group.index()]; 69 | 70 | const bool commutative = thrust::detail::is_commutative::value; 71 | 72 | // for a commutative accumulate, it's much faster to pass the last value as the init for some reason 73 | value_type init = commutative ? first[range.second-1] : *first; 74 | 75 | value_type sum = commutative ? 
76 | bulk::accumulate(this_group, first + range.first, first + range.second - 1, init, binary_op) : 77 | bulk::accumulate(this_group, first + range.first + 1, first + range.second, init, binary_op); 78 | 79 | if(this_group.this_exec.index() == 0) 80 | { 81 | result[this_group.index()] = sum; 82 | } // end if 83 | } // end operator() 84 | }; // end accumulate_tiles 85 | 86 | 87 | template 88 | RandomAccessIterator2 inclusive_scan(RandomAccessIterator1 first, RandomAccessIterator1 last, RandomAccessIterator2 result, T init, BinaryFunction binary_op) 89 | { 90 | typedef typename bulk::detail::scan_detail::scan_intermediate< 91 | RandomAccessIterator1, 92 | RandomAccessIterator2, 93 | BinaryFunction 94 | >::type intermediate_type; 95 | 96 | typedef typename thrust::iterator_difference::type Size; 97 | 98 | Size n = last - first; 99 | 100 | const Size threshold_of_parallelism = 20000; 101 | 102 | if(n < threshold_of_parallelism) 103 | { 104 | typedef bulk::detail::scan_detail::scan_buffer<512,3,RandomAccessIterator1,RandomAccessIterator2,BinaryFunction> heap_type; 105 | Size heap_size = sizeof(heap_type); 106 | bulk::async(bulk::con<512,3>(heap_size), inclusive_scan_n(), bulk::root, first, n, result, init, binary_op); 107 | } // end if 108 | else 109 | { 110 | // determined from empirical testing on k20c 111 | const int groupsize = sizeof(intermediate_type) <= sizeof(int) ? 128 : 256; 112 | const int grainsize = sizeof(intermediate_type) <= sizeof(int) ? 9 : 5; 113 | 114 | const Size tile_size = groupsize * grainsize; 115 | int num_tiles = (n + tile_size - 1) / tile_size; 116 | 117 | // 20 determined from empirical testing on k20c & GTX 480 118 | int subscription = 20; 119 | Size num_groups = thrust::min(subscription * bulk::concurrent_group<>::hardware_concurrency(), num_tiles); 120 | 121 | aligned_decomposition decomp(n, num_groups, tile_size); 122 | 123 | thrust::cuda::tag t; 124 | thrust::detail::temporary_array carries(t, num_groups); 125 | 126 | // Run the parallel raking reduce as an upsweep. 
127 | // n loads + num_groups stores 128 | Size heap_size = groupsize * sizeof(intermediate_type); 129 | bulk::async(bulk::grid(num_groups,heap_size), accumulate_tiles(), bulk::root.this_exec, first, decomp, carries.begin(), binary_op); 130 | 131 | // scan the sums to get the carries 132 | // num_groups loads + num_groups stores 133 | typedef bulk::detail::scan_detail::scan_buffer<256,3,RandomAccessIterator1,RandomAccessIterator2,BinaryFunction> heap_type2; 134 | heap_size = sizeof(heap_type2); 135 | bulk::async(bulk::con<256,3>(heap_size), exclusive_scan_n(), bulk::root, carries.begin(), num_groups, carries.begin(), init, binary_op); 136 | 137 | // do the downsweep - n loads, n stores 138 | typedef bulk::detail::scan_detail::scan_buffer< 139 | groupsize, 140 | grainsize, 141 | RandomAccessIterator1,RandomAccessIterator2,BinaryFunction 142 | > heap_type3; 143 | heap_size = sizeof(heap_type3); 144 | bulk::async(bulk::grid(num_groups,heap_size), inclusive_downsweep(), bulk::root.this_exec, first, decomp, carries.begin(), result, binary_op); 145 | } // end else 146 | 147 | return result + n; 148 | } // end inclusive_scan() 149 | 150 | 151 | template<typename T> 152 | void my_scan(thrust::device_vector<T> *data, T init) 153 | { 154 | ::inclusive_scan(data->begin(), data->end(), data->begin(), init, thrust::plus<T>()); 155 | } 156 | 157 | 158 | template<typename T> 159 | void validate(size_t n) 160 | { 161 | thrust::host_vector<T> h_input(n); 162 | thrust::fill(h_input.begin(), h_input.end(), 1); 163 | 164 | thrust::host_vector<T> h_result(n); 165 | 166 | T init = 13; 167 | 168 | thrust::inclusive_scan(h_input.begin(), h_input.end(), h_result.begin()); 169 | thrust::for_each(h_result.begin(), h_result.end(), thrust::placeholders::_1 += init); 170 | 171 | thrust::device_vector<T> d_input = h_input; 172 | thrust::device_vector<T> d_result(d_input.size()); 173 | 174 | ::inclusive_scan(d_input.begin(), d_input.end(), d_result.begin(), init, thrust::plus<T>()); 175 | 176 | cudaError_t error = cudaDeviceSynchronize(); 177 | 178 | if(error) 179 | { 180 | std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl; 181 | } 182 | 183 | assert(h_result == d_result); 184 | } 185 | 186 | 187 | template<typename T> 188 | void thrust_scan(thrust::device_vector<T> *data) 189 | { 190 | thrust::inclusive_scan(data->begin(), data->end(), data->begin()); 191 | } 192 | 193 | 194 | template<typename T> 195 | void compare(size_t n = 1 << 28) 196 | { 197 | thrust::device_vector<T> vec(n); 198 | 199 | thrust_scan(&vec); 200 | double thrust_msecs = time_invocation_cuda(50, thrust_scan<T>, &vec); 201 | 202 | my_scan(&vec, T(13)); 203 | double my_msecs = time_invocation_cuda(50, my_scan<T>, &vec, 13); 204 | 205 | std::cout << "N: " << n << std::endl; 206 | std::cout << " Thrust's time: " << thrust_msecs << " ms" << std::endl; 207 | std::cout << " My time: " << my_msecs << " ms" << std::endl; 208 | std::cout << " Performance relative to Thrust: " << thrust_msecs / my_msecs << std::endl; 209 | std::cout << std::endl; 210 | } 211 | 212 | 213 | 214 | int main() 215 | { 216 | for(size_t n = 1; n <= 1 << 20; n <<= 1) 217 | { 218 | std::cout << "Testing n = " << n << std::endl; 219 | validate<int>(n); 220 | } 221 | 222 | thrust::default_random_engine rng; 223 | for(int i = 0; i < 20; ++i) 224 | { 225 | size_t n = rng() % (1 << 20); 226 | 227 | std::cout << "Testing n = " << n << std::endl; 228 | validate<int>(n); 229 | } 230 | 231 | std::cout << "32b int:" << std::endl; 232 | for(int i = 0; i < 28; ++i) 233 | { 234 | compare<int>(1 << i); 235 | } 236 | 237 | std::cout << "64b float:" << std::endl; 238 | for(int i = 0; i < 28; ++i) 239 | { 240 | compare<double>(1 << i); 241 | } 242 | 243 | return 0; 244 | } 245 | 246 | --------------------------------------------------------------------------------
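The `inclusive_scan` above is really three launches: `accumulate_tiles` produces one sum per tile (the upsweep), `exclusive_scan_n` turns those tile sums into carries (the spine), and `inclusive_downsweep` rescans each tile seeded with its carry. Here is a minimal host-side model of that same decomposition; it is illustrative only, so the function name `inclusive_scan_by_tiles`, the tiny `tile_size`, and the `std::vector` stand-ins are assumptions rather than anything in scan.cu:

```
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Sequential model of the three phases used by ::inclusive_scan above.
// Each "kernel" becomes a loop over tiles, and std::vector stands in for device memory.
std::vector<int> inclusive_scan_by_tiles(const std::vector<int> &in, int init)
{
  const std::size_t tile_size = 4; // plays the role of groupsize * grainsize
  const std::size_t num_tiles = (in.size() + tile_size - 1) / tile_size;

  // phase 1 ("accumulate_tiles"): one sum per tile - n loads, num_tiles stores
  std::vector<int> carries(num_tiles, 0);
  for(std::size_t t = 0; t < num_tiles; ++t)
  {
    std::size_t tile_end = std::min(in.size(), (t + 1) * tile_size);
    for(std::size_t i = t * tile_size; i < tile_end; ++i)
    {
      carries[t] += in[i];
    }
  }

  // phase 2 ("exclusive_scan_n" on the spine): exclusive scan of the tile sums, seeded with init
  int running = init;
  for(std::size_t t = 0; t < num_tiles; ++t)
  {
    int tile_sum = carries[t];
    carries[t] = running;
    running += tile_sum;
  }

  // phase 3 ("inclusive_downsweep"): rescan each tile, carrying in its spine value
  std::vector<int> out(in.size());
  for(std::size_t t = 0; t < num_tiles; ++t)
  {
    int carry = carries[t];
    std::size_t tile_end = std::min(in.size(), (t + 1) * tile_size);
    for(std::size_t i = t * tile_size; i < tile_end; ++i)
    {
      carry += in[i];
      out[i] = carry;
    }
  }

  return out;
}

int main()
{
  std::vector<int> in(10, 1);
  std::vector<int> out = inclusive_scan_by_tiles(in, 13);
  assert(out.front() == 14 && out.back() == 23);
  return 0;
}
```

The spine is an exclusive scan seeded with `init`, which is exactly what lets the downsweep run each tile as an inclusive scan using its carry as the initial value.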
/sum.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | struct sum 7 | { 8 | __device__ 9 | void operator()(bulk::concurrent_group<> &g, thrust::device_ptr<int> data, thrust::device_ptr<int> result) 10 | { 11 | unsigned int n = g.size(); 12 | 13 | // allocate some special memory that the group can use for fast communication 14 | int *s_data = static_cast<int*>(bulk::malloc(g, n * sizeof(int))); 15 | 16 | // the whole group cooperatively copies the data 17 | bulk::copy_n(g, data, n, s_data); 18 | 19 | while(n > 1) 20 | { 21 | unsigned int half_n = n / 2; 22 | 23 | if(g.this_exec.index() < half_n) 24 | { 25 | s_data[g.this_exec.index()] += s_data[n - g.this_exec.index() - 1]; 26 | } 27 | 28 | // the group synchronizes after each update 29 | g.wait(); 30 | 31 | n -= half_n; 32 | } 33 | 34 | if(g.this_exec.index() == 0) 35 | { 36 | *result = s_data[0]; 37 | } 38 | 39 | // wait for agent 0 to store the result 40 | g.wait(); 41 | 42 | // free the memory cooperatively 43 | bulk::free(g, s_data); 44 | } 45 | }; 46 | 47 | int main() 48 | { 49 | size_t group_size = 512; 50 | 51 | size_t n = group_size; 52 | 53 | // [1, 1, 1, ... 1] - 512 of them 54 | thrust::device_vector<int> vec(n, 1); 55 | 56 | thrust::device_vector<int> result(1); 57 | 58 | using bulk::con; 59 | 60 | // let the runtime size the heap 61 | bulk::async(con(group_size), sum(), bulk::root, vec.data(), result.data()); 62 | 63 | assert(512 == result[0]); 64 | 65 | // size the heap ourselves 66 | size_t heap_size = group_size * sizeof(int); 67 | bulk::async(con(group_size, heap_size), sum(), bulk::root, vec.data(), result.data()); 68 | 69 | assert(512 == result[0]); 70 | } 71 | 72 | -------------------------------------------------------------------------------- /tail_flags.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2008-2013 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | 26 | template::type>, 28 | typename ValueType = bool, 29 | typename IndexType = typename thrust::iterator_difference::type> 30 | class tail_flags 31 | { 32 | // XXX WAR cudafe issue 33 | //private: 34 | public: 35 | struct tail_flag_functor 36 | { 37 | BinaryPredicate binary_pred; // this must be the first member for performance reasons 38 | IndexType n; 39 | 40 | typedef ValueType result_type; 41 | 42 | __host__ __device__ 43 | tail_flag_functor(IndexType n) 44 | : binary_pred(), n(n) 45 | {} 46 | 47 | __host__ __device__ 48 | tail_flag_functor(IndexType n, BinaryPredicate binary_pred) 49 | : binary_pred(binary_pred), n(n) 50 | {} 51 | 52 | template 53 | __host__ __device__ __thrust_forceinline__ 54 | result_type operator()(const Tuple &t) 55 | { 56 | const IndexType i = thrust::get<0>(t); 57 | 58 | // note that we do not dereference the tuple's 2nd element when i >= n 59 | // and therefore do not dereference a bad location at the boundary 60 | return (i == (n - 1) || !binary_pred(thrust::get<1>(t), thrust::get<2>(t))); 61 | } 62 | }; 63 | 64 | typedef thrust::counting_iterator counting_iterator; 65 | 66 | public: 67 | typedef thrust::transform_iterator< 68 | tail_flag_functor, 69 | thrust::zip_iterator > 70 | > iterator; 71 | 72 | #pragma hd_warning_disable 73 | __host__ __device__ 74 | tail_flags(RandomAccessIterator first, RandomAccessIterator last) 75 | : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator(0), first, first + 1)), 76 | tail_flag_functor(last - first))), 77 | m_end(m_begin + (last - first)) 78 | {} 79 | 80 | #pragma hd_warning_disable 81 | __host__ __device__ 82 | tail_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred) 83 | : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator(0), first, first + 1)), 84 | tail_flag_functor(last - first, binary_pred))), 85 | m_end(m_begin + (last - first)) 86 | {} 87 | 88 | __host__ __device__ 89 | iterator begin() const 90 | { 91 | return m_begin; 92 | } 93 | 94 | __host__ __device__ 95 | iterator end() const 96 | { 97 | return m_end; 98 | } 99 | 100 | template 101 | __host__ __device__ 102 | typename iterator::reference operator[](OtherIndex i) 103 | { 104 | return *(begin() + i); 105 | } 106 | 107 | private: 108 | iterator m_begin, m_end; 109 | }; 110 | 111 | 112 | template 113 | __host__ __device__ 114 | tail_flags 115 | make_tail_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred) 116 | { 117 | return tail_flags(first, last, binary_pred); 118 | } 119 | 120 | 121 | template 122 | __host__ __device__ 123 | tail_flags 124 | make_tail_flags(RandomAccessIterator first, RandomAccessIterator last) 125 | { 126 | return tail_flags(first, last); 127 | } 128 | 129 | 130 | -------------------------------------------------------------------------------- /time_invocation_cuda.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | template 6 | double time_invocation_cuda(std::size_t num_trials, Function f) 7 | { 8 | cudaEvent_t start, stop; 9 | cudaEventCreate(&start); 10 | cudaEventCreate(&stop); 11 | 12 | cudaEventRecord(start); 13 | for(std::size_t i = 0; 14 | i < num_trials; 15 | ++i) 16 | { 17 | f(); 18 | } 19 | cudaEventRecord(stop); 20 | 
cudaDeviceSynchronize(); 21 | 22 | float msecs = 0; 23 | cudaEventElapsedTime(&msecs, start, stop); 24 | 25 | cudaEventDestroy(start); 26 | cudaEventDestroy(stop); 27 | 28 | // return mean msecs 29 | return msecs / num_trials; 30 | } 31 | 32 | template<typename Function, typename Arg1> 33 | double time_invocation_cuda(std::size_t num_trials, Function f, Arg1 arg1) 34 | { 35 | cudaEvent_t start, stop; 36 | cudaEventCreate(&start); 37 | cudaEventCreate(&stop); 38 | 39 | cudaEventRecord(start); 40 | for(std::size_t i = 0; 41 | i < num_trials; 42 | ++i) 43 | { 44 | f(arg1); 45 | } 46 | cudaEventRecord(stop); 47 | cudaDeviceSynchronize(); 48 | 49 | float msecs = 0; 50 | cudaEventElapsedTime(&msecs, start, stop); 51 | 52 | cudaEventDestroy(start); 53 | cudaEventDestroy(stop); 54 | 55 | // return mean msecs 56 | return msecs / num_trials; 57 | } 58 | 59 | template<typename Function, typename Arg1, typename Arg2> 60 | double time_invocation_cuda(std::size_t num_trials, Function f, Arg1 arg1, Arg2 arg2) 61 | { 62 | cudaEvent_t start, stop; 63 | cudaEventCreate(&start); 64 | cudaEventCreate(&stop); 65 | 66 | cudaEventRecord(start); 67 | for(std::size_t i = 0; 68 | i < num_trials; 69 | ++i) 70 | { 71 | f(arg1,arg2); 72 | } 73 | cudaEventRecord(stop); 74 | cudaDeviceSynchronize(); 75 | 76 | float msecs = 0; 77 | cudaEventElapsedTime(&msecs, start, stop); 78 | 79 | cudaEventDestroy(start); 80 | cudaEventDestroy(stop); 81 | 82 | // return mean msecs 83 | return msecs / num_trials; 84 | } 85 | 86 | template<typename Function, typename Arg1, typename Arg2, typename Arg3> 87 | double time_invocation_cuda(std::size_t num_trials, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3) 88 | { 89 | cudaEvent_t start, stop; 90 | cudaEventCreate(&start); 91 | cudaEventCreate(&stop); 92 | 93 | cudaEventRecord(start); 94 | for(std::size_t i = 0; 95 | i < num_trials; 96 | ++i) 97 | { 98 | f(arg1,arg2,arg3); 99 | } 100 | cudaEventRecord(stop); 101 | cudaDeviceSynchronize(); 102 | 103 | float msecs = 0; 104 | cudaEventElapsedTime(&msecs, start, stop); 105 | 106 | cudaEventDestroy(start); 107 | cudaEventDestroy(stop); 108 | 109 | // return mean msecs 110 | return msecs / num_trials; 111 | } 112 | 113 | template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4> 114 | double time_invocation_cuda(std::size_t num_trials, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4) 115 | { 116 | cudaEvent_t start, stop; 117 | cudaEventCreate(&start); 118 | cudaEventCreate(&stop); 119 | 120 | cudaEventRecord(start); 121 | for(std::size_t i = 0; 122 | i < num_trials; 123 | ++i) 124 | { 125 | f(arg1,arg2,arg3,arg4); 126 | } 127 | cudaEventRecord(stop); 128 | cudaDeviceSynchronize(); 129 | 130 | float msecs = 0; 131 | cudaEventElapsedTime(&msecs, start, stop); 132 | 133 | cudaEventDestroy(start); 134 | cudaEventDestroy(stop); 135 | 136 | // return mean msecs 137 | return msecs / num_trials; 138 | } 139 | 140 | template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5> 141 | double time_invocation_cuda(std::size_t num_trials, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) 142 | { 143 | cudaEvent_t start, stop; 144 | cudaEventCreate(&start); 145 | cudaEventCreate(&stop); 146 | 147 | cudaEventRecord(start); 148 | for(std::size_t i = 0; 149 | i < num_trials; 150 | ++i) 151 | { 152 | f(arg1,arg2,arg3,arg4,arg5); 153 | } 154 | cudaEventRecord(stop); 155 | cudaDeviceSynchronize(); 156 | 157 | float msecs = 0; 158 | cudaEventElapsedTime(&msecs, start, stop); 159 | 160 | cudaEventDestroy(start); 161 | cudaEventDestroy(stop); 162 | 163 | // return mean msecs 164 | return msecs / num_trials; 165 | } 166 | 167 | template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6> 168 | double time_invocation_cuda(std::size_t num_trials, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) 169 | { 170 | 
cudaEvent_t start, stop; 171 | cudaEventCreate(&start); 172 | cudaEventCreate(&stop); 173 | 174 | cudaEventRecord(start); 175 | for(std::size_t i = 0; 176 | i < num_trials; 177 | ++i) 178 | { 179 | f(arg1,arg2,arg3,arg4,arg5,arg6); 180 | } 181 | cudaEventRecord(stop); 182 | cudaDeviceSynchronize(); 183 | 184 | float msecs = 0; 185 | cudaEventElapsedTime(&msecs, start, stop); 186 | 187 | cudaEventDestroy(start); 188 | cudaEventDestroy(stop); 189 | 190 | // return mean msecs 191 | return msecs / num_trials; 192 | } 193 | 194 | template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7> 195 | double time_invocation_cuda(std::size_t num_trials, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7) 196 | { 197 | cudaEvent_t start, stop; 198 | cudaEventCreate(&start); 199 | cudaEventCreate(&stop); 200 | 201 | cudaEventRecord(start); 202 | for(std::size_t i = 0; 203 | i < num_trials; 204 | ++i) 205 | { 206 | f(arg1,arg2,arg3,arg4,arg5,arg6,arg7); 207 | } 208 | cudaEventRecord(stop); 209 | cudaDeviceSynchronize(); 210 | 211 | float msecs = 0; 212 | cudaEventElapsedTime(&msecs, start, stop); 213 | 214 | cudaEventDestroy(start); 215 | cudaEventDestroy(stop); 216 | 217 | // return mean msecs 218 | return msecs / num_trials; 219 | } 220 | 221 | 222 | --------------------------------------------------------------------------------
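`tail_flags` above flags the last element of each run of equal consecutive values: position `i` is flagged when `i == n - 1` or when `binary_pred(first[i], first[i+1])` is false, and the flags are computed on the fly by a `transform_iterator`, so no flag array is ever stored. A small usage sketch (the key values and the `result` vector are illustrative; it assumes `tail_flags.hpp` and Thrust are on the include path):

```
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <cassert>
#include "tail_flags.hpp"

int main()
{
  // keys: 0 0 1 1 1 2
  int raw_keys[] = {0, 0, 1, 1, 1, 2};
  thrust::device_vector<int> keys(raw_keys, raw_keys + 6);

  // flag the last element of each run of equal keys
  tail_flags<thrust::device_vector<int>::iterator> flags(keys.begin(), keys.end());

  thrust::device_vector<bool> result(keys.size());
  thrust::copy(flags.begin(), flags.end(), result.begin());

  // expected flags: 0 1 0 0 1 1
  assert(result[0] == false && result[1] == true);
  assert(result[2] == false && result[3] == false);
  assert(result[4] == true  && result[5] == true);

  return 0;
}
```

This is the same trick head_flags.hpp plays at the other end of each run, and it is the kind of machinery segmented algorithms such as reduce_by_key use to find segment boundaries without a separate marking pass.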
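`time_invocation_cuda` brackets `num_trials` calls of `f` with a pair of CUDA events and returns the mean time per call in milliseconds, so per-call launch overhead is averaged across the whole loop. A usage sketch (the `add_vectors` workload and the problem size are made up for illustration; the warm-up-then-time pattern mirrors the `compare()` functions in reduce.cu and scan.cu above):

```
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <iostream>
#include "time_invocation_cuda.hpp"

// an illustrative workload: y <- x + y via thrust::transform
void add_vectors(thrust::device_vector<float> *x, thrust::device_vector<float> *y)
{
  thrust::transform(x->begin(), x->end(), y->begin(), y->begin(), thrust::plus<float>());
}

int main()
{
  thrust::device_vector<float> x(1 << 20, 1);
  thrust::device_vector<float> y(1 << 20, 1);

  // warm up once so one-time setup costs don't pollute the measurement,
  // then report the mean of 50 timed invocations
  add_vectors(&x, &y);
  double msecs = time_invocation_cuda(50, add_vectors, &x, &y);

  std::cout << "mean time per invocation: " << msecs << " ms" << std::endl;

  return 0;
}
```

Because the events are recorded once around the entire loop, even very short workloads remain measurable; the division by `num_trials` happens on the host after the device has been synchronized.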