├── .gitignore ├── LICENSE ├── README.md ├── REQUIREMENTS.txt ├── _config.yml ├── docs ├── convsdf │ ├── README.md │ └── diagram.png ├── convsp │ ├── README.md │ ├── conv_diagram.png │ └── kernel_diagram.png ├── imageprojection │ └── README.md ├── particlecollision │ └── README.md ├── particleprojection │ └── README.md └── reorderdata │ └── README.md ├── examples ├── convsp_example.py ├── fluid_sim.py └── tblogger.py ├── external └── cub-1.3.2 │ └── cub │ ├── block │ ├── block_discontinuity.cuh │ ├── block_exchange.cuh │ ├── block_histogram.cuh │ ├── block_load.cuh │ ├── block_radix_rank.cuh │ ├── block_radix_sort.cuh │ ├── block_raking_layout.cuh │ ├── block_reduce.cuh │ ├── block_scan.cuh │ ├── block_shift.cuh │ ├── block_store.cuh │ └── specializations │ │ ├── block_histogram_atomic.cuh │ │ ├── block_histogram_sort.cuh │ │ ├── block_reduce_raking.cuh │ │ ├── block_reduce_raking_commutative_only.cuh │ │ ├── block_reduce_warp_reductions.cuh │ │ ├── block_scan_raking.cuh │ │ └── block_scan_warp_scans.cuh │ ├── block_range │ ├── block_range_histo.cuh │ ├── block_range_radix_sort_downsweep.cuh │ ├── block_range_radix_sort_upsweep.cuh │ ├── block_range_reduce.cuh │ ├── block_range_reduce_by_key.cuh │ ├── block_range_scan.cuh │ ├── block_range_select.cuh │ ├── block_scan_prefix_operators.cuh │ └── specializations │ │ ├── block_range_histo_gatomic.cuh │ │ ├── block_range_histo_satomic.cuh │ │ └── block_range_histo_sort.cuh │ ├── cub.cuh │ ├── device │ ├── device_histogram.cuh │ ├── device_partition.cuh │ ├── device_radix_sort.cuh │ ├── device_reduce.cuh │ ├── device_scan.cuh │ ├── device_select.cuh │ └── dispatch │ │ ├── device_histogram_dispatch.cuh │ │ ├── device_radix_sort_dispatch.cuh │ │ ├── device_reduce_by_key_dispatch.cuh │ │ ├── device_reduce_dispatch.cuh │ │ ├── device_scan_dispatch.cuh │ │ └── device_select_dispatch.cuh │ ├── grid │ ├── grid_barrier.cuh │ ├── grid_even_share.cuh │ ├── grid_mapping.cuh │ └── grid_queue.cuh │ ├── host │ └── spinlock.cuh │ ├── iterator │ ├── arg_index_input_iterator.cuh │ ├── cache_modified_input_iterator.cuh │ ├── cache_modified_output_iterator.cuh │ ├── constant_input_iterator.cuh │ ├── counting_input_iterator.cuh │ ├── tex_obj_input_iterator.cuh │ ├── tex_ref_input_iterator.cuh │ └── transform_input_iterator.cuh │ ├── thread │ ├── thread_load.cuh │ ├── thread_operators.cuh │ ├── thread_reduce.cuh │ ├── thread_scan.cuh │ └── thread_store.cuh │ ├── util_allocator.cuh │ ├── util_arch.cuh │ ├── util_debug.cuh │ ├── util_device.cuh │ ├── util_macro.cuh │ ├── util_namespace.cuh │ ├── util_ptx.cuh │ ├── util_type.cuh │ └── warp │ ├── specializations │ ├── warp_reduce_shfl.cuh │ ├── warp_reduce_smem.cuh │ ├── warp_scan_shfl.cuh │ └── warp_scan_smem.cuh │ ├── warp_reduce.cuh │ └── warp_scan.cuh ├── python └── SmoothParticleNets │ ├── ImageProjection.py │ ├── ParticleCollision.py │ ├── ParticleProjection.py │ ├── __init__.py │ ├── convsdf.py │ ├── convsp.py │ ├── error_checking.py │ └── kernels.py ├── setup.py ├── src ├── common_funcs.h ├── constants.h ├── cpu_layer_funcs.cpp ├── cuda_layer_funcs.cpp ├── gpu_kernels.cu └── gpu_kernels.h └── tests ├── gradcheck.py ├── regular_grid_interpolater.py ├── test_convsdf.py ├── test_convsp.py ├── test_imageprojection.py ├── test_particlecollision.py └── test_particleprojection.py /.gitignore: -------------------------------------------------------------------------------- 1 | lib/gpu_kernels.cu.o 2 | test/__pycache__/test_f_grid.cpython-27-PYTEST.pyc 3 | test/__pycache__/test_particles2grid.cpython-27-PYTEST.pyc 4 | 
python/SmoothParticleNets/_ext/_ext.so 5 | *.pyc 6 | python/SmoothParticleNets/_ext/__ext.so 7 | .cache/ 8 | test/.cache/ 9 | test/pytest_args.py 10 | ._timings_n2_shared.csv 11 | src/kernel_constants.h 12 | build 13 | *.so 14 | *.egg-info 15 | tests/pytest_args.py 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 cschenck 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SmoothParticleNets 2 | 3 | Smooth Particle Networks (SmoothParticleNets or SPNets) is a set of custom PyTorch layers to facilitate computation with unordered particle sets. 4 | They were created for the purpose of enabling particle-based fluid dynamics inside a deep network, but the layers can be used for other purposes. 5 | Broadly, the layers enable computing particle-particle interactions, particle-object interactions, and projections onto and out of a camera image. 6 | The interface to this library is in Python. 7 | This library contains 6 layers, listed below. 8 | Note that this library provides only the basic functionality and no additional utilities, e.g., the library does not include a particle visualizer and the library does not include a tool for processing 3D object mesh files into signed distance fields. 9 | 10 | ## Layers 11 | 12 | Below is the list of each layer contained in this library. 13 | Clicking on the layer's name will take you to a description of what that layer does and how to use it. 14 | 15 | * [ConvSP](https://cschenck.github.io/SmoothParticleNets/docs/convsp) 16 | * [ConvSDF](https://cschenck.github.io/SmoothParticleNets/docs/convsdf) 17 | * [ImageProjection](https://cschenck.github.io/SmoothParticleNets/docs/imageprojection) 18 | * [ParticleProjection](https://cschenck.github.io/SmoothParticleNets/docs/particleprojection) 19 | * [ParticleCollision](https://cschenck.github.io/SmoothParticleNets/docs/particlecollision) 20 | * [ReorderData](https://cschenck.github.io/SmoothParticleNets/docs/reorderdata) 21 | 22 | ## Requirements 23 | 24 | This library only requires PyTorch as a dependency. 25 | The current version of the library has been tested to work with PyTorch 0.4.1. 
26 | Furthermore, this library only supports Python 3, and does not support Python 2. 27 | 28 | Note that this library was developed only under linux and may or may not run directly without modification on other platforms. 29 | Specifically, this library is confirmed to work on Ubuntu 18.04 with PyTorch 0.4.1, Cuda 10.0, and the 410 Nvidia drivers (although that should not matter). 30 | 31 | ## Installation 32 | 33 | To install this library, download the source from github. 34 | Once downloaded, enter the root directory of the source and run 35 | ```bash 36 | sudo python3 setup.py install 37 | ``` 38 | 39 | Once installed, in Python you should be able to call 'import SmoothParticleNets', which will import the library. 40 | 41 | ## Citation 42 | 43 | In published works please cite this as 44 | > C. Schenck and D. Fox, "SPNets: Differentiable Fluid Dynamics for Deep Neural Networks," in *Proceedings of the Second Conference on Robot Learning (CoRL),* Zurich, Switzerland, 2018. 45 | 46 | ```bibtex 47 | @inproceedings{spnets2018, 48 | title={SPNets: Differentiable Fluid Dynamics for Deep Neural Networks}, 49 | author={Schenck, C. and Fox, D.}, 50 | booktitle={Proceedings of the Second Conference on Robot Learning (CoRL)}, 51 | year={2018}, 52 | address={Zurich, Switzerland} 53 | } 54 | ``` 55 | -------------------------------------------------------------------------------- /REQUIREMENTS.txt: -------------------------------------------------------------------------------- 1 | torch 0.4.1 2 | torchvision 3 | CUDA 10 4 | nvidia drivers 410 5 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /docs/convsdf/README.md: -------------------------------------------------------------------------------- 1 | # ConvSDF 2 | 3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets) 4 | 5 | ## Description 6 | 7 | The ConvSDF layer is the other primary layer in addition to the ConvSP layer. 8 | ConvSDF stands for Signed Distance Field Convolution. 9 | The purpose of this layer is to enable particle-object interactions. 10 | The particles are represented as a list of coordinate locations. 11 | The objects are represented as signed distance fields (SDFs). 12 | SDFs are functions that take in a point in space relative to the object and return the signed distance to the closest point on the surface of the object, where the sign indicates if the query point is inside the object (negative) or outside (positive). 13 | For ConvSDF, this function is represented as a lookup table in the form of a grid. 14 | ConvSDF accepts a grid with the SDF values for each grid cell filled in, then performs linear interpolation when looking up the SDF value for a specific point. 15 | 16 | ConvSDF works as follows. 17 | ConvSDF operates on sets of query locations, but for simplicity the following describes a single query location. 18 | For a given query point, ConvSDF places a convolutional kernel around that point's location in space. 19 | Then it looks up the SDF values at the center of each of the kernel cells. 20 | This is then convolved with a set of weights in the same manner as a standard convolutional layer, the values are multiplied by a set of weights and then summed. 21 | The following diagram illustrates this process. 
22 | 23 | ![](diagram.png) 24 | 25 | The SDF field is shown as a heatmap, with the object boundary shown in black. 26 | The large red dot is the query location, with the smaller red dots showing the kernel cell centers. 27 | The output of ConvSDF is the convolved value for the given query location. 28 | 29 | The ConvSDF layer is given the pre-computed SDF grids; it does not compute grids from mesh files. 30 | That must be done externally. 31 | SmoothParticleNets does not include any tools to do this (although some can be found by searching online). 32 | This was done intentionally to reduce the dependencies that this library requires. 33 | Furthermore, for simplicity, ConvSDF assumes the origin of all the SDF grids is the bottom corner of the grid. 34 | Ensure that when generating SDF grids you note whether the origin in the mesh file differs from the bottom corner of the grid, and update all poses to take this into account. 35 | SDFs in 1D or in 4+D are not really well-defined, so for now ConvSDF only supports 2D or 3D. 36 | 37 | One common use case for ConvSDF is to compute when particles are inside objects and how to move them away from the object. 38 | This can be done by using ConvSDF to first compute which particles have a negative SDF value, and then by using another ConvSDF layer with fixed +1/-1 weights to compute numerical gradients. 39 | Multiplying the gradients by the distance yields the vector to move the particle by. 40 | 41 | ConvSDF is implemented as a subclass of torch.nn.Module. 42 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d). 43 | ConvSDF is implemented with gradients for the query locations and the object poses so that it can be used during a backward call. 44 | ConvSDF is implemented in native code with Cuda support, so it can be evaluated efficiently. 45 | 46 | ## Example 47 | 48 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches. 49 | ```python 50 | # Let's make a simple SDF grid. 51 | sdf = torch.Tensor([[0.7, 0.5, 0.5, 0.7], [0.5, -0.5, -0.5, 0.5], [-0.5, 0.5, 0.5, -0.5], [0.7, 0.5, 0.5, 0.7]]) 52 | # Construct a ConvSDF layer with 5 kernels. 53 | conv = ConvSDF(sdfs=[sdf], sdf_sizes=[1.0], out_channels=5, ndim=2, kernel_size=1, dilation=0.1, max_distance=1.0, with_params=True, compute_pose_grads=True) 54 | # Convolve at the particle locations. Put the object at the origin with no rotation. 55 | new_data = conv(locs, torch.Tensor([[0]]*locs.shape[0]), torch.Tensor([[0.0, 0.0, 0.0, 0.0]]*locs.shape[0]), torch.Tensor([[1.0]]*locs.shape[0])) 56 | ``` 57 | 58 | 59 | ## Documentation 60 | 61 | ConvSDF provides three functions: a constructor, SetSDFs, and forward. 62 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer). 63 | 64 | * ### ConvSDF(sdfs, sdf_sizes, out_channels, ndim, kernel_size, dilation, max_distance, with_params=True, compute_pose_grads=False): 65 | * Arguments 66 | * **sdfs**[list of torch.Tensor]: The pre-computed SDF grids for every object that may be encountered. During the forward call, specific objects can be selected. When there are multiple objects in a scene, the SDFs are combined using the MIN operator (e.g., when evaluating each at a specific query location, the SDF with the smallest value is used). Each value in the grids should be the distance to the surface of the object and negative iff it is inside the object.
67 | * **sdf_sizes**[list of float]: The size of one side of a grid cell for each SDF. The grid cells are assumed to be hypercubes. 68 | * **out_channels**[int]: Similar to standard convolutions, this is the number of convolutional kernels to create. The output is then a feature vector for each query location. Unlike ConvSP, the input is not an arbitrary feature vector but an SDF, so there is no corresponding in_channels argument. 69 | * **ndim**[int]: The dimensionality of the coordinate space. 70 | * **kernel_size**[int or tuple]: The size of the kernel. If a tuple, then len(kernel_size) == ndim must be True. If an integer, the same size is used for each dimension. Kernel sizes must be odd. 71 | * **dilation**[float or tuple]: The size of a kernel cell. If a tuple, then len(dilation) == ndim must be True. If a float, then the same size is used for each dimension. Unlike standard convolutions, where the size of a kernel cell is fixed by the size of a grid cell (e.g., size of a pixel), the unordered particle sets do not provide that structure, so this size must be specified. 72 | * **max_distance**[float]: When looking up the SDF value in an SDF grid, if it is larger than this value, this value is used instead. This is useful when query locations may fall outside of the pre-computed SDF grids. 73 | * **with_params**[boolean]: (optional) If True (default), the parameters of the layer (weights and bias) will be instantiated as torch.nn.Parameters so that they are treated as parameters by PyTorch's built-in operators. If False, then they are added as torch.autograd.Variables and will not be modified by PyTorch directly. This can be useful if desiring fixed (non-trainable) parameters or for debugging. 74 | * **compute_pose_grads**[boolean]: (optional) If False, will not compute gradients with respect to the poses of the objects during backpropagation. This can speed up the backward pass when these gradients are not desired. 75 | 76 | * ### SetSDFs(sdfs, sdf_sizes): 77 | * Arguments 78 | * **sdfs**[list of torch.Tensor]: The pre-computed SDF grids for every object that may be encountered. During the forward call, specific objects can be selected. When there are multiple objects in a scene, the SDFs are combined using the MIN operator (e.g., when evaluating each at a specific query location, the SDF with the smallest value is used). Each value in the grids should be the distance to the surface of the object and negative iff it is inside the object. 79 | * **sdf_sizes**[list of float]: The size of one side of a grid cell for each SDF. The grid cells are assumed to be hypercubes. 80 | 81 | * ### forward(locs, idxs, poses, scales): 82 | * Arguments 83 | * **locs**[BxNxD torch.autograd.Variable]: The batched list of query locations. D must match the ndim argument to the constructor. 84 | * **idxs**[BxM torch.autograd.Variable]: The indices of the objects to use, where M is the number of objects in the scene. The indices index into the sdfs passed into the constructor. Not every element in the batch needs to have M objects. Any element that has fewer than M objects may simply set the unused indices to -1. 85 | * **poses**[BxMxDD torch.autograd.Variable]: The pose of each object in the scene. The first D values are the translation, and the remaining values are the rotation. For 2D, the rotation is a single angle. For 3D, the rotation is a quaternion in xyzw format. Only 2D and 3D are supported. The origin for all objects is the lower corner of its SDF grid.
86 | * **scales**[BxM torch.autograd.Variable]: The scale for each object, where 0.5 shrinks the object by half and 2.0 doubles the size of the object. 87 | * Returns 88 | * **new_data**[BxMxG torch.autograd.Variable]: The result of the convolutions. G is the out_channels argument passed to the constructor. This is a new feature vector for each of the query locations. 89 | 90 | 91 | -------------------------------------------------------------------------------- /docs/convsdf/diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschenck/SmoothParticleNets/1bfde9bd6ce00dcb8750a48f49ce03f4400fb8cc/docs/convsdf/diagram.png -------------------------------------------------------------------------------- /docs/convsp/README.md: -------------------------------------------------------------------------------- 1 | # ConvSP 2 | 3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets) 4 | 5 | ## Description 6 | 7 | The ConvSP layer is the main workhorse layer of SmoothParticleNets. 8 | ConvSP stands for Smooth Particle Convolution. 9 | The ConvSP layer operates on unordered particle sets. 10 | Each particle has a feature vector associated with it, and the ConvSP performs a convolution on these features, similar to how a Conv2D layer performs a convolution on the channels of a feature image. 11 | However, unlike in a standard convolution on a grid, the features associated with each particle here create a continuous vector field across space. 12 | 13 | More formally, a set of particles represents a continuous vector field in space. 14 | That is, at every point in space it is possible to evaluate the features represented by the particle set. 15 | This is illustrated in the following diagram and equation. 16 | 17 | ![](kernel_diagram.png) 18 | 19 | Given an arbitrary query location (the red dot), the features of each nearby particle (x_j) are averaged together, weighted based on their distance to the query point using a kernel function W. 20 | 21 | This is then used to perform convolutions. 22 | Unlike in the standard convolution, here there isn't a well-defined grid to convolve on. 23 | Instead, the ConvSP layer convolves in free space. 24 | This is illustrated in the following diagram. 25 | 26 | ![](conv_diagram.png) 27 | 28 | In the above 2D case, the kernel used is 3x3. 29 | Given a query location (the large red dot), the kernel is placed on top of that location. 30 | Then the above field lookup equation is used to evaluate the continuous vector field at the center of each kernel cell (small red dots). 31 | The resulting values are then multiplied by kernel weights and summed in the same manner as a standard convolution. 32 | The key difference between ConvSP and a standard convolution is the use of the smoothing kernel average above to allow evaluating the kernel at any arbitrary point in space. 33 | 34 | 35 | ConvSP is implemented as a subclass of torch.nn.Module. 36 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d). 37 | ConvSP is implemented with gradients so that it can be used during a backward call. 38 | ConvSP is implemented in native code with Cuda support, so it can be evaluated efficiently. 39 | 40 | ## Example 41 | 42 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches and data is a tensor containing a feature vector for each particle.
43 | ```python 44 | # Create a ConvSP layer with 5 output channels, a kernel size of 3, a dilation of 0.05, and a radius of 0.1. 45 | conv = ConvSP(in_channels=data.shape[2], out_channels=5, ndim=locs.shape[2], kernel_size=3, dilation=0.05, radius=0.1, dis_norm=False, with_params=True, kernel_fn='spiky') 46 | # The ConvSP layer requires a ParticleCollision layer to generate the neighbor list. The radius of the neighbor list should be the maximum distance a neighbor of any kernel cell could be from the center of the kernel, which is radius + (kernel_size//2)*dilation. 47 | coll = ParticleCollision(ndim=locs.shape[2], radius=(0.1 + 0.05)) 48 | # ParticleCollision reorders locs and data. 49 | locs, data, idxs, neighbors = coll(locs, data) 50 | # Get the new features. We'll use the particle locations as the query locations, so we won't be passing anything for qlocs. 51 | new_data = conv(locs, data, neighbors) 52 | # new_data is still reordered according to the reordered locs, but we might want them in the original order. 53 | reorder = ReorderData(reverse=True) 54 | locs, new_data = reorder(idxs, locs, new_data) 55 | ``` 56 | 57 | 58 | ## Documentation 59 | 60 | ConvSP provides two functions: a constructor and forward. 61 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer). 62 | 63 | * ### ConvSP(in_channels, out_channels, ndim, kernel_size, dilation, radius, dis_norm=False, kernel_fn='default', with_params=True): 64 | * Arguments 65 | * **in_channels**[int]: The dimensionality of the feature vectors associated with each particle. 66 | * **out_channels**[int]: Similar to standard convolutions, this is the number of convolutional kernels to create. The output is then a feature vector for each query location. 67 | * **ndim**[int]: The dimensionality of the particle's coordinate space. 68 | * **kernel_size**[int or tuple]: The size of the kernel. If a tuple, then len(kernel_size) == ndim must be True. If an integer, the same size is used for each dimension. Kernel sizes must be odd. 69 | * **dilation**[float or tuple]: The size of a kernel cell. If a tuple, then len(dilation) == ndim must be True. If a float, then the same size is used for each dimension. Unlike standard convolutions, where the size of a kernel cell is fixed by the size of a grid cell (e.g., size of a pixel), the unordered particle sets do not provide that structure, so this size must be specified. 70 | * **radius**[float]: The radius to use when computing the smoothing kernel average. Only particles within this distance of the query location are used in the average. 71 | * **dis_norm**[boolean]: (optional) If true, the features in the smoothing kernel average will be divided by the distance from the query location to the particle. This normalization can be useful for some computations. 72 | * **kernel_fn**[string]: (optional) The kernel function to use in the smoothing kernel average. SmoothParticleNets provides many options for the kernel. Refer to kernels.py for a complete list. 73 | * **with_params**[boolean]: (optional) If True (default), the parameters of the layer (weights and bias) will be instantiated as torch.nn.Parameters so that they are treated as parameters by PyTorch's built-in operators. If False, then they are added as torch.autograd.Variables and will not be modified by PyTorch directly. This can be useful if desiring fixed (non-trainable) parameters or for debugging.
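
For intuition, the computation ConvSP performs at each query location (the smoothing-kernel average evaluated at every kernel cell center, followed by a weighted sum, as described above) can be sketched in plain PyTorch. This is only an illustrative, unbatched sketch under assumptions made here: the helper names, the simple polynomial kernel, and the brute-force distance computation are not part of the library. The actual layer runs in native CUDA, restricts the average to the neighbor list from ParticleCollision, and defines its kernel functions and their normalization in kernels.py.

```python
import itertools
import torch

def smooth_field(qpts, locs, data, radius, kernel_fn):
    # Kernel-weighted combination of particle features evaluated at query points
    # (the smoothing-kernel average from the Description; normalization depends on the kernel).
    # qpts: MxD, locs: NxD, data: NxK -> MxK. Particles beyond radius get zero weight.
    dist = torch.cdist(qpts, locs)                          # MxN pairwise distances
    w = kernel_fn(dist, radius) * (dist < radius).float()
    return w @ data

def convsp_sketch(qlocs, locs, data, weights, bias, kernel_size, dilation, radius, kernel_fn):
    # qlocs: MxD query locations; locs/data: the particle set and its features.
    # weights: out_channels x K x kernel_size**D, bias: out_channels (as in a standard conv).
    ndim, half = qlocs.shape[1], kernel_size // 2
    # Offsets from the query location to the center of each kernel cell.
    offsets = torch.tensor(list(itertools.product(range(-half, half + 1), repeat=ndim)),
                           dtype=qlocs.dtype) * dilation
    out = bias.repeat(qlocs.shape[0], 1)
    for c, off in enumerate(offsets):
        field = smooth_field(qlocs + off, locs, data, radius, kernel_fn)  # MxK field at this cell
        out = out + field @ weights[:, :, c].t()                          # multiply by weights and sum
    return out  # M x out_channels

# Usage with an assumed, simple polynomial kernel (illustrative only).
kernel = lambda d, h: torch.clamp(1.0 - (d / h) ** 2, min=0.0) ** 2
locs, data = torch.rand(100, 3), torch.rand(100, 4)
weights, bias = torch.randn(5, 4, 27), torch.zeros(5)
new_data = convsp_sketch(locs, locs, data, weights, bias,
                         kernel_size=3, dilation=0.05, radius=0.1, kernel_fn=kernel)
```

With kernel_size=1 this collapses to a single smoothing-kernel average per query location, which is also why the neighbor-list radius used with ParticleCollision in the example above must cover radius + (kernel_size//2)*dilation.
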
74 | 75 | * ### forward(locs, data, neighbors, qlocs=None): 76 | * Arguments 77 | * **locs**[BxNxD torch.autograd.Variable]: The batched list of particle locations. D must match the ndim argument to the constructor. 78 | * **data**[BxNxK torch.autograd.Variable]: The feature vectors associated with each particle. K must be the same as the in_channels argument to the constructor. 79 | * **neighbors**[BxMxF torch.autograd.Variable]: The pre-computed neighbor list for each query location. This can be generated using the ParticleCollision layer. This is necessary for evaluating the kernel smoothing average. 80 | * **qlocs**[BxMxD torch.autograd.Variable]: (optional) The set of locations to perform convolutions around. Usually this will be the same as the particle locations, but not always. If this argument is not provided, locs is used. 81 | * Returns 82 | * **new_data**[BxMxG torch.autograd.Variable]: The result of the convolutions. G is the out_channels argument passed to the constructor. This is a new feature vector for each of the query locations. 83 | 84 | -------------------------------------------------------------------------------- /docs/convsp/conv_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschenck/SmoothParticleNets/1bfde9bd6ce00dcb8750a48f49ce03f4400fb8cc/docs/convsp/conv_diagram.png -------------------------------------------------------------------------------- /docs/convsp/kernel_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschenck/SmoothParticleNets/1bfde9bd6ce00dcb8750a48f49ce03f4400fb8cc/docs/convsp/kernel_diagram.png -------------------------------------------------------------------------------- /docs/imageprojection/README.md: -------------------------------------------------------------------------------- 1 | # ImageProjection 2 | 3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets) 4 | 5 | ## Description 6 | 7 | The ImageProjection layer projects an image feature map onto a set of particles in the view frame of the camera. 8 | That is, given an image of C channels, it first projects each particle onto the image using given camera intrinsics (focal length, etc.) and extrinsics (pose). 9 | Then it uses bilinear interpolation between the 4 adjacent pixels to generate a feature vector for the given particle. 10 | The output is a C-length feature vector for each particle. 11 | The ImageProjection layer currently only supports 3D coordinate spaces. 12 | 13 | ImageProjection is implemented as a subclass of torch.nn.Module. 14 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d). 15 | ImageProjection can compute gradients with respect to the camera or particle poses and the image features, and is implemented with Cuda support for efficient computation. 16 | 17 | ## Example 18 | 19 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches and image is a [BxHxWxC] feature image. 20 | ```python 21 | # First create the ParticleProjection layer. 22 | proj = ImageProjection(camera_fl=540) 23 | # Setup the camera pose. 24 | camera_pose = torch.Tensor([0.0, 0.0, 0.0]) 25 | camera_rotation = torch.Tensor([0.0, 0.0, 0.0, 1.0]) 26 | new_data = proj(locs, image, camera_pose, camera_rotation) 27 | ``` 28 | 29 | 30 | ## Documentation 31 | 32 | ImageProjection provides two functions: a constructor and forward. 
33 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer). 34 | 35 | * ### ImageProjection(camera_fl): 36 | * Arguments 37 | * **camera_fl**[float]: The focal length of the camera. 38 | 39 | * ### forward(locs, image, camera_pose, camera_rot, depth_mask=None): 40 | * Arguments 41 | * **locs**[BxNx3 torch.autograd.Variable]: The batched list of particle locations. Only 3D particle loations are supported. 42 | * **image**[BxHxWxC torch.autograd.Variable]: The image to project onto the particles. H and W are the height and width, respectively, and C is the number of channels. 43 | * **camera_pose**[Bx3 torch.autograd.Variable]: The camera translation in the environment. 44 | * **camera_rot**[Bx4 torch.autograd.Variable]: The camera rotation in the environment, represented as a quaternion in xyzw format. 45 | * **depth_mask**[BxHxW torch.autograd.Variable]: (optional) If passed, this is used to mask particles that are obscured by obstructions in the environment. If the depth of a pixel is less than the depth of the particle, nothing is projected onto that particle. 46 | * Returns 47 | * **new_data**[BxNxC torch.autograd.Variable]: The set of features for each particle after projecting the image features onto them. 48 | 49 | 50 | -------------------------------------------------------------------------------- /docs/particlecollision/README.md: -------------------------------------------------------------------------------- 1 | # ParticleCollision 2 | 3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets) 4 | 5 | ## Description 6 | 7 | The ParticleCollision layer pre-computes neighbor lists (i.e., "colliding" particles) for each given particle. 8 | That is, given a list of particle positions and a fixed radius, this layer returns a short list for each particle with the index of all other particles that are within that radius of it. 9 | To do this, internally the ParticleCollision layer creates a hashgrid and performs lookups based on that grid. 10 | The resulting neighbor list is designed to be used by the ConvSP layer to compute particle-particle interactions. 11 | 12 | An important operation that this layer does alongside computing collisions is to reorder the particle list. 13 | The reordering places particles falling in the same grid cell in the hash grid next to each other in memory. 14 | By doing so, cache hits are increased dramatically during the computation of particle-particle interactions in ConvSP, resulting in a large speedup. 15 | Due to this reordering, the returned list of colliding neighbor indices are indices in the *reordered* list, not in the original. 16 | The standard use of this layer is to compute collisions, make as many calls to ConvSP as are desired, then use the ReorderData layer to return the particle list to its original order. 17 | It is important to emphasize that reordering the data according to the hash grid is critical for perfomance of the ConvSP layer. 18 | 19 | ParticleCollision is implemented as a subclass of torch.nn.Module. 20 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d). 21 | There are no gradients to compute for this layer, so it simply passes them through when calling backward. 22 | 23 | ## Example 24 | 25 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches and vel is a same size tensor containing the particle's velocities. 
26 | ```python 27 | coll = ParticleCollision(ndim, radius) 28 | # ParticleCollision reorders locs and vel. 29 | locs, vel, idxs, neighbors = coll(locs, vel) 30 | ``` 31 | 32 | 33 | ## Documentation 34 | 35 | ParticleCollision provides two functions: a constructor and forward. 36 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer). 37 | 38 | * ### ParticleCollision(ndim, radius, max_grid_dim=96, max_collisions=128, include_self=True): 39 | * Arguments 40 | * **ndim**[int]: The dimensionality of the particle's coordinate space. 41 | * **radius**[float]: The maximum distance a particle can be from another and still be colliding. 42 | * **max_grid_dim**[int]: (optional) The maximum size of the hash grid in any dimension. This is useful for limiting memory consumption in cases where the particles are very spread out relative to the collision radius. Particles that don't fall in the hash grid are placed in the cell closest to them. 43 | * **max_collisions**[int]: (optional) The maximum number of neighbors to return. The returned neighbor list for each particle will always be this length (although not necessarily entirely filled in), so selecting this parameter is a balance between memory consumption and ensuring all colliding particles are included. 44 | * **include_self**[boolean]: (optional) If True, the particle will be in its own list of neighbors. If False it will not be. 45 | 46 | * ### forward(locs, data=None, qlocs=None): 47 | * Arguments 48 | * **locs**[BxNxD torch.autograd.Variable]: The batched list of particle locations. D must match the ndim argument to the constructor. 49 | * **data**[BxNxK torch.autograd.Variable]: (optional) Additional data associated with each particle. This data is not used during the forward call; however, since the locs are reordered, any data associated with each particle must also be reordered. Technically this could also be accomplished instead by calling the ReorderData layer on the data after calling forward, but doing so here helps to prevent bugs when calling ConvSP with reordered locs but non-reordered data. 50 | * **qlocs**[BxMxD torch.autograd.Variable]: (optional) In the case where it is desired to compute collisions between two different particle sets, this is the second set. Rather than returning the neighbor list for particles in locs, if this argument is passed, the returned neighbor list is a list for each particle in qlocs of the indices of particles in locs (after reordering) that it collides with. 51 | * Returns 52 | * **locs**[BxNxD torch.autograd.Variable]: The reordered list of particle positions. 53 | * **data**[BxNxK torch.autograd.Variable]: (optional) If data was passed as an input, then the reordered data is returned. 54 | * **idxs**[BxN torch.autograd.Variable]: The index list for the reordered particle list. Each index value indicates the original index of that particle in the original locs, i.e., idxs[b, i] = j where i is the new index of the particle after reordering and j is its original index (b being the batch). 55 | * **neighbors**[Bx(N/M)xC torch.autograd.Variable]: The neighbor list for each particle. If qlocs was passed as an argument, then it is the neighbors of each particle in qlocs instead of locs. Each value indicates the index in locs (after reordering) of the neighboring particle. C is the value of max_collisions as passed to the constructor. Note that not all particles will have max_collisions neighbors.
In that event, the values in each particle's list are filled sequentially, with unfilled values in the list being set to -1. 56 | -------------------------------------------------------------------------------- /docs/particleprojection/README.md: -------------------------------------------------------------------------------- 1 | # ParticleProjection 2 | 3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets) 4 | 5 | ## Description 6 | 7 | The ParticleProjection layer is designed to allow comparison of the particle state with a camera image. 8 | It does this by projecting the particles onto a virtual camera image, which can then be compared to other camera images as desired. 9 | Each particle is projected onto the virtual image as a small Gaussian, which allows for smooth gradients with respect to the particle positions or camera pose. 10 | The layer computes the image coordinate of a given particle location using the pinhole camera model, not taking into account any distortions, e.g., radial distortion. 11 | ParticleProjection currently only supports 3D particle locations. 12 | 13 | ParticleProjection is implemented as a subclass of torch.nn.Module. 14 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d). 15 | ParticleProjection can compute gradients with respect to the camera or particle poses, and is implemented with Cuda support for efficient computation. 16 | 17 | ## Example 18 | 19 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches. 20 | ```python 21 | # First create the ParticleProjection layer. 22 | proj = ParticleProjection(camera_fl=540, camera_size=(480, 640), filter_std=5.0, filter_scale=10.0) 23 | # Setup the camera pose. 24 | camera_pose = torch.Tensor([0.0, 0.0, 0.0]) 25 | camera_rotation = torch.Tensor([0.0, 0.0, 0.0, 1.0]) 26 | image = proj(locs, camera_pose, camera_rotation) 27 | ``` 28 | 29 | 30 | ## Documentation 31 | 32 | ParticleProjection provides two functions: a constructor and forward. 33 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer). 34 | 35 | * ### ParticleProjection(camera_fl, camera_size, filter_std, filter_scale): 36 | * Arguments 37 | * **camera_fl**[float]: The focal length of the camera. 38 | * **camera_size**[tuple]: A tuple of the camera image height and width (in that order) in pixels. 39 | * **filter_std**[float]: The standard deviation (in pixels) of the Gaussian for each particle. The Gaussian will be added to all pixels within 2x of this to the particle's image coordinate. 40 | * **filter_scale**[float]: All values added to a pixel will be multiplied by this to allow control of the intensity of the Gaussians for each particle. This is equivalent to multiplying the output image by this value after the fact. 41 | 42 | * ### forward(locs, camera_pose, camera_rot, depth_mask=None): 43 | * Arguments 44 | * **locs**[BxNx3 torch.autograd.Variable]: The batched list of particle locations. Only 3D particle loations are supported. 45 | * **camera_pose**[Bx3 torch.autograd.Variable]: The camera translation in the environment. 46 | * **camera_rot**[Bx4 torch.autograd.Variable]: The camera rotation in the environment, represented as a quaternion in xyzw format. 47 | * **depth_mask**[BxHxW torch.autograd.Variable]: (optional) If passed, this is used to mask particles that are obscured by obstructions in the environment. 
If the depth of a pixel is less than the depth of the particle, the particle's contribution to that pixel is not added. H and W must match the camera image height and width passed to the constructor. 48 | * Returns 49 | * **image**[BxHxW torch.autograd.Variable]: The projected image. Particles appear as small Gaussians, and where particles overlap the Gaussians are added together. 50 | 51 | -------------------------------------------------------------------------------- /docs/reorderdata/README.md: -------------------------------------------------------------------------------- 1 | # ReorderData 2 | 3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets) 4 | 5 | ## Description 6 | 7 | The ReorderData layer is fairly simple. 8 | The layer reorders a given tensor based on a tensor containing the indices for the data in the first tensor. 9 | More formally, assume that DATA is a BxNxD tensor containing N D-dimensional data points (e.g., XYZ particle locations) over B batches. 10 | Let IDXS be a BxN tensor, where each IDXS[i, :] contains the numbers 0 to N-1 in some arbitrary order. 11 | This layer then returns DATA where the second dimension has been rearranged according to IDXS. 12 | This is equivalent to 13 | ```python 14 | DATA[i, :, :] = DATA[i, IDXS[i, :], :] 15 | ``` 16 | in PyTorch syntax; however, this layer is specialized for this specific kind of indexing, resulting in a faster implementation. 17 | This layer is designed as a helper layer for the ParticleCollision layer. 18 | 19 | ReorderData is implemented as a subclass of torch.nn.Module. 20 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d). 21 | Additionally, this layer computes gradients, so it can be used in a backward pass. 22 | 23 | ## Example 24 | 25 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches and *vel* is a same size tensor containing the particles' velocities. 26 | ```python 27 | # ReorderData is most commonly used in conjunction with ParticleCollision. 28 | coll = ParticleCollision(ndim, radius) 29 | # Set reverse=True. ParticleCollision calls ReorderData internally, so we want to undo that reordering when we're done. 30 | reorder = ReorderData(reverse=True) 31 | # ParticleCollision reorders locs and vel. 32 | locs, vel, idxs, neighbors = coll(locs, vel) 33 | # Perform desired operations with locs, vel, neighbors... 34 | # When we're done, return locs and vel to their original order using ReorderData. 35 | locs, vel = reorder(idxs, locs, vel) 36 | ``` 37 | 38 | 39 | ## Documentation 40 | 41 | ReorderData provides two functions: a constructor and forward. 42 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer). 43 | 44 | * ### ReorderData(reverse=True): 45 | * Arguments 46 | * **reverse**[boolean]: (optional) When False, behaves as normal, using the given indices to reorder the data. When True, this layer assumes that the given data was already reordered according to the given indices, and so reverses that process and returns the data to the original order. 47 | 48 | * ### forward(idxs, locs, data=None): 49 | * Arguments 50 | * **idxs**[BxN torch.autograd.Variable]: The list of indices to reorder the input by. 51 | * **locs**[BxNxD torch.autograd.Variable]: The main data to be reordered. It is called *locs* because ReorderData is primarily a helper for ParticleCollision, which reorders the locations of the particles.
52 | * **data**[BxNxK torch.autograd.Variable]: (optional) Additional data to reorder alongside locs. Calling forward with both locs and data is equivalent to calling it twice in a row with each individually. This argument is provided as a convenience. 53 | * Returns 54 | * **locs**[BxNxD torch.autograd.Variable]: A new tensor with the same values as in the locs argument reordered based in idxs. 55 | * **data**[BxNxK torch.autograd.Variable]: (optional) If the data argument is passed, then forward will return a pair of tensors, where the second has the same values as data but reordered according to idxs. -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/block/block_raking_layout.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #include "../util_macro.cuh" 38 | #include "../util_arch.cuh" 39 | #include "../util_namespace.cuh" 40 | 41 | /// Optional outer namespace(s) 42 | CUB_NS_PREFIX 43 | 44 | /// CUB namespace 45 | namespace cub { 46 | 47 | /** 48 | * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. 
![](raking.png) 49 | * \ingroup BlockModule 50 | * 51 | * \par Overview 52 | * This type facilitates a shared memory usage pattern where a block of CUDA 53 | * threads places elements into shared memory and then reduces the active 54 | * parallelism to one "raking" warp of threads for serially aggregating consecutive 55 | * sequences of shared items. Padding is inserted to eliminate bank conflicts 56 | * (for most data types). 57 | * 58 | * \tparam T The data type to be exchanged. 59 | * \tparam BLOCK_THREADS The thread block size in threads. 60 | * \tparam PTX_ARCH [optional] \ptxversion 61 | */ 62 | template < 63 | typename T, 64 | int BLOCK_THREADS, 65 | int PTX_ARCH = CUB_PTX_ARCH> 66 | struct BlockRakingLayout 67 | { 68 | //--------------------------------------------------------------------- 69 | // Constants and type definitions 70 | //--------------------------------------------------------------------- 71 | 72 | enum 73 | { 74 | /// The total number of elements that need to be cooperatively reduced 75 | SHARED_ELEMENTS = BLOCK_THREADS, 76 | 77 | /// Maximum number of warp-synchronous raking threads 78 | MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), 79 | 80 | /// Number of raking elements per warp-synchronous raking thread (rounded up) 81 | SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, 82 | 83 | /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) 84 | RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, 85 | 86 | /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) 87 | HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), 88 | 89 | /// Degree of bank conflicts (e.g., 4-way) 90 | CONFLICT_DEGREE = (HAS_CONFLICTS) ? 91 | (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : 92 | 1, 93 | 94 | /// Pad each segment length with one element if degree of bank conflicts is greater than 4-way (heuristic) 95 | SEGMENT_PADDING = (CONFLICT_DEGREE > CUB_PREFER_CONFLICT_OVER_PADDING(PTX_ARCH)) ? 1 : 0, 96 | // SEGMENT_PADDING = (HAS_CONFLICTS) ? 
1 : 0, 97 | 98 | /// Total number of elements in the raking grid 99 | GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + SEGMENT_PADDING), 100 | 101 | /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) 102 | UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), 103 | }; 104 | 105 | 106 | /** 107 | * \brief Shared memory storage type 108 | */ 109 | typedef T _TempStorage[BlockRakingLayout::GRID_ELEMENTS]; 110 | 111 | /// Alias wrapper allowing storage to be unioned 112 | struct TempStorage : Uninitialized<_TempStorage> {}; 113 | 114 | 115 | /** 116 | * \brief Returns the location for the calling thread to place data into the grid 117 | */ 118 | static __device__ __forceinline__ T* PlacementPtr( 119 | TempStorage &temp_storage, 120 | int linear_tid) 121 | { 122 | // Offset for partial 123 | unsigned int offset = linear_tid; 124 | 125 | // Add in one padding element for every segment 126 | if (SEGMENT_PADDING > 0) 127 | { 128 | offset += offset / SEGMENT_LENGTH; 129 | } 130 | 131 | // Incorporating a block of padding partials every shared memory segment 132 | return temp_storage.Alias() + offset; 133 | } 134 | 135 | 136 | /** 137 | * \brief Returns the location for the calling thread to begin sequential raking 138 | */ 139 | static __device__ __forceinline__ T* RakingPtr( 140 | TempStorage &temp_storage, 141 | int linear_tid) 142 | { 143 | return temp_storage.Alias() + (linear_tid * (SEGMENT_LENGTH + SEGMENT_PADDING)); 144 | } 145 | }; 146 | 147 | } // CUB namespace 148 | CUB_NS_POSTFIX // Optional outer namespace(s) 149 | 150 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/block/specializations/block_histogram_atomic.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 47 | */ 48 | template 49 | struct BlockHistogramAtomic 50 | { 51 | /// Shared memory storage layout type 52 | struct TempStorage {}; 53 | 54 | 55 | /// Constructor 56 | __device__ __forceinline__ BlockHistogramAtomic( 57 | TempStorage &temp_storage) 58 | {} 59 | 60 | 61 | /// Composite data onto an existing histogram 62 | template < 63 | typename T, 64 | typename HistoCounter, 65 | int ITEMS_PER_THREAD> 66 | __device__ __forceinline__ void Composite( 67 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram 68 | HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram 69 | { 70 | // Update histogram 71 | #pragma unroll 72 | for (int i = 0; i < ITEMS_PER_THREAD; ++i) 73 | { 74 | atomicAdd(histogram + items[i], 1); 75 | } 76 | } 77 | 78 | }; 79 | 80 | } // CUB namespace 81 | CUB_NS_POSTFIX // Optional outer namespace(s) 82 | 83 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/block/specializations/block_histogram_sort.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../block/block_radix_sort.cuh" 37 | #include "../../block/block_discontinuity.cuh" 38 | #include "../../util_ptx.cuh" 39 | #include "../../util_namespace.cuh" 40 | 41 | /// Optional outer namespace(s) 42 | CUB_NS_PREFIX 43 | 44 | /// CUB namespace 45 | namespace cub { 46 | 47 | 48 | 49 | /** 50 | * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 51 | */ 52 | template < 53 | typename T, ///< Sample type 54 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension 55 | int ITEMS_PER_THREAD, ///< The number of samples per thread 56 | int BINS, ///< The number of bins into which histogram samples may fall 57 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension 58 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension 59 | int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective 60 | struct BlockHistogramSort 61 | { 62 | /// Constants 63 | enum 64 | { 65 | /// The thread block size in threads 66 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, 67 | }; 68 | 69 | // Parameterize BlockRadixSort type for our thread block 70 | typedef BlockRadixSort< 71 | T, 72 | BLOCK_DIM_X, 73 | ITEMS_PER_THREAD, 74 | NullType, 75 | 4, 76 | (PTX_ARCH >= 350) ? true : false, 77 | BLOCK_SCAN_WARP_SCANS, 78 | (PTX_ARCH >= 350) ? 
cudaSharedMemBankSizeEightByte : cudaSharedMemBankSizeFourByte, 79 | BLOCK_DIM_Y, 80 | BLOCK_DIM_Z, 81 | PTX_ARCH> 82 | BlockRadixSortT; 83 | 84 | // Parameterize BlockDiscontinuity type for our thread block 85 | typedef BlockDiscontinuity< 86 | T, 87 | BLOCK_DIM_X, 88 | BLOCK_DIM_Y, 89 | BLOCK_DIM_Z, 90 | PTX_ARCH> 91 | BlockDiscontinuityT; 92 | 93 | /// Shared memory 94 | union _TempStorage 95 | { 96 | // Storage for sorting bin values 97 | typename BlockRadixSortT::TempStorage sort; 98 | 99 | struct 100 | { 101 | // Storage for detecting discontinuities in the tile of sorted bin values 102 | typename BlockDiscontinuityT::TempStorage flag; 103 | 104 | // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values 105 | unsigned int run_begin[BINS]; 106 | unsigned int run_end[BINS]; 107 | }; 108 | }; 109 | 110 | 111 | /// Alias wrapper allowing storage to be unioned 112 | struct TempStorage : Uninitialized<_TempStorage> {}; 113 | 114 | 115 | // Thread fields 116 | _TempStorage &temp_storage; 117 | int linear_tid; 118 | 119 | 120 | /// Constructor 121 | __device__ __forceinline__ BlockHistogramSort( 122 | TempStorage &temp_storage) 123 | : 124 | temp_storage(temp_storage.Alias()), 125 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) 126 | {} 127 | 128 | 129 | // Discontinuity functor 130 | struct DiscontinuityOp 131 | { 132 | // Reference to temp_storage 133 | _TempStorage &temp_storage; 134 | 135 | // Constructor 136 | __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : 137 | temp_storage(temp_storage) 138 | {} 139 | 140 | // Discontinuity predicate 141 | __device__ __forceinline__ bool operator()(const T &a, const T &b, unsigned int b_index) 142 | { 143 | if (a != b) 144 | { 145 | // Note the begin/end offsets in shared storage 146 | temp_storage.run_begin[b] = b_index; 147 | temp_storage.run_end[a] = b_index; 148 | 149 | return true; 150 | } 151 | else 152 | { 153 | return false; 154 | } 155 | } 156 | }; 157 | 158 | 159 | // Composite data onto an existing histogram 160 | template < 161 | typename HistoCounter> 162 | __device__ __forceinline__ void Composite( 163 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram 164 | HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram 165 | { 166 | enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; 167 | 168 | // Sort bytes in blocked arrangement 169 | BlockRadixSortT(temp_storage.sort).Sort(items); 170 | 171 | __syncthreads(); 172 | 173 | // Initialize the shared memory's run_begin and run_end for each bin 174 | int histo_offset = 0; 175 | 176 | #pragma unroll 177 | for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) 178 | { 179 | temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; 180 | temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; 181 | } 182 | // Finish up with guarded initialization if necessary 183 | if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) 184 | { 185 | temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; 186 | temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; 187 | } 188 | 189 | __syncthreads(); 190 | 191 | int flags[ITEMS_PER_THREAD]; // unused 192 | 193 | // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile 194 | DiscontinuityOp flag_op(temp_storage); 195 | BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); 196 | 197 | // Update begin for first item 198 | if 
(linear_tid == 0) temp_storage.run_begin[items[0]] = 0; 199 | 200 | __syncthreads(); 201 | 202 | // Composite into histogram 203 | histo_offset = 0; 204 | 205 | #pragma unroll 206 | for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) 207 | { 208 | int thread_offset = histo_offset + linear_tid; 209 | HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; 210 | histogram[thread_offset] += count; 211 | } 212 | 213 | // Finish up with guarded composition if necessary 214 | if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) 215 | { 216 | int thread_offset = histo_offset + linear_tid; 217 | HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; 218 | histogram[thread_offset] += count; 219 | } 220 | } 221 | 222 | }; 223 | 224 | } // CUB namespace 225 | CUB_NS_POSTFIX // Optional outer namespace(s) 226 | 227 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/block/specializations/block_reduce_raking_commutative_only.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. 
32 | */ 33 | 34 | #pragma once 35 | 36 | #include "block_reduce_raking.cuh" 37 | #include "../../warp/warp_reduce.cuh" 38 | #include "../../thread/thread_reduce.cuh" 39 | #include "../../util_ptx.cuh" 40 | #include "../../util_namespace.cuh" 41 | 42 | /// Optional outer namespace(s) 43 | CUB_NS_PREFIX 44 | 45 | /// CUB namespace 46 | namespace cub { 47 | 48 | 49 | /** 50 | * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size. 51 | */ 52 | template < 53 | typename T, ///< Data type being reduced 54 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension 55 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension 56 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension 57 | int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective 58 | struct BlockReduceRakingCommutativeOnly 59 | { 60 | /// Constants 61 | enum 62 | { 63 | /// The thread block size in threads 64 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, 65 | }; 66 | 67 | // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values 68 | typedef BlockReduceRaking FallBack; 69 | 70 | /// Constants 71 | enum 72 | { 73 | /// Number of warp threads 74 | WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), 75 | 76 | /// Whether or not to use fall-back 77 | USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)), 78 | 79 | /// Number of raking threads 80 | RAKING_THREADS = WARP_THREADS, 81 | 82 | /// Number of threads actually sharing items with the raking threads 83 | SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS), 84 | 85 | /// Number of raking elements per warp synchronous raking thread 86 | SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS, 87 | }; 88 | 89 | /// WarpReduce utility type 90 | typedef WarpReduce WarpReduce; 91 | 92 | /// Layout type for padded thread block raking grid 93 | typedef BlockRakingLayout BlockRakingLayout; 94 | 95 | /// Shared memory storage layout type 96 | struct _TempStorage 97 | { 98 | union 99 | { 100 | struct 101 | { 102 | typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction 103 | typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid 104 | }; 105 | typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan 106 | }; 107 | }; 108 | 109 | 110 | /// Alias wrapper allowing storage to be unioned 111 | struct TempStorage : Uninitialized<_TempStorage> {}; 112 | 113 | 114 | // Thread fields 115 | _TempStorage &temp_storage; 116 | int linear_tid; 117 | 118 | 119 | /// Constructor 120 | __device__ __forceinline__ BlockReduceRakingCommutativeOnly( 121 | TempStorage &temp_storage) 122 | : 123 | temp_storage(temp_storage.Alias()), 124 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) 125 | {} 126 | 127 | 128 | /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
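// Illustrative sketch (not part of the original CUB source): the raking
// specializations in this header back the public cub::BlockReduce collective.
// A minimal kernel built on that public API (assuming a 128-thread block;
// names such as SumKernel, d_in and d_out are hypothetical) might look like:
//
//   #include <cub/block/block_reduce.cuh>
//
//   __global__ void SumKernel(const int *d_in, int *d_out)
//   {
//       typedef cub::BlockReduce<int, 128> BlockReduceT;
//       __shared__ typename BlockReduceT::TempStorage temp_storage;
//
//       int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];
//       int block_sum   = BlockReduceT(temp_storage).Sum(thread_data);
//
//       if (threadIdx.x == 0)
//           d_out[blockIdx.x] = block_sum;   // the aggregate is only valid in thread0
//   }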
129 | template 130 | __device__ __forceinline__ T Sum( 131 | T partial, ///< [in] Calling thread's input partial reductions 132 | int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 133 | { 134 | if (USE_FALLBACK || !FULL_TILE) 135 | { 136 | return FallBack(temp_storage.fallback_storage).template Sum(partial, num_valid); 137 | } 138 | else 139 | { 140 | // Place partial into shared memory grid 141 | if (linear_tid >= RAKING_THREADS) 142 | *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; 143 | 144 | __syncthreads(); 145 | 146 | // Reduce parallelism to one warp 147 | if (linear_tid < RAKING_THREADS) 148 | { 149 | // Raking reduction in grid 150 | T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); 151 | partial = ThreadReduce(raking_segment, cub::Sum(), partial); 152 | 153 | // Warpscan 154 | partial = WarpReduce(temp_storage.warp_storage).Sum(partial); 155 | } 156 | } 157 | 158 | return partial; 159 | } 160 | 161 | 162 | /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 163 | template < 164 | bool FULL_TILE, 165 | typename ReductionOp> 166 | __device__ __forceinline__ T Reduce( 167 | T partial, ///< [in] Calling thread's input partial reductions 168 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 169 | ReductionOp reduction_op) ///< [in] Binary reduction operator 170 | { 171 | if (USE_FALLBACK || !FULL_TILE) 172 | { 173 | return FallBack(temp_storage.fallback_storage).template Reduce(partial, num_valid, reduction_op); 174 | } 175 | else 176 | { 177 | // Place partial into shared memory grid 178 | if (linear_tid >= RAKING_THREADS) 179 | *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; 180 | 181 | __syncthreads(); 182 | 183 | // Reduce parallelism to one warp 184 | if (linear_tid < RAKING_THREADS) 185 | { 186 | // Raking reduction in grid 187 | T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); 188 | partial = ThreadReduce(raking_segment, reduction_op, partial); 189 | 190 | // Warpscan 191 | partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op); 192 | } 193 | } 194 | 195 | return partial; 196 | } 197 | 198 | }; 199 | 200 | } // CUB namespace 201 | CUB_NS_POSTFIX // Optional outer namespace(s) 202 | 203 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/block_range/specializations/block_range_histo_gatomic.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | 38 | #include "../../util_type.cuh" 39 | #include "../../util_namespace.cuh" 40 | 41 | /// Optional outer namespace(s) 42 | CUB_NS_PREFIX 43 | 44 | /// CUB namespace 45 | namespace cub { 46 | 47 | 48 | 49 | /** 50 | * BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics 51 | */ 52 | template < 53 | typename BlockRangeHistogramPolicy, ///< Tuning policy 54 | int BINS, ///< Number of histogram bins per channel 55 | int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) 56 | int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed 57 | typename InputIterator, ///< The input iterator type \iterator. 
Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1] 58 | typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin 59 | typename Offset> ///< Signed integer type for global offsets 60 | struct BlockRangeHistogramGlobalAtomic 61 | { 62 | //--------------------------------------------------------------------- 63 | // Types and constants 64 | //--------------------------------------------------------------------- 65 | 66 | // Sample type 67 | typedef typename std::iterator_traits::value_type SampleT; 68 | 69 | // Constants 70 | enum 71 | { 72 | BLOCK_THREADS = BlockRangeHistogramPolicy::BLOCK_THREADS, 73 | ITEMS_PER_THREAD = BlockRangeHistogramPolicy::ITEMS_PER_THREAD, 74 | TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, 75 | TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, 76 | }; 77 | 78 | // Shared memory type required by this thread block 79 | typedef NullType TempStorage; 80 | 81 | 82 | //--------------------------------------------------------------------- 83 | // Per-thread fields 84 | //--------------------------------------------------------------------- 85 | 86 | /// Reference to output histograms 87 | HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; 88 | 89 | /// Input data to reduce 90 | InputIterator d_in; 91 | 92 | 93 | //--------------------------------------------------------------------- 94 | // Interface 95 | //--------------------------------------------------------------------- 96 | 97 | /** 98 | * Constructor 99 | */ 100 | __device__ __forceinline__ BlockRangeHistogramGlobalAtomic( 101 | TempStorage &temp_storage, ///< Reference to temp_storage 102 | InputIterator d_in, ///< Input data to reduce 103 | HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms 104 | : 105 | d_in(d_in), 106 | d_out_histograms(d_out_histograms) 107 | {} 108 | 109 | 110 | /** 111 | * Process a single tile of input 112 | */ 113 | template 114 | __device__ __forceinline__ void ConsumeTile( 115 | Offset block_offset, ///< The offset the tile to consume 116 | int valid_items = TILE_ITEMS) ///< The number of valid items in the tile 117 | { 118 | if (FULL_TILE) 119 | { 120 | // Full tile of samples to read and composite 121 | SampleT items[ITEMS_PER_THREAD][CHANNELS]; 122 | 123 | #pragma unroll 124 | for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) 125 | { 126 | #pragma unroll 127 | for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) 128 | { 129 | if (CHANNEL < ACTIVE_CHANNELS) 130 | { 131 | items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; 132 | } 133 | } 134 | } 135 | 136 | __threadfence_block(); 137 | 138 | #pragma unroll 139 | for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) 140 | { 141 | #pragma unroll 142 | for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) 143 | { 144 | if (CHANNEL < ACTIVE_CHANNELS) 145 | { 146 | atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1); 147 | } 148 | } 149 | } 150 | } 151 | else 152 | { 153 | // Only a partially-full tile of samples to read and composite 154 | int bounds = valid_items - (threadIdx.x * CHANNELS); 155 | 156 | #pragma unroll 157 | for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) 158 | { 159 | #pragma unroll 160 | for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) 161 | { 162 | if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) 163 | { 164 | SampleT item = 
d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; 165 | atomicAdd(d_out_histograms[CHANNEL] + item, 1); 166 | } 167 | } 168 | } 169 | 170 | } 171 | } 172 | 173 | 174 | /** 175 | * Aggregate results into output 176 | */ 177 | __device__ __forceinline__ void AggregateOutput() 178 | {} 179 | }; 180 | 181 | 182 | } // CUB namespace 183 | CUB_NS_POSTFIX // Optional outer namespace(s) 184 | 185 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/cub.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
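// Illustrative sketch (not part of the original CUB source): stripped of the
// channel and tile machinery, the global-atomic histogram strategy in
// block_range_histo_gatomic.cuh above reduces to one atomicAdd per sample into
// a zero-initialized device bin array (HistoGlobalAtomic and its parameters
// are hypothetical names):
//
//   __global__ void HistoGlobalAtomic(const unsigned char *d_samples,
//                                     int num_samples,
//                                     unsigned int *d_bins)   // 256 counters, zeroed
//   {
//       int i = blockIdx.x * blockDim.x + threadIdx.x;
//       if (i < num_samples)
//           atomicAdd(d_bins + d_samples[i], 1u);   // sample value doubles as bin index
//   }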
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * CUB umbrella include file 32 | */ 33 | 34 | #pragma once 35 | 36 | 37 | // Block 38 | #include "block/block_histogram.cuh" 39 | #include "block/block_discontinuity.cuh" 40 | #include "block/block_exchange.cuh" 41 | #include "block/block_load.cuh" 42 | #include "block/block_radix_rank.cuh" 43 | #include "block/block_radix_sort.cuh" 44 | #include "block/block_reduce.cuh" 45 | #include "block/block_scan.cuh" 46 | #include "block/block_store.cuh" 47 | #include "block/block_shift.cuh" 48 | 49 | // Device 50 | #include "device/device_histogram.cuh" 51 | #include "device/device_partition.cuh" 52 | #include "device/device_radix_sort.cuh" 53 | #include "device/device_reduce.cuh" 54 | #include "device/device_scan.cuh" 55 | #include "device/device_select.cuh" 56 | 57 | // Grid 58 | //#include "grid/grid_barrier.cuh" 59 | #include "grid/grid_even_share.cuh" 60 | #include "grid/grid_mapping.cuh" 61 | #include "grid/grid_queue.cuh" 62 | 63 | // Host 64 | #include "host/spinlock.cuh" 65 | 66 | // Thread 67 | #include "thread/thread_load.cuh" 68 | #include "thread/thread_operators.cuh" 69 | #include "thread/thread_reduce.cuh" 70 | #include "thread/thread_scan.cuh" 71 | #include "thread/thread_store.cuh" 72 | 73 | // Warp 74 | #include "warp/warp_reduce.cuh" 75 | #include "warp/warp_scan.cuh" 76 | 77 | // Iterator 78 | #include "iterator/arg_index_input_iterator.cuh" 79 | #include "iterator/cache_modified_input_iterator.cuh" 80 | #include "iterator/cache_modified_output_iterator.cuh" 81 | #include "iterator/constant_input_iterator.cuh" 82 | #include "iterator/counting_input_iterator.cuh" 83 | #include "iterator/tex_obj_input_iterator.cuh" 84 | #include "iterator/tex_ref_input_iterator.cuh" 85 | #include "iterator/transform_input_iterator.cuh" 86 | 87 | // Util 88 | #include "util_allocator.cuh" 89 | #include "util_arch.cuh" 90 | #include "util_debug.cuh" 91 | #include "util_device.cuh" 92 | #include "util_macro.cuh" 93 | #include "util_ptx.cuh" 94 | #include "util_type.cuh" 95 | 96 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/grid/grid_barrier.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_debug.cuh" 37 | #include "../util_namespace.cuh" 38 | #include "../thread/thread_load.cuh" 39 | 40 | /// Optional outer namespace(s) 41 | CUB_NS_PREFIX 42 | 43 | /// CUB namespace 44 | namespace cub { 45 | 46 | 47 | /** 48 | * \addtogroup GridModule 49 | * @{ 50 | */ 51 | 52 | 53 | /** 54 | * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid 55 | */ 56 | class GridBarrier 57 | { 58 | protected : 59 | 60 | typedef unsigned int SyncFlag; 61 | 62 | // Counters in global device memory 63 | SyncFlag* d_sync; 64 | 65 | public: 66 | 67 | /** 68 | * Constructor 69 | */ 70 | GridBarrier() : d_sync(NULL) {} 71 | 72 | 73 | /** 74 | * Synchronize 75 | */ 76 | __device__ __forceinline__ void Sync() const 77 | { 78 | volatile SyncFlag *d_vol_sync = d_sync; 79 | 80 | // Threadfence and syncthreads to make sure global writes are visible before 81 | // thread-0 reports in with its sync counter 82 | __threadfence(); 83 | __syncthreads(); 84 | 85 | if (blockIdx.x == 0) 86 | { 87 | // Report in ourselves 88 | if (threadIdx.x == 0) 89 | { 90 | d_vol_sync[blockIdx.x] = 1; 91 | } 92 | 93 | __syncthreads(); 94 | 95 | // Wait for everyone else to report in 96 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 97 | { 98 | while (ThreadLoad(d_sync + peer_block) == 0) 99 | { 100 | __threadfence_block(); 101 | } 102 | } 103 | 104 | __syncthreads(); 105 | 106 | // Let everyone know it's safe to proceed 107 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 108 | { 109 | d_vol_sync[peer_block] = 0; 110 | } 111 | } 112 | else 113 | { 114 | if (threadIdx.x == 0) 115 | { 116 | // Report in 117 | d_vol_sync[blockIdx.x] = 1; 118 | 119 | // Wait for acknowledgment 120 | while (ThreadLoad(d_sync + blockIdx.x) == 1) 121 | { 122 | __threadfence_block(); 123 | } 124 | } 125 | 126 | __syncthreads(); 127 | } 128 | } 129 | }; 130 | 131 | 132 | /** 133 | * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. 134 | * 135 | * Uses RAII for lifetime, i.e., device resources are reclaimed when 136 | * the destructor is called. 
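// Illustrative sketch (not part of the original CUB source): typical use pairs
// GridBarrierLifetime on the host with GridBarrier::Sync() in the kernel. The
// grid must be small enough that all thread blocks are co-resident, otherwise
// the software barrier cannot make progress (MyKernel, grid_size and
// block_size are hypothetical names):
//
//   __global__ void MyKernel(cub::GridBarrier barrier)
//   {
//       // ... phase 1: produce data visible to other blocks ...
//       barrier.Sync();          // all thread blocks rendezvous here
//       // ... phase 2: consume data produced by other blocks ...
//   }
//
//   cub::GridBarrierLifetime barrier;
//   barrier.Setup(grid_size);    // lazily allocates and zeroes the sync counters
//   MyKernel<<<grid_size, block_size>>>(barrier);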
137 | */ 138 | class GridBarrierLifetime : public GridBarrier 139 | { 140 | protected: 141 | 142 | // Number of bytes backed by d_sync 143 | size_t sync_bytes; 144 | 145 | public: 146 | 147 | /** 148 | * Constructor 149 | */ 150 | GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} 151 | 152 | 153 | /** 154 | * DeviceFrees and resets the progress counters 155 | */ 156 | cudaError_t HostReset() 157 | { 158 | cudaError_t retval = cudaSuccess; 159 | if (d_sync) 160 | { 161 | CubDebug(retval = cudaFree(d_sync)); 162 | d_sync = NULL; 163 | } 164 | sync_bytes = 0; 165 | return retval; 166 | } 167 | 168 | 169 | /** 170 | * Destructor 171 | */ 172 | virtual ~GridBarrierLifetime() 173 | { 174 | HostReset(); 175 | } 176 | 177 | 178 | /** 179 | * Sets up the progress counters for the next kernel launch (lazily 180 | * allocating and initializing them if necessary) 181 | */ 182 | cudaError_t Setup(int sweep_grid_size) 183 | { 184 | cudaError_t retval = cudaSuccess; 185 | do { 186 | size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); 187 | if (new_sync_bytes > sync_bytes) 188 | { 189 | if (d_sync) 190 | { 191 | if (CubDebug(retval = cudaFree(d_sync))) break; 192 | } 193 | 194 | sync_bytes = new_sync_bytes; 195 | 196 | // Allocate and initialize to zero 197 | if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; 198 | if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; 199 | } 200 | } while (0); 201 | 202 | return retval; 203 | } 204 | }; 205 | 206 | 207 | /** @} */ // end group GridModule 208 | 209 | } // CUB namespace 210 | CUB_NS_POSTFIX // Optional outer namespace(s) 211 | 212 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/grid/grid_even_share.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #include "../util_namespace.cuh" 38 | #include "../util_macro.cuh" 39 | 40 | /// Optional outer namespace(s) 41 | CUB_NS_PREFIX 42 | 43 | /// CUB namespace 44 | namespace cub { 45 | 46 | 47 | /** 48 | * \addtogroup GridModule 49 | * @{ 50 | */ 51 | 52 | 53 | /** 54 | * \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). 55 | * 56 | * \par Overview 57 | * GridEvenShare indicates which sections of input are to be mapped onto which threadblocks. 58 | * Threadblocks may receive one of three different amounts of work: "big", "normal", 59 | * and "last". The "big" workloads are one scheduling grain larger than "normal". The "last" work unit 60 | * for the last threadblock may be partially-full if the input is not an even multiple of 61 | * the scheduling grain size. 62 | * 63 | * \par 64 | * Before invoking a child grid, a parent thread will typically construct an instance of 65 | * GridEvenShare. The instance can be passed to child threadblocks which can 66 | * initialize their per-threadblock offsets using \p BlockInit(). 67 | * 68 | * \tparam Offset Signed integer type for global offsets 69 | */ 70 | template 71 | struct GridEvenShare 72 | { 73 | Offset total_grains; 74 | int big_blocks; 75 | Offset big_share; 76 | Offset normal_share; 77 | Offset normal_base_offset; 78 | 79 | /// Total number of input items 80 | Offset num_items; 81 | 82 | /// Grid size in threadblocks 83 | int grid_size; 84 | 85 | /// Offset into input marking the beginning of the owning thread block's segment of input tiles 86 | Offset block_offset; 87 | 88 | /// Offset into input of marking the end (one-past) of the owning thread block's segment of input tiles 89 | Offset block_end; 90 | 91 | /** 92 | * \brief Default constructor. Zero-initializes block-specific fields. 93 | */ 94 | __host__ __device__ __forceinline__ GridEvenShare() : 95 | num_items(0), 96 | grid_size(0), 97 | block_offset(0), 98 | block_end(0) {} 99 | 100 | /** 101 | * \brief Constructor. Initializes the grid-specific members \p num_items and \p grid_size. To be called prior prior to kernel launch) 102 | */ 103 | __host__ __device__ __forceinline__ GridEvenShare( 104 | Offset num_items, ///< Total number of input items 105 | int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) 106 | int schedule_granularity) ///< Granularity by which the input can be parcelled into and distributed among threablocks. 
Usually the thread block's native tile size (or a multiple thereof. 107 | { 108 | this->num_items = num_items; 109 | this->block_offset = num_items; 110 | this->block_end = num_items; 111 | this->total_grains = (num_items + schedule_granularity - 1) / schedule_granularity; 112 | this->grid_size = CUB_MIN(total_grains, max_grid_size); 113 | Offset grains_per_block = total_grains / grid_size; 114 | this->big_blocks = total_grains - (grains_per_block * grid_size); // leftover grains go to big blocks 115 | this->normal_share = grains_per_block * schedule_granularity; 116 | this->normal_base_offset = big_blocks * schedule_granularity; 117 | this->big_share = normal_share + schedule_granularity; 118 | } 119 | 120 | 121 | 122 | /** 123 | * \brief Initializes ranges for the specified partition index 124 | */ 125 | __device__ __forceinline__ void Init(int partition_id) 126 | { 127 | if (partition_id < big_blocks) 128 | { 129 | // This threadblock gets a big share of grains (grains_per_block + 1) 130 | block_offset = (partition_id * big_share); 131 | block_end = block_offset + big_share; 132 | } 133 | else if (partition_id < total_grains) 134 | { 135 | // This threadblock gets a normal share of grains (grains_per_block) 136 | block_offset = normal_base_offset + (partition_id * normal_share); 137 | block_end = CUB_MIN(num_items, block_offset + normal_share); 138 | } 139 | } 140 | 141 | 142 | /** 143 | * \brief Initializes ranges for the current thread block (e.g., to be called by each threadblock after startup) 144 | */ 145 | __device__ __forceinline__ void BlockInit() 146 | { 147 | Init(blockIdx.x); 148 | } 149 | 150 | 151 | /** 152 | * Print to stdout 153 | */ 154 | __host__ __device__ __forceinline__ void Print() 155 | { 156 | printf( 157 | #if (CUB_PTX_ARCH > 0) 158 | "\tthreadblock(%d) " 159 | "block_offset(%lu) " 160 | "block_end(%lu) " 161 | #endif 162 | "num_items(%lu) " 163 | "total_grains(%lu) " 164 | "big_blocks(%lu) " 165 | "big_share(%lu) " 166 | "normal_share(%lu)\n", 167 | #if (CUB_PTX_ARCH > 0) 168 | blockIdx.x, 169 | (unsigned long) block_offset, 170 | (unsigned long) block_end, 171 | #endif 172 | (unsigned long) num_items, 173 | (unsigned long) total_grains, 174 | (unsigned long) big_blocks, 175 | (unsigned long) big_share, 176 | (unsigned long) normal_share); 177 | } 178 | }; 179 | 180 | 181 | 182 | /** @} */ // end group GridModule 183 | 184 | } // CUB namespace 185 | CUB_NS_POSTFIX // Optional outer namespace(s) 186 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/grid/grid_mapping.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \addtogroup GridModule 47 | * @{ 48 | */ 49 | 50 | 51 | /****************************************************************************** 52 | * Mapping policies 53 | *****************************************************************************/ 54 | 55 | 56 | /** 57 | * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 58 | */ 59 | enum GridMappingStrategy 60 | { 61 | /** 62 | * \brief An "even-share" strategy for assigning input tiles to thread blocks. 63 | * 64 | * \par Overview 65 | * The input is evenly partitioned into \p p segments, where \p p is 66 | * constant and corresponds loosely to the number of thread blocks that may 67 | * actively reside on the target device. Each segment is comprised of 68 | * consecutive tiles, where a tile is a small, constant-sized unit of input 69 | * to be processed to completion before the thread block terminates or 70 | * obtains more work. The kernel invokes \p p thread blocks, each 71 | * of which iteratively consumes a segment of n/p elements 72 | * in tile-size increments. 73 | */ 74 | GRID_MAPPING_EVEN_SHARE, 75 | 76 | /** 77 | * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. 78 | * 79 | * \par Overview 80 | * The input is treated as a queue to be dynamically consumed by a grid of 81 | * thread blocks. Work is atomically dequeued in tiles, where a tile is a 82 | * unit of input to be processed to completion before the thread block 83 | * terminates or obtains more work. The grid size \p p is constant, 84 | * loosely corresponding to the number of thread blocks that may actively 85 | * reside on the target device. 
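// Worked example (illustrative, not part of the original CUB source), for
// contrast with this dynamic strategy: the even-share math implemented by
// GridEvenShare above, with num_items = 1000, schedule_granularity = 128 and
// max_grid_size = 3, gives
//
//   total_grains     = ceil(1000 / 128)  = 8
//   grid_size        = min(8, 3)         = 3
//   grains per block = 8 / 3             = 2, leaving 8 - 2*3 = 2 leftover grains
//
// so two "big" blocks get 3 grains (384 items) and one "normal" block gets
// 2 grains, and Init() yields the ranges [0, 384), [384, 768) and
// [768, 1000), the last one clamped to num_items.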
86 | */ 87 | GRID_MAPPING_DYNAMIC, 88 | }; 89 | 90 | 91 | /** @} */ // end group GridModule 92 | 93 | } // CUB namespace 94 | CUB_NS_POSTFIX // Optional outer namespace(s) 95 | 96 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/grid/grid_queue.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridQueue is a descriptor utility for dynamic queue management. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | #include "../util_debug.cuh" 38 | 39 | /// Optional outer namespace(s) 40 | CUB_NS_PREFIX 41 | 42 | /// CUB namespace 43 | namespace cub { 44 | 45 | 46 | /** 47 | * \addtogroup GridModule 48 | * @{ 49 | */ 50 | 51 | 52 | /** 53 | * \brief GridQueue is a descriptor utility for dynamic queue management. 54 | * 55 | * \par Overview 56 | * GridQueue descriptors provides abstractions for "filling" or 57 | * "draining" globally-shared vectors. 58 | * 59 | * \par 60 | * A "filling" GridQueue works by atomically-adding to a zero-initialized counter, 61 | * returning a unique offset for the calling thread to write its items. 62 | * The GridQueue maintains the total "fill-size". The fill counter must be reset 63 | * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that 64 | * will be filling. 65 | * 66 | * \par 67 | * Similarly, a "draining" GridQueue works by works by atomically-incrementing a 68 | * zero-initialized counter, returning a unique offset for the calling thread to 69 | * read its items. 
Threads can safely drain until the array's logical fill-size is 70 | * exceeded. The drain counter must be reset using GridQueue::ResetDrain or 71 | * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that 72 | * will be filling. (For dynamic work distribution of existing data, the corresponding fill-size 73 | * is simply the number of elements in the array.) 74 | * 75 | * \par 76 | * Iterative work management can be implemented simply with a pair of flip-flopping 77 | * work buffers, each with an associated set of fill and drain GridQueue descriptors. 78 | * 79 | * \tparam Offset Signed integer type for global offsets 80 | */ 81 | template 82 | class GridQueue 83 | { 84 | private: 85 | 86 | /// Counter indices 87 | enum 88 | { 89 | FILL = 0, 90 | DRAIN = 1, 91 | }; 92 | 93 | /// Pair of counters 94 | Offset *d_counters; 95 | 96 | public: 97 | 98 | /// Returns the device allocation size in bytes needed to construct a GridQueue instance 99 | __host__ __device__ __forceinline__ 100 | static size_t AllocationSize() 101 | { 102 | return sizeof(Offset) * 2; 103 | } 104 | 105 | 106 | /// Constructs an invalid GridQueue descriptor 107 | __host__ __device__ __forceinline__ GridQueue() 108 | : 109 | d_counters(NULL) 110 | {} 111 | 112 | 113 | /// Constructs a GridQueue descriptor around the device storage allocation 114 | __host__ __device__ __forceinline__ GridQueue( 115 | void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize(). 116 | : 117 | d_counters((Offset*) d_storage) 118 | {} 119 | 120 | 121 | /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. 122 | __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain( 123 | Offset fill_size, 124 | cudaStream_t stream = 0) 125 | { 126 | #if (CUB_PTX_ARCH > 0) 127 | d_counters[FILL] = fill_size; 128 | d_counters[DRAIN] = 0; 129 | return cudaSuccess; 130 | #else 131 | Offset counters[2]; 132 | counters[FILL] = fill_size; 133 | counters[DRAIN] = 0; 134 | return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(Offset) * 2, cudaMemcpyHostToDevice, stream)); 135 | #endif 136 | } 137 | 138 | 139 | /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining. 140 | __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) 141 | { 142 | #if (CUB_PTX_ARCH > 0) 143 | d_counters[DRAIN] = 0; 144 | return cudaSuccess; 145 | #else 146 | return FillAndResetDrain(0, stream); 147 | #endif 148 | } 149 | 150 | 151 | /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. 152 | __host__ __device__ __forceinline__ cudaError_t ResetFill() 153 | { 154 | #if (CUB_PTX_ARCH > 0) 155 | d_counters[FILL] = 0; 156 | return cudaSuccess; 157 | #else 158 | return CubDebug(cudaMemset(d_counters + FILL, 0, sizeof(Offset))); 159 | #endif 160 | } 161 | 162 | 163 | /// Returns the fill-size established by the parent or by the previous kernel. 
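// Illustrative sketch (not part of the original CUB source): a typical drain
// round trip using the methods of this class (d_storage, DrainKernel and
// tile_items are hypothetical names):
//
//   // Host: allocate the counter pair and publish the fill-size
//   void *d_storage;
//   cudaMalloc(&d_storage, cub::GridQueue<int>::AllocationSize());
//   cub::GridQueue<int> queue(d_storage);
//   queue.FillAndResetDrain(num_items);
//
//   // Device: thread0 of each block atomically dequeues a tile of work
//   __global__ void DrainKernel(cub::GridQueue<int> queue, int num_items, int tile_items)
//   {
//       __shared__ int tile_offset;
//       if (threadIdx.x == 0)
//           tile_offset = queue.Drain(tile_items);
//       __syncthreads();
//
//       if (tile_offset < num_items)
//       {
//           // process items in [tile_offset, min(num_items, tile_offset + tile_items))
//       }
//   }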
164 | __host__ __device__ __forceinline__ cudaError_t FillSize( 165 | Offset &fill_size, 166 | cudaStream_t stream = 0) 167 | { 168 | #if (CUB_PTX_ARCH > 0) 169 | fill_size = d_counters[FILL]; 170 | return cudaSuccess; 171 | #else 172 | return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(Offset), cudaMemcpyDeviceToHost, stream)); 173 | #endif 174 | } 175 | 176 | 177 | /// Drain num_items. Returns offset from which to read items. 178 | __device__ __forceinline__ Offset Drain(Offset num_items) 179 | { 180 | return atomicAdd(d_counters + DRAIN, num_items); 181 | } 182 | 183 | 184 | /// Fill num_items. Returns offset from which to write items. 185 | __device__ __forceinline__ Offset Fill(Offset num_items) 186 | { 187 | return atomicAdd(d_counters + FILL, num_items); 188 | } 189 | }; 190 | 191 | 192 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 193 | 194 | 195 | /** 196 | * Reset grid queue (call with 1 block of 1 thread) 197 | */ 198 | template 199 | __global__ void FillAndResetDrainKernel( 200 | GridQueue grid_queue, 201 | Offset num_items) 202 | { 203 | grid_queue.FillAndResetDrain(num_items); 204 | } 205 | 206 | 207 | 208 | #endif // DOXYGEN_SHOULD_SKIP_THIS 209 | 210 | 211 | /** @} */ // end group GridModule 212 | 213 | } // CUB namespace 214 | CUB_NS_POSTFIX // Optional outer namespace(s) 215 | 216 | 217 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/host/spinlock.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple x86/x64 atomic spinlock, portable across MS Windows (cl.exe) & Linux (g++) 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #if defined(_WIN32) || defined(_WIN64) 38 | #include 39 | #include 40 | #undef small // Windows is terrible for polluting macro namespace 41 | 42 | /** 43 | * Compiler read/write barrier 44 | */ 45 | #pragma intrinsic(_ReadWriteBarrier) 46 | 47 | #endif 48 | 49 | #include "../util_namespace.cuh" 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | 58 | #if defined(_MSC_VER) 59 | 60 | // Microsoft VC++ 61 | typedef long Spinlock; 62 | 63 | #else 64 | 65 | // GNU g++ 66 | typedef int Spinlock; 67 | 68 | /** 69 | * Compiler read/write barrier 70 | */ 71 | __forceinline__ void _ReadWriteBarrier() 72 | { 73 | __sync_synchronize(); 74 | } 75 | 76 | /** 77 | * Atomic exchange 78 | */ 79 | __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) 80 | { 81 | // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier 82 | _ReadWriteBarrier(); 83 | return __sync_lock_test_and_set(Target, Value); 84 | } 85 | 86 | /** 87 | * Pause instruction to prevent excess processor bus usage 88 | */ 89 | __forceinline__ void YieldProcessor() 90 | { 91 | #ifndef __arm__ 92 | asm volatile("pause\n": : :"memory"); 93 | #endif // __arm__ 94 | } 95 | 96 | #endif // defined(_MSC_VER) 97 | 98 | /** 99 | * Return when the specified spinlock has been acquired 100 | */ 101 | __forceinline__ void Lock(volatile Spinlock *lock) 102 | { 103 | while (1) 104 | { 105 | if (!_InterlockedExchange(lock, 1)) return; 106 | while (*lock) YieldProcessor(); 107 | } 108 | } 109 | 110 | 111 | /** 112 | * Release the specified spinlock 113 | */ 114 | __forceinline__ void Unlock(volatile Spinlock *lock) 115 | { 116 | _ReadWriteBarrier(); 117 | *lock = 0; 118 | } 119 | 120 | 121 | } // CUB namespace 122 | CUB_NS_POSTFIX // Optional outer namespace(s) 123 | 124 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/iterator/cache_modified_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
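// Illustrative sketch (not part of the original CUB source): the host-side
// spinlock defined in host/spinlock.cuh above is used by bracketing a critical
// section with Lock and Unlock on a zero-initialized flag (g_lock and
// ThreadSafeUpdate are hypothetical names):
//
//   static cub::Spinlock g_lock = 0;
//
//   void ThreadSafeUpdate()
//   {
//       cub::Lock(&g_lock);
//       // ... critical section ...
//       cub::Unlock(&g_lock);
//   }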
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | #if (THRUST_VERSION >= 100700) 45 | // This iterator is compatible with Thrust API 1.7 and newer 46 | #include 47 | #include 48 | #endif // THRUST_VERSION 49 | 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | 58 | 59 | /** 60 | * \addtogroup UtilIterator 61 | * @{ 62 | */ 63 | 64 | 65 | /** 66 | * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. 67 | * 68 | * \par Overview 69 | * - CacheModifiedInputIterator is a random-access input iterator that wraps a native 70 | * device pointer of type ValueType*. \p ValueType references are 71 | * made by reading \p ValueType values through loads modified by \p MODIFIER. 72 | * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", 73 | * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). 74 | * - Can be constructed, manipulated, and exchanged within and between host and device 75 | * functions, but can only be dereferenced within device functions. 76 | * - Compatible with Thrust API v1.7 or newer. 77 | * 78 | * \par Snippet 79 | * The code snippet below illustrates the use of \p CacheModifiedInputIterator to 80 | * dereference a device array of double using the "ldg" PTX load modifier 81 | * (i.e., load values through texture cache). 
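// A fully parameterized form of the snippet that follows (template arguments
// spelled out; illustrative, not verbatim from the original header) would read:
//
//   #include <cub/iterator/cache_modified_input_iterator.cuh>   // or <cub/cub.cuh>
//
//   // Declare, allocate, and initialize a device array
//   double *d_in;   // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
//
//   // Create an iterator wrapper that loads through the read-only data cache
//   cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> itr(d_in);
//
//   // Within device code:
//   printf("%f\n", itr[0]);   // 8.0
//   printf("%f\n", itr[6]);   // 9.0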
82 | * \par 83 | * \code 84 | * #include // or equivalently 85 | * 86 | * // Declare, allocate, and initialize a device array 87 | * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] 88 | * 89 | * // Create an iterator wrapper 90 | * cub::CacheModifiedInputIterator itr(d_in); 91 | * 92 | * // Within device code: 93 | * printf("%f\n", itr[0]); // 8.0 94 | * printf("%f\n", itr[1]); // 6.0 95 | * printf("%f\n", itr[6]); // 9.0 96 | * 97 | * \endcode 98 | * 99 | * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data 100 | * \tparam ValueType The value type of this iterator 101 | * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) 102 | */ 103 | template < 104 | CacheLoadModifier MODIFIER, 105 | typename ValueType, 106 | typename Offset = ptrdiff_t> 107 | class CacheModifiedInputIterator 108 | { 109 | public: 110 | 111 | // Required iterator traits 112 | typedef CacheModifiedInputIterator self_type; ///< My own type 113 | typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another 114 | typedef ValueType value_type; ///< The type of the element the iterator can point to 115 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 116 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 117 | 118 | #if (THRUST_VERSION >= 100700) 119 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 120 | typedef typename thrust::detail::iterator_facade_category< 121 | thrust::device_system_tag, 122 | thrust::random_access_traversal_tag, 123 | value_type, 124 | reference 125 | >::type iterator_category; ///< The iterator category 126 | #else 127 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 128 | #endif // THRUST_VERSION 129 | 130 | 131 | private: 132 | 133 | ValueType* ptr; 134 | 135 | public: 136 | 137 | /// Constructor 138 | __host__ __device__ __forceinline__ CacheModifiedInputIterator( 139 | ValueType* ptr) ///< Native pointer to wrap 140 | : 141 | ptr(ptr) 142 | {} 143 | 144 | /// Postfix increment 145 | __host__ __device__ __forceinline__ self_type operator++(int) 146 | { 147 | self_type retval = *this; 148 | ptr++; 149 | return retval; 150 | } 151 | 152 | /// Prefix increment 153 | __host__ __device__ __forceinline__ self_type operator++() 154 | { 155 | ptr++; 156 | return *this; 157 | } 158 | 159 | /// Indirection 160 | __host__ __device__ __forceinline__ reference operator*() const 161 | { 162 | return ThreadLoad(ptr); 163 | } 164 | 165 | /// Addition 166 | template 167 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 168 | { 169 | self_type retval(ptr + n); 170 | return retval; 171 | } 172 | 173 | /// Addition assignment 174 | template 175 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 176 | { 177 | ptr += n; 178 | return *this; 179 | } 180 | 181 | /// Subtraction 182 | template 183 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 184 | { 185 | self_type retval(ptr - n); 186 | return retval; 187 | } 188 | 189 | /// Subtraction assignment 190 | template 191 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 192 | { 193 | ptr -= n; 194 | return *this; 195 | } 196 | 197 | /// Distance 198 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 199 | { 200 | return ptr - other.ptr; 201 | } 202 | 
203 | /// Array subscript 204 | template 205 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 206 | { 207 | return ThreadLoad(ptr + n); 208 | } 209 | 210 | /// Structure dereference 211 | __host__ __device__ __forceinline__ pointer operator->() 212 | { 213 | return &ThreadLoad(ptr); 214 | } 215 | 216 | /// Equal to 217 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 218 | { 219 | return (ptr == rhs.ptr); 220 | } 221 | 222 | /// Not equal to 223 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 224 | { 225 | return (ptr != rhs.ptr); 226 | } 227 | 228 | /// ostream operator 229 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 230 | { 231 | return os; 232 | } 233 | }; 234 | 235 | 236 | 237 | /** @} */ // end group UtilIterator 238 | 239 | } // CUB namespace 240 | CUB_NS_POSTFIX // Optional outer namespace(s) 241 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/iterator/constant_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_namespace.cuh" 42 | 43 | #if (THRUST_VERSION >= 100700) 44 | // This iterator is compatible with Thrust API 1.7 and newer 45 | #include 46 | #include 47 | #endif // THRUST_VERSION 48 | 49 | 50 | /// Optional outer namespace(s) 51 | CUB_NS_PREFIX 52 | 53 | /// CUB namespace 54 | namespace cub { 55 | 56 | 57 | /** 58 | * \addtogroup UtilIterator 59 | * @{ 60 | */ 61 | 62 | 63 | /** 64 | * \brief A random-access input generator for dereferencing a sequence of homogeneous values 65 | * 66 | * \par Overview 67 | * - Read references to a ConstantInputIterator iterator always return the supplied constant 68 | * of type \p ValueType. 69 | * - Can be used with any data type. 70 | * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device 71 | * functions. 72 | * - Compatible with Thrust API v1.7 or newer. 73 | * 74 | * \par Snippet 75 | * The code snippet below illustrates the use of \p ConstantInputIterator to 76 | * dereference a sequence of homogeneous doubles. 77 | * \par 78 | * \code 79 | * #include // or equivalently 80 | * 81 | * cub::ConstantInputIterator itr(5.0); 82 | * 83 | * printf("%f\n", itr[0]); // 5.0 84 | * printf("%f\n", itr[1]); // 5.0 85 | * printf("%f\n", itr[2]); // 5.0 86 | * printf("%f\n", itr[50]); // 5.0 87 | * 88 | * \endcode 89 | * 90 | * \tparam ValueType The value type of this iterator 91 | * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) 92 | */ 93 | template < 94 | typename ValueType, 95 | typename Offset = ptrdiff_t> 96 | class ConstantInputIterator 97 | { 98 | public: 99 | 100 | // Required iterator traits 101 | typedef ConstantInputIterator self_type; ///< My own type 102 | typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another 103 | typedef ValueType value_type; ///< The type of the element the iterator can point to 104 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 105 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 106 | 107 | #if (THRUST_VERSION >= 100700) 108 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 109 | typedef typename thrust::detail::iterator_facade_category< 110 | thrust::any_system_tag, 111 | thrust::random_access_traversal_tag, 112 | value_type, 113 | reference 114 | >::type iterator_category; ///< The iterator category 115 | #else 116 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 117 | #endif // THRUST_VERSION 118 | 119 | private: 120 | 121 | ValueType val; 122 | Offset offset; 123 | #ifdef _WIN32 124 | Offset pad[CUB_MAX(1, (16 / sizeof(Offset) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) 125 | #endif 126 | 127 | public: 128 | 129 | /// Constructor 130 | __host__ __device__ __forceinline__ ConstantInputIterator( 131 | ValueType val, ///< Starting value for the iterator instance to report 132 | Offset offset = 0) ///< Base offset 133 | : 134 | val(val), 135 | offset(offset) 136 | {} 137 | 138 | /// Postfix increment 139 | __host__ __device__ __forceinline__ self_type 
operator++(int) 140 | { 141 | self_type retval = *this; 142 | offset++; 143 | return retval; 144 | } 145 | 146 | /// Prefix increment 147 | __host__ __device__ __forceinline__ self_type operator++() 148 | { 149 | offset++; 150 | return *this; 151 | } 152 | 153 | /// Indirection 154 | __host__ __device__ __forceinline__ reference operator*() const 155 | { 156 | return val; 157 | } 158 | 159 | /// Addition 160 | template 161 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 162 | { 163 | self_type retval(val, offset + n); 164 | return retval; 165 | } 166 | 167 | /// Addition assignment 168 | template 169 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 170 | { 171 | offset += n; 172 | return *this; 173 | } 174 | 175 | /// Subtraction 176 | template 177 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 178 | { 179 | self_type retval(val, offset - n); 180 | return retval; 181 | } 182 | 183 | /// Subtraction assignment 184 | template 185 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 186 | { 187 | offset -= n; 188 | return *this; 189 | } 190 | 191 | /// Distance 192 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 193 | { 194 | return offset - other.offset; 195 | } 196 | 197 | /// Array subscript 198 | template 199 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 200 | { 201 | return val; 202 | } 203 | 204 | /// Structure dereference 205 | __host__ __device__ __forceinline__ pointer operator->() 206 | { 207 | return &val; 208 | } 209 | 210 | /// Equal to 211 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 212 | { 213 | return (offset == rhs.offset) && ((val == rhs.val)); 214 | } 215 | 216 | /// Not equal to 217 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 218 | { 219 | return (offset != rhs.offset) || (val!= rhs.val); 220 | } 221 | 222 | /// ostream operator 223 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 224 | { 225 | os << "[" << itr.val << "," << itr.offset << "]"; 226 | return os; 227 | } 228 | 229 | }; 230 | 231 | 232 | /** @} */ // end group UtilIterator 233 | 234 | } // CUB namespace 235 | CUB_NS_POSTFIX // Optional outer namespace(s) 236 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/iterator/counting_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | #if (THRUST_VERSION >= 100700) 45 | // This iterator is compatible with Thrust API 1.7 and newer 46 | #include 47 | #include 48 | #endif // THRUST_VERSION 49 | 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | /** 58 | * \addtogroup UtilIterator 59 | * @{ 60 | */ 61 | 62 | /** 63 | * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. 64 | * 65 | * \par Overview 66 | * - After initializing a CountingInputIterator to a certain integer \p base, read references 67 | * at \p offset will return the value \p base + \p offset. 68 | * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device 69 | * functions. 70 | * - Compatible with Thrust API v1.7 or newer. 71 | * 72 | * \par Snippet 73 | * The code snippet below illustrates the use of \p CountingInputIterator to 74 | * dereference a sequence of incrementing integers. 
75 | * \par 76 | * \code 77 | * #include // or equivalently 78 | * 79 | * cub::CountingInputIterator itr(5); 80 | * 81 | * printf("%d\n", itr[0]); // 5 82 | * printf("%d\n", itr[1]); // 6 83 | * printf("%d\n", itr[2]); // 7 84 | * printf("%d\n", itr[50]); // 55 85 | * 86 | * \endcode 87 | * 88 | * \tparam ValueType The value type of this iterator 89 | * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) 90 | */ 91 | template < 92 | typename ValueType, 93 | typename Offset = ptrdiff_t> 94 | class CountingInputIterator 95 | { 96 | public: 97 | 98 | // Required iterator traits 99 | typedef CountingInputIterator self_type; ///< My own type 100 | typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another 101 | typedef ValueType value_type; ///< The type of the element the iterator can point to 102 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 103 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 104 | 105 | #if (THRUST_VERSION >= 100700) 106 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 107 | typedef typename thrust::detail::iterator_facade_category< 108 | thrust::any_system_tag, 109 | thrust::random_access_traversal_tag, 110 | value_type, 111 | reference 112 | >::type iterator_category; ///< The iterator category 113 | #else 114 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 115 | #endif // THRUST_VERSION 116 | 117 | private: 118 | 119 | ValueType val; 120 | 121 | public: 122 | 123 | /// Constructor 124 | __host__ __device__ __forceinline__ CountingInputIterator( 125 | const ValueType &val) ///< Starting value for the iterator instance to report 126 | : 127 | val(val) 128 | {} 129 | 130 | /// Postfix increment 131 | __host__ __device__ __forceinline__ self_type operator++(int) 132 | { 133 | self_type retval = *this; 134 | val++; 135 | return retval; 136 | } 137 | 138 | /// Prefix increment 139 | __host__ __device__ __forceinline__ self_type operator++() 140 | { 141 | val++; 142 | return *this; 143 | } 144 | 145 | /// Indirection 146 | __host__ __device__ __forceinline__ reference operator*() const 147 | { 148 | return val; 149 | } 150 | 151 | /// Addition 152 | template 153 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 154 | { 155 | self_type retval(val + n); 156 | return retval; 157 | } 158 | 159 | /// Addition assignment 160 | template 161 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 162 | { 163 | val += n; 164 | return *this; 165 | } 166 | 167 | /// Subtraction 168 | template 169 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 170 | { 171 | self_type retval(val - n); 172 | return retval; 173 | } 174 | 175 | /// Subtraction assignment 176 | template 177 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 178 | { 179 | val -= n; 180 | return *this; 181 | } 182 | 183 | /// Distance 184 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 185 | { 186 | return val - other.val; 187 | } 188 | 189 | /// Array subscript 190 | template 191 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 192 | { 193 | return val + n; 194 | } 195 | 196 | /// Structure dereference 197 | __host__ __device__ __forceinline__ pointer operator->() 198 | { 199 | return &val; 200 | } 201 | 202 | /// 
Equal to 203 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 204 | { 205 | return (val == rhs.val); 206 | } 207 | 208 | /// Not equal to 209 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 210 | { 211 | return (val != rhs.val); 212 | } 213 | 214 | /// ostream operator 215 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 216 | { 217 | os << "[" << itr.val << "]"; 218 | return os; 219 | } 220 | 221 | }; 222 | 223 | 224 | 225 | /** @} */ // end group UtilIterator 226 | 227 | } // CUB namespace 228 | CUB_NS_POSTFIX // Optional outer namespace(s) 229 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/thread/thread_operators.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple binary operator functor types 32 | */ 33 | 34 | /****************************************************************************** 35 | * Simple functor operators 36 | ******************************************************************************/ 37 | 38 | #pragma once 39 | 40 | #include "../util_macro.cuh" 41 | #include "../util_type.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | /// Optional outer namespace(s) 45 | CUB_NS_PREFIX 46 | 47 | /// CUB namespace 48 | namespace cub { 49 | 50 | 51 | /** 52 | * \addtogroup UtilModule 53 | * @{ 54 | */ 55 | 56 | /** 57 | * \brief Default equality functor 58 | */ 59 | struct Equality 60 | { 61 | /// Boolean equality operator, returns (a == b) 62 | template 63 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const 64 | { 65 | return a == b; 66 | } 67 | }; 68 | 69 | 70 | /** 71 | * \brief Default inequality functor 72 | */ 73 | struct Inequality 74 | { 75 | /// Boolean inequality operator, returns (a != b) 76 | template 77 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const 78 | { 79 | return a != b; 80 | } 81 | }; 82 | 83 | 84 | /** 85 | * \brief Inequality functor (wraps equality functor) 86 | */ 87 | template 88 | struct InequalityWrapper 89 | { 90 | /// Wrapped equality operator 91 | EqualityOp op; 92 | 93 | /// Constructor 94 | __host__ __device__ __forceinline__ 95 | InequalityWrapper(EqualityOp op) : op(op) {} 96 | 97 | /// Boolean inequality operator, returns (a != b) 98 | template 99 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const 100 | { 101 | return !op(a, b); 102 | } 103 | }; 104 | 105 | 106 | /** 107 | * \brief Default sum functor 108 | */ 109 | struct Sum 110 | { 111 | /// Boolean sum operator, returns a + b 112 | template 113 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const 114 | { 115 | return a + b; 116 | } 117 | }; 118 | 119 | 120 | /** 121 | * \brief Default max functor 122 | */ 123 | struct Max 124 | { 125 | /// Boolean max operator, returns (a > b) ? a : b 126 | template 127 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const 128 | { 129 | return CUB_MAX(a, b); 130 | } 131 | }; 132 | 133 | 134 | /** 135 | * \brief Arg max functor (keeps the value and offset of the first occurrence of the l item) 136 | */ 137 | struct ArgMax 138 | { 139 | /// Boolean max operator, preferring the item having the smaller offset in case of ties 140 | template 141 | __host__ __device__ __forceinline__ ItemOffsetPair operator()( 142 | const ItemOffsetPair &a, 143 | const ItemOffsetPair &b) const 144 | { 145 | if (a.value == b.value) 146 | return (b.offset < a.offset) ? b : a; 147 | 148 | return (b.value > a.value) ? b : a; 149 | } 150 | }; 151 | 152 | 153 | /** 154 | * \brief Default min functor 155 | */ 156 | struct Min 157 | { 158 | /// Boolean min operator, returns (a < b) ? 
a : b 159 | template 160 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const 161 | { 162 | return CUB_MIN(a, b); 163 | } 164 | }; 165 | 166 | 167 | /** 168 | * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item) 169 | */ 170 | struct ArgMin 171 | { 172 | /// Boolean min operator, preferring the item having the smaller offset in case of ties 173 | template 174 | __host__ __device__ __forceinline__ ItemOffsetPair operator()( 175 | const ItemOffsetPair &a, 176 | const ItemOffsetPair &b) const 177 | { 178 | if (a.value == b.value) 179 | return (b.offset < a.offset) ? b : a; 180 | 181 | return (b.value < a.value) ? b : a; 182 | } 183 | }; 184 | 185 | 186 | /** 187 | * \brief Default cast functor 188 | */ 189 | template 190 | struct Cast 191 | { 192 | /// Boolean max operator, returns (a > b) ? a : b 193 | template 194 | __host__ __device__ __forceinline__ B operator()(const A &a) const 195 | { 196 | return (B) a; 197 | } 198 | }; 199 | 200 | 201 | 202 | /** @} */ // end group UtilModule 203 | 204 | 205 | } // CUB namespace 206 | CUB_NS_POSTFIX // Optional outer namespace(s) 207 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/thread/thread_reduce.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential reduction over statically-sized array types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../thread/thread_operators.cuh" 37 | #include "../util_namespace.cuh" 38 | 39 | /// Optional outer namespace(s) 40 | CUB_NS_PREFIX 41 | 42 | /// CUB namespace 43 | namespace cub { 44 | 45 | /** 46 | * \addtogroup UtilModule 47 | * @{ 48 | */ 49 | 50 | /** 51 | * \name Sequential reduction over statically-sized array types 52 | * @{ 53 | */ 54 | 55 | 56 | template < 57 | int LENGTH, 58 | typename T, 59 | typename ReductionOp> 60 | __device__ __forceinline__ T ThreadReduce( 61 | T* input, ///< [in] Input array 62 | ReductionOp reduction_op, ///< [in] Binary reduction operator 63 | T prefix, ///< [in] Prefix to seed reduction with 64 | Int2Type length) 65 | { 66 | T addend = *input; 67 | prefix = reduction_op(prefix, addend); 68 | 69 | return ThreadReduce(input + 1, reduction_op, prefix, Int2Type()); 70 | } 71 | 72 | template < 73 | typename T, 74 | typename ReductionOp> 75 | __device__ __forceinline__ T ThreadReduce( 76 | T* input, ///< [in] Input array 77 | ReductionOp reduction_op, ///< [in] Binary reduction operator 78 | T prefix, ///< [in] Prefix to seed reduction with 79 | Int2Type<0> length) 80 | { 81 | return prefix; 82 | } 83 | 84 | 85 | /** 86 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. 87 | * 88 | * \tparam LENGTH Length of input array 89 | * \tparam T [inferred] The data type to be reduced. 90 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 91 | */ 92 | template < 93 | int LENGTH, 94 | typename T, 95 | typename ReductionOp> 96 | __device__ __forceinline__ T ThreadReduce( 97 | T* input, ///< [in] Input array 98 | ReductionOp reduction_op, ///< [in] Binary reduction operator 99 | T prefix) ///< [in] Prefix to seed reduction with 100 | { 101 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 102 | } 103 | 104 | 105 | /** 106 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. 107 | * 108 | * \tparam LENGTH Length of input array 109 | * \tparam T [inferred] The data type to be reduced. 110 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 111 | */ 112 | template < 113 | int LENGTH, 114 | typename T, 115 | typename ReductionOp> 116 | __device__ __forceinline__ T ThreadReduce( 117 | T* input, ///< [in] Input array 118 | ReductionOp reduction_op) ///< [in] Binary reduction operator 119 | { 120 | T prefix = input[0]; 121 | return ThreadReduce(input + 1, reduction_op, prefix); 122 | } 123 | 124 | 125 | /** 126 | * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. 127 | * 128 | * \tparam LENGTH [inferred] Length of \p input array 129 | * \tparam T [inferred] The data type to be reduced. 
130 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 131 | */ 132 | template < 133 | int LENGTH, 134 | typename T, 135 | typename ReductionOp> 136 | __device__ __forceinline__ T ThreadReduce( 137 | T (&input)[LENGTH], ///< [in] Input array 138 | ReductionOp reduction_op, ///< [in] Binary reduction operator 139 | T prefix) ///< [in] Prefix to seed reduction with 140 | { 141 | return ThreadReduce(input, reduction_op, prefix); 142 | } 143 | 144 | 145 | /** 146 | * \brief Serial reduction with the specified operator 147 | * 148 | * \tparam LENGTH [inferred] Length of \p input array 149 | * \tparam T [inferred] The data type to be reduced. 150 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 151 | */ 152 | template < 153 | int LENGTH, 154 | typename T, 155 | typename ReductionOp> 156 | __device__ __forceinline__ T ThreadReduce( 157 | T (&input)[LENGTH], ///< [in] Input array 158 | ReductionOp reduction_op) ///< [in] Binary reduction operator 159 | { 160 | return ThreadReduce((T*) input, reduction_op); 161 | } 162 | 163 | 164 | //@} end member group 165 | 166 | /** @} */ // end group UtilModule 167 | 168 | } // CUB namespace 169 | CUB_NS_POSTFIX // Optional outer namespace(s) 170 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/util_arch.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Static architectural properties by SM version. 
32 | */ 33 | 34 | #pragma once 35 | 36 | #include "util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \addtogroup UtilMgmt 47 | * @{ 48 | */ 49 | 50 | 51 | /// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). 52 | #ifndef __CUDA_ARCH__ 53 | #define CUB_PTX_ARCH 0 54 | #else 55 | #define CUB_PTX_ARCH __CUDA_ARCH__ 56 | #endif 57 | 58 | 59 | /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. 60 | #if (CUB_PTX_ARCH == 0) || defined(CUB_CDP) 61 | #define CUB_RUNTIME_ENABLED 62 | #define CUB_RUNTIME_FUNCTION __host__ __device__ 63 | #else 64 | #define CUB_RUNTIME_FUNCTION __host__ 65 | #endif 66 | 67 | 68 | 69 | /// Number of threads per warp (log) 70 | #define CUB_LOG_WARP_THREADS(arch) \ 71 | (5) 72 | 73 | /// Number of threads per warp 74 | #define CUB_WARP_THREADS(arch) \ 75 | (1 << CUB_LOG_WARP_THREADS(arch)) 76 | 77 | /// Number of smem banks (log) 78 | #define CUB_LOG_SMEM_BANKS(arch) \ 79 | ((arch >= 200) ? \ 80 | (5) : \ 81 | (4)) 82 | 83 | /// Number of smem banks 84 | #define CUB_SMEM_BANKS(arch) \ 85 | (1 << CUB_LOG_SMEM_BANKS(arch)) 86 | 87 | /// Number of bytes per smem bank 88 | #define CUB_SMEM_BANK_BYTES(arch) \ 89 | (4) 90 | 91 | /// Number of smem bytes provisioned per SM 92 | #define CUB_SMEM_BYTES(arch) \ 93 | ((arch >= 200) ? \ 94 | (48 * 1024) : \ 95 | (16 * 1024)) 96 | 97 | /// Smem allocation size in bytes 98 | #define CUB_SMEM_ALLOC_UNIT(arch) \ 99 | ((arch >= 300) ? \ 100 | (256) : \ 101 | ((arch >= 200) ? \ 102 | (128) : \ 103 | (512))) 104 | 105 | /// Whether or not the architecture allocates registers by block (or by warp) 106 | #define CUB_REGS_BY_BLOCK(arch) \ 107 | ((arch >= 200) ? \ 108 | (false) : \ 109 | (true)) 110 | 111 | /// Number of registers allocated at a time per block (or by warp) 112 | #define CUB_REG_ALLOC_UNIT(arch) \ 113 | ((arch >= 300) ? \ 114 | (256) : \ 115 | ((arch >= 200) ? \ 116 | (64) : \ 117 | ((arch >= 120) ? \ 118 | (512) : \ 119 | (256)))) 120 | 121 | /// Granularity of warps for which registers are allocated 122 | #define CUB_WARP_ALLOC_UNIT(arch) \ 123 | ((arch >= 300) ? \ 124 | (4) : \ 125 | (2)) 126 | 127 | /// Maximum number of threads per SM 128 | #define CUB_MAX_SM_THREADS(arch) \ 129 | ((arch >= 300) ? \ 130 | (2048) : \ 131 | ((arch >= 200) ? \ 132 | (1536) : \ 133 | ((arch >= 120) ? \ 134 | (1024) : \ 135 | (768)))) 136 | 137 | /// Maximum number of thread blocks per SM 138 | #define CUB_MAX_SM_BLOCKS(arch) \ 139 | ((arch >= 300) ? \ 140 | (16) : \ 141 | (8)) 142 | 143 | /// Maximum number of threads per thread block 144 | #define CUB_MAX_BLOCK_THREADS(arch) \ 145 | ((arch >= 200) ? \ 146 | (1024) : \ 147 | (512)) 148 | 149 | /// Maximum number of registers per SM 150 | #define CUB_MAX_SM_REGISTERS(arch) \ 151 | ((arch >= 300) ? \ 152 | (64 * 1024) : \ 153 | ((arch >= 200) ? \ 154 | (32 * 1024) : \ 155 | ((arch >= 120) ? \ 156 | (16 * 1024) : \ 157 | (8 * 1024)))) 158 | 159 | /// Oversubscription factor 160 | #define CUB_SUBSCRIPTION_FACTOR(arch) \ 161 | ((arch >= 300) ? \ 162 | (5) : \ 163 | ((arch >= 200) ? \ 164 | (3) : \ 165 | (10))) 166 | 167 | /// Prefer padding overhead vs X-way conflicts greater than this threshold 168 | #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ 169 | ((arch >= 300) ? 
\ 170 | (1) : \ 171 | (4)) 172 | 173 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 174 | 175 | #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) 176 | #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) 177 | #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) 178 | #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) 179 | #define CUB_PTX_SMEM_BANK_BYTES CUB_SMEM_BANK_BYTES(CUB_PTX_ARCH) 180 | #define CUB_PTX_SMEM_BYTES CUB_SMEM_BYTES(CUB_PTX_ARCH) 181 | #define CUB_PTX_SMEM_ALLOC_UNIT CUB_SMEM_ALLOC_UNIT(CUB_PTX_ARCH) 182 | #define CUB_PTX_REGS_BY_BLOCK CUB_REGS_BY_BLOCK(CUB_PTX_ARCH) 183 | #define CUB_PTX_REG_ALLOC_UNIT CUB_REG_ALLOC_UNIT(CUB_PTX_ARCH) 184 | #define CUB_PTX_WARP_ALLOC_UNIT CUB_WARP_ALLOC_UNIT(CUB_PTX_ARCH) 185 | #define CUB_PTX_MAX_SM_THREADS CUB_MAX_SM_THREADS(CUB_PTX_ARCH) 186 | #define CUB_PTX_MAX_SM_BLOCKS CUB_MAX_SM_BLOCKS(CUB_PTX_ARCH) 187 | #define CUB_PTX_MAX_BLOCK_THREADS CUB_MAX_BLOCK_THREADS(CUB_PTX_ARCH) 188 | #define CUB_PTX_MAX_SM_REGISTERS CUB_MAX_SM_REGISTERS(CUB_PTX_ARCH) 189 | #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) 190 | 191 | #endif // Do not document 192 | 193 | 194 | /** @} */ // end group UtilMgmt 195 | 196 | } // CUB namespace 197 | CUB_NS_POSTFIX // Optional outer namespace(s) 198 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/util_debug.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Error and event logging routines. 
32 | * 33 | * The following macros definitions are supported: 34 | * - \p CUB_LOG. Simple event messages are printed to \p stdout. 35 | */ 36 | 37 | #pragma once 38 | 39 | #include 40 | #include "util_namespace.cuh" 41 | #include "util_arch.cuh" 42 | 43 | /// Optional outer namespace(s) 44 | CUB_NS_PREFIX 45 | 46 | /// CUB namespace 47 | namespace cub { 48 | 49 | 50 | /** 51 | * \addtogroup UtilMgmt 52 | * @{ 53 | */ 54 | 55 | 56 | /// CUB error reporting macro (prints error messages to stderr) 57 | #if (defined(DEBUG) || defined(_DEBUG)) 58 | #define CUB_STDERR 59 | #endif 60 | 61 | 62 | 63 | /** 64 | * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. 65 | * 66 | * \return The CUDA error. 67 | */ 68 | __host__ __device__ __forceinline__ cudaError_t Debug( 69 | cudaError_t error, 70 | const char* filename, 71 | int line) 72 | { 73 | #ifdef CUB_STDERR 74 | if (error) 75 | { 76 | #if (CUB_PTX_ARCH == 0) 77 | fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); 78 | fflush(stderr); 79 | #elif (CUB_PTX_ARCH >= 200) 80 | printf("CUDA error %d [block %d, thread %d, %s, %d]\n", error, blockIdx.x, threadIdx.x, filename, line); 81 | #endif 82 | } 83 | #endif 84 | return error; 85 | } 86 | 87 | 88 | /** 89 | * \brief Debug macro 90 | */ 91 | #define CubDebug(e) cub::Debug((e), __FILE__, __LINE__) 92 | 93 | 94 | /** 95 | * \brief Debug macro with exit 96 | */ 97 | #define CubDebugExit(e) if (cub::Debug((e), __FILE__, __LINE__)) { exit(1); } 98 | 99 | 100 | /** 101 | * \brief Log macro for printf statements. 102 | */ 103 | #if (CUB_PTX_ARCH == 0) 104 | #define CubLog(format, ...) printf(format,__VA_ARGS__); 105 | #elif (CUB_PTX_ARCH >= 200) 106 | #define CubLog(format, ...) printf("[block %d, thread %d]: " format, blockIdx.x, threadIdx.x, __VA_ARGS__); 107 | #endif 108 | 109 | 110 | 111 | 112 | /** @} */ // end group UtilMgmt 113 | 114 | } // CUB namespace 115 | CUB_NS_POSTFIX // Optional outer namespace(s) 116 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/util_macro.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Common C/C++ macro utilities 31 | ******************************************************************************/ 32 | 33 | #pragma once 34 | 35 | #include "util_namespace.cuh" 36 | 37 | /// Optional outer namespace(s) 38 | CUB_NS_PREFIX 39 | 40 | /// CUB namespace 41 | namespace cub { 42 | 43 | 44 | /** 45 | * \addtogroup UtilModule 46 | * @{ 47 | */ 48 | 49 | /** 50 | * Align struct 51 | */ 52 | #if defined(_WIN32) || defined(_WIN64) 53 | #define CUB_ALIGN(bytes) __declspec(align(32)) 54 | #else 55 | #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) 56 | #endif 57 | 58 | /** 59 | * Select maximum(a, b) 60 | */ 61 | #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) 62 | 63 | /** 64 | * Select minimum(a, b) 65 | */ 66 | #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a)) 67 | 68 | /** 69 | * Quotient of x/y rounded down to nearest integer 70 | */ 71 | #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) 72 | 73 | /** 74 | * Quotient of x/y rounded up to nearest integer 75 | */ 76 | #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) 77 | 78 | /** 79 | * x rounded up to the nearest multiple of y 80 | */ 81 | #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) 82 | 83 | /** 84 | * x rounded down to the nearest multiple of y 85 | */ 86 | #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) 87 | 88 | /** 89 | * Return character string for given type 90 | */ 91 | #define CUB_TYPE_STRING(type) ""#type 92 | 93 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 94 | #define CUB_CAT_(a, b) a ## b 95 | #define CUB_CAT(a, b) CUB_CAT_(a, b) 96 | #endif // DOXYGEN_SHOULD_SKIP_THIS 97 | 98 | /** 99 | * Static assert 100 | */ 101 | #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] 102 | 103 | 104 | /** @} */ // end group UtilModule 105 | 106 | } // CUB namespace 107 | CUB_NS_POSTFIX // Optional outer namespace(s) 108 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/util_namespace.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Place-holder for prefixing the cub namespace 32 | */ 33 | 34 | #pragma once 35 | 36 | // For example: 37 | //#define CUB_NS_PREFIX namespace thrust{ namespace detail { 38 | //#define CUB_NS_POSTFIX } } 39 | 40 | #define CUB_NS_PREFIX 41 | #define CUB_NS_POSTFIX 42 | -------------------------------------------------------------------------------- /python/SmoothParticleNets/ImageProjection.py: -------------------------------------------------------------------------------- 1 | 2 | import numbers 3 | import numpy as np 4 | 5 | import torch 6 | import torch.autograd 7 | 8 | import _ext 9 | import _extc 10 | import error_checking as ec 11 | from kernels import KERNELS, KERNEL_NAMES 12 | 13 | MAX_FLOAT = float(np.finfo(np.float32).max) 14 | 15 | 16 | class ImageProjection(torch.nn.Module): 17 | """ 18 | """ 19 | 20 | def __init__(self, camera_fl): 21 | """ Initialize a ParticleProjection layer. 22 | TODO 23 | 24 | Arguments: 25 | -camera_fl: The camera focal length in pixels (all pixels are 26 | assumed to be square. This layer does not simulate 27 | any image warping e.g. radial distortion). 
28 | """ 29 | super(ImageProjection, self).__init__() 30 | 31 | self.camera_fl = ec.check_conditions(camera_fl, "camera_fl", 32 | "%s > 0", "isinstance(%s, numbers.Real)") 33 | 34 | self.register_buffer("empty_depth_mask", 35 | torch.ones(1, 1, 1)*MAX_FLOAT) 36 | 37 | def _rotationMatrixFromQuaternion(self, quat): 38 | """ 39 | 1 - 2*qy2 - 2*qz2 2*qx*qy - 2*qz*qw 2*qx*qz + 2*qy*qw 40 | 2*qx*qy + 2*qz*qw 1 - 2*qx2 - 2*qz2 2*qy*qz - 2*qx*qw 41 | 2*qx*qz - 2*qy*qw 2*qy*qz + 2*qx*qw 1 - 2*qx2 - 2*qy2 42 | """ 43 | quat = quat.data 44 | qx = quat[:, 0] 45 | qy = quat[:, 1] 46 | qz = quat[:, 2] 47 | qw = quat[:, 3] 48 | qx2 = qx*qx 49 | qxqy = qx*qy 50 | qxqz = qx*qz 51 | qxqw = qx*qw 52 | qy2 = qy*qy 53 | qyqz = qy*qz 54 | qyqw = qy*qw 55 | qz2 = qz*qz 56 | qzqw = qz*qw 57 | ret = quat.new(quat.size()[0], 3, 3) 58 | ret[:, 0, 0] = 1 - 2*qy2 - 2*qz2 59 | ret[:, 1, 0] = 2*qxqy - 2*qzqw 60 | ret[:, 2, 0] = 2*qxqz + 2*qyqw 61 | ret[:, 0, 1] = 2*qxqy + 2*qzqw 62 | ret[:, 1, 1] = 1 - 2*qx2 - 2*qz2 63 | ret[:, 2, 1] = 2*qyqz - 2*qxqw 64 | ret[:, 0, 2] = 2*qxqz - 2*qyqw 65 | ret[:, 1, 2] = 2*qyqz + 2*qxqw 66 | ret[:, 2, 2] = 1 - 2*qx2 - 2*qy2 67 | return torch.autograd.Variable(ret, requires_grad=False) 68 | 69 | def forward(self, locs, image, camera_pose, camera_rot, depth_mask=None): 70 | """ Forwad pass for the particle projection. Takes in the set of 71 | particles and outputs an image. 72 | TODO 73 | 74 | Arguments: 75 | -locs: A BxNx3 tensor where B is the batch size, N is the number 76 | of particles, and 3 is the dimensionality of the 77 | particles' coordinate space (this layer currently only 78 | supports 3D projections). 79 | -camera_pose: A Bx3 tensor containing the camera translation. 80 | -camera_rot: A Bx4 tensor containing the camera rotation as a 81 | quaternion in xyzw format. 82 | -depth_mask: An optional BxHxW tensor where W and H are the 83 | camera image width and height respectively. If not 84 | None, then this is used to compute occlusions. The 85 | value in each pixel in the depth_mask should be 86 | the distance to the first object. Any particles 87 | further away than that value will not be projected 88 | onto the output image. 89 | 90 | Returns: A BxHxW tensor of the projected particles. 91 | """ 92 | 93 | # Error checking. 
94 | batch_size = locs.size()[0] 95 | N = locs.size()[1] 96 | width = image.size()[3] 97 | height = image.size()[2] 98 | channels = image.size()[1] 99 | ec.check_tensor_dims(locs, "locs", (batch_size, N, 3)) 100 | ec.check_tensor_dims( 101 | image, "image", (batch_size, channels, height, width)) 102 | ec.check_tensor_dims(camera_pose, "camera_pose", (batch_size, 3)) 103 | ec.check_tensor_dims(camera_rot, "camera_rot", (batch_size, 4)) 104 | 105 | ec.check_nans(locs, "locs") 106 | ec.check_nans(image, "image") 107 | ec.check_nans(camera_pose, "camera_pose") 108 | ec.check_nans(camera_rot, "camera_rot") 109 | 110 | if depth_mask is not None: 111 | ec.check_tensor_dims(depth_mask, "depth_mask", (batch_size, 112 | height, width)) 113 | ec.check_nans(depth_mask, "depth_mask") 114 | depth_mask = depth_mask.contiguous() 115 | else: 116 | if (self.empty_depth_mask.size()[0] != batch_size or 117 | self.empty_depth_mask.size()[1] != height or 118 | self.empty_depth_mask.size()[2] != width): 119 | self.empty_depth_mask.resize_(batch_size, height, width) 120 | self.empty_depth_mask.fill_(MAX_FLOAT) 121 | depth_mask = torch.autograd.Variable( 122 | self.empty_depth_mask, requires_grad=False) 123 | if locs.is_cuda: 124 | depth_mask = depth_mask.cuda() 125 | 126 | # Let's transform the particles to camera space here. 127 | locs = locs - camera_pose.unsqueeze(1) 128 | # Ensure the rotation quaternion is normalized. 129 | camera_rot = camera_rot / \ 130 | torch.sqrt(torch.sum(camera_rot**2, 1, keepdim=True)) 131 | # Invert the rotation. 132 | inv = camera_rot.data.new(1, 4) 133 | inv[0, 0] = -1 134 | inv[0, 1] = -1 135 | inv[0, 2] = -1 136 | inv[0, 3] = 1 137 | inv = torch.autograd.Variable(inv, requires_grad=False) 138 | camera_rot = camera_rot*inv 139 | rot = self._rotationMatrixFromQuaternion(camera_rot) 140 | if (rot != rot).data.any(): 141 | raise ValueError("No NaNs found in camera_rot argument, but NaNs created when" 142 | " constructing a rotation matrix from it.") 143 | # Rotate the locs into camera space. 144 | try: 145 | # There's a bug that causes this to fail on the first call when using cuda. 146 | # To fix that, just call it again. 
147 | locs = torch.bmm(locs, rot) 148 | except RuntimeError: 149 | locs = torch.bmm(locs, rot) 150 | if (locs != locs).data.any(): 151 | raise ValueError( 152 | "Rotating locs by rotation matrix resulted in NaNs.") 153 | 154 | locs = locs.contiguous() 155 | image = image.contiguous() 156 | proj = _ImageProjectionFunction(self.camera_fl) 157 | ret = proj(locs, image, depth_mask) 158 | return ret 159 | 160 | 161 | """ 162 | 163 | INTERNAL FUNCTIONS 164 | 165 | """ 166 | 167 | 168 | class _ImageProjectionFunction(torch.autograd.Function): 169 | 170 | def __init__(self, camera_fl): 171 | super(_ImageProjectionFunction, self).__init__() 172 | self.camera_fl = camera_fl 173 | 174 | def forward(self, locs, image, depth_mask): 175 | self.save_for_backward(locs, image, depth_mask) 176 | batch_size = locs.size()[0] 177 | N = locs.size()[1] 178 | channels = image.size()[1] 179 | ret = locs.new(batch_size, N, channels) 180 | ret.fill_(0) 181 | if locs.is_cuda: 182 | if not _extc.spnc_imageprojection_forward(locs, image, 183 | self.camera_fl, depth_mask, ret): 184 | raise Exception("Cuda error") 185 | else: 186 | _ext.spn_imageprojection_forward(locs, image, 187 | self.camera_fl, depth_mask, ret) 188 | 189 | return ret 190 | 191 | def backward(self, grad_output): 192 | locs, image, depth_mask = self.saved_tensors 193 | ret_locs = grad_output.new(locs.size()) 194 | ret_locs.fill_(0) 195 | ret_image = grad_output.new(image.size()) 196 | ret_image.fill_(0) 197 | ret_depth_mask = grad_output.new(depth_mask.size()) 198 | ret_depth_mask.fill_(0) 199 | if grad_output.is_cuda: 200 | if not _extc.spnc_imageprojection_backward(locs, image, 201 | self.camera_fl, depth_mask, grad_output, ret_locs, ret_image): 202 | raise Exception("Cuda error") 203 | else: 204 | _ext.spn_imageprojection_backward(locs, image, 205 | self.camera_fl, depth_mask, grad_output, ret_locs, ret_image) 206 | 207 | return (ret_locs, 208 | ret_image, 209 | ret_depth_mask,) 210 | -------------------------------------------------------------------------------- /python/SmoothParticleNets/ParticleProjection.py: -------------------------------------------------------------------------------- 1 | 2 | import numbers 3 | import numpy as np 4 | 5 | import torch 6 | import torch.autograd 7 | 8 | import _ext 9 | import _extc 10 | import error_checking as ec 11 | from kernels import KERNELS, KERNEL_NAMES 12 | 13 | MAX_FLOAT = float(np.finfo(np.float32).max) 14 | 15 | 16 | class ParticleProjection(torch.nn.Module): 17 | """ The particle projection layer. Projects the given set of particles onto 18 | a camera image plane. For each particle, this layer finds its location on 19 | the image plane, then adds a small circular Gaussian centered at that location 20 | to the image. The contributions from all particles are added together into 21 | a final image. Note that unlike the other layers in this package, this layer 22 | only works with 3D particles. 23 | """ 24 | 25 | def __init__(self, camera_fl, camera_size, filter_std, filter_scale): 26 | """ Initialize a ParticleProjection layer. 27 | 28 | Arguments: 29 | -camera_fl: The camera focal length in pixels (all pixels are 30 | assumed to be square. This layer does not simulate 31 | any image warping e.g. radial distortion). 32 | -camera_size: 2-tuple with the image width and height in pixels. 33 | -filter_std: The standard deviation of the Gaussian that is 34 | added at each pixel location. 35 | -filter_scale: Before adding the Gaussian for an individual 36 | particle, it is scaled by this value. 
37 | """ 38 | super(ParticleProjection, self).__init__() 39 | 40 | self.camera_size = ec.make_list(camera_size, 2, "camera_size", 41 | "%s > 0", "isinstance(%s, numbers.Integral)") 42 | 43 | self.camera_fl = ec.check_conditions(camera_fl, "camera_fl", 44 | "%s > 0", "isinstance(%s, numbers.Real)") 45 | self.filter_std = ec.check_conditions(filter_std, "filter_std", 46 | "%s > 0", "isinstance(%s, numbers.Real)") 47 | self.filter_scale = ec.check_conditions(filter_scale, "filter_scale", 48 | "%s > 0", "isinstance(%s, numbers.Real)") 49 | 50 | self.register_buffer("empty_depth_mask", 51 | torch.ones(1, self.camera_size[1], self.camera_size[0])*MAX_FLOAT) 52 | 53 | def _rotationMatrixFromQuaternion(self, quat): 54 | """ 55 | 1 - 2*qy2 - 2*qz2 2*qx*qy - 2*qz*qw 2*qx*qz + 2*qy*qw 56 | 2*qx*qy + 2*qz*qw 1 - 2*qx2 - 2*qz2 2*qy*qz - 2*qx*qw 57 | 2*qx*qz - 2*qy*qw 2*qy*qz + 2*qx*qw 1 - 2*qx2 - 2*qy2 58 | """ 59 | quat = quat.data 60 | qx = quat[:, 0] 61 | qy = quat[:, 1] 62 | qz = quat[:, 2] 63 | qw = quat[:, 3] 64 | qx2 = qx*qx 65 | qxqy = qx*qy 66 | qxqz = qx*qz 67 | qxqw = qx*qw 68 | qy2 = qy*qy 69 | qyqz = qy*qz 70 | qyqw = qy*qw 71 | qz2 = qz*qz 72 | qzqw = qz*qw 73 | ret = quat.new(quat.size()[0], 3, 3) 74 | ret[:, 0, 0] = 1 - 2*qy2 - 2*qz2 75 | ret[:, 1, 0] = 2*qxqy - 2*qzqw 76 | ret[:, 2, 0] = 2*qxqz + 2*qyqw 77 | ret[:, 0, 1] = 2*qxqy + 2*qzqw 78 | ret[:, 1, 1] = 1 - 2*qx2 - 2*qz2 79 | ret[:, 2, 1] = 2*qyqz - 2*qxqw 80 | ret[:, 0, 2] = 2*qxqz - 2*qyqw 81 | ret[:, 1, 2] = 2*qyqz + 2*qxqw 82 | ret[:, 2, 2] = 1 - 2*qx2 - 2*qy2 83 | return torch.autograd.Variable(ret, requires_grad=False) 84 | 85 | def forward(self, locs, camera_pose, camera_rot, depth_mask=None): 86 | """ Forwad pass for the particle projection. Takes in the set of 87 | particles and outputs an image. 88 | 89 | Arguments: 90 | -locs: A BxNx3 tensor where B is the batch size, N is the number 91 | of particles, and 3 is the dimensionality of the 92 | particles' coordinate space (this layer currently only 93 | supports 3D projections). 94 | -camera_pose: A Bx3 tensor containing the camera translation. 95 | -camera_rot: A Bx4 tensor containing the camera rotation as a 96 | quaternion in xyzw format. 97 | -depth_mask: An optional BxHxW tensor where W and H are the 98 | camera image width and height respectively. If not 99 | None, then this is used to compute occlusions. The 100 | value in each pixel in the depth_mask should be 101 | the distance to the first object. Any particles 102 | further away than that value will not be projected 103 | onto the output image. 104 | 105 | Returns: A BxHxW tensor of the projected particles. 106 | """ 107 | 108 | # Error checking. 
109 | batch_size = locs.size()[0] 110 | N = locs.size()[1] 111 | ec.check_tensor_dims(locs, "locs", (batch_size, N, 3)) 112 | ec.check_tensor_dims(camera_pose, "camera_pose", (batch_size, 3)) 113 | ec.check_tensor_dims(camera_rot, "camera_rot", (batch_size, 4)) 114 | 115 | if depth_mask is not None: 116 | ec.check_tensor_dims(depth_mask, "depth_mask", (batch_size, 117 | self.camera_size[1], self.camera_size[0])) 118 | depth_mask = depth_mask.contiguous() 119 | else: 120 | if self.empty_depth_mask.size()[0] != batch_size: 121 | self.empty_depth_mask.resize_( 122 | batch_size, self.camera_size[1], self.camera_size[0]) 123 | self.empty_depth_mask.fill_(MAX_FLOAT) 124 | depth_mask = torch.autograd.Variable( 125 | self.empty_depth_mask, requires_grad=False) 126 | if locs.is_cuda: 127 | depth_mask = depth_mask.cuda() 128 | 129 | # Let's transform the particles to camera space here. 130 | locs = locs - camera_pose.unsqueeze(1) 131 | # Ensure the rotation quaternion is normalized. 132 | camera_rot = camera_rot / \ 133 | torch.sqrt(torch.sum(camera_rot**2, 1, keepdim=True)) 134 | # Invert the rotation. 135 | inv = camera_rot.data.new(1, 4) 136 | inv[0, 0] = -1 137 | inv[0, 1] = -1 138 | inv[0, 2] = -1 139 | inv[0, 3] = 1 140 | inv = torch.autograd.Variable(inv, requires_grad=False) 141 | camera_rot = camera_rot*inv 142 | rot = self._rotationMatrixFromQuaternion(camera_rot) 143 | # Rotate the locs into camera space. 144 | try: 145 | # There's a bug that causes this to fail on the first call when using cuda. 146 | # To fix that, just call it again. 147 | locs = torch.bmm(locs, rot) 148 | except RuntimeError: 149 | locs = torch.bmm(locs, rot) 150 | 151 | locs = locs.contiguous() 152 | proj = _ParticleProjectionFunction(self.camera_fl, self.camera_size, self.filter_std, 153 | self.filter_scale) 154 | ret = proj(locs, depth_mask) 155 | return ret 156 | 157 | 158 | """ 159 | 160 | INTERNAL FUNCTIONS 161 | 162 | """ 163 | 164 | 165 | class _ParticleProjectionFunction(torch.autograd.Function): 166 | 167 | def __init__(self, camera_fl, camera_size, filter_std, filter_scale): 168 | super(_ParticleProjectionFunction, self).__init__() 169 | self.camera_fl = camera_fl 170 | self.camera_size = camera_size 171 | self.filter_std = filter_std 172 | self.filter_scale = filter_scale 173 | 174 | def forward(self, locs, depth_mask): 175 | self.save_for_backward(locs, depth_mask) 176 | batch_size = locs.size()[0] 177 | ret = locs.new(batch_size, self.camera_size[1], self.camera_size[0]) 178 | ret.fill_(0) 179 | if locs.is_cuda: 180 | if not _extc.spnc_particleprojection_forward(locs, self.camera_fl, 181 | self.filter_std, self.filter_scale, depth_mask, ret): 182 | raise Exception("Cuda error") 183 | else: 184 | _ext.spn_particleprojection_forward(locs, self.camera_fl, 185 | self.filter_std, self.filter_scale, depth_mask, ret) 186 | 187 | return ret 188 | 189 | def backward(self, grad_output): 190 | locs, depth_mask = self.saved_tensors 191 | ret_locs = grad_output.new(locs.size()) 192 | ret_locs.fill_(0) 193 | ret_depth_mask = grad_output.new(depth_mask.size()) 194 | ret_depth_mask.fill_(0) 195 | if grad_output.is_cuda: 196 | if not _extc.spnc_particleprojection_backward(locs, 197 | self.camera_fl, self.filter_std, self.filter_scale, depth_mask, grad_output, ret_locs): 198 | raise Exception("Cuda error") 199 | else: 200 | _ext.spn_particleprojection_backward(locs, 201 | self.camera_fl, self.filter_std, self.filter_scale, depth_mask, grad_output, ret_locs) 202 | 203 | return (ret_locs, 204 | ret_depth_mask,) 205 | 
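For reference, a minimal usage sketch of the ParticleProjection layer defined above, assuming the package has been built and installed via setup.py and a PyTorch version matching the README (0.4.x). The camera, filter, and particle values below are illustrative only and do not come from this repository:

    import torch
    import SmoothParticleNets as spn

    # Illustrative camera: 64x48 image, 30 px focal length, unit Gaussian splats.
    proj = spn.ParticleProjection(camera_fl=30.0, camera_size=(64, 48),
                                  filter_std=1.0, filter_scale=1.0)

    batch_size, n_particles = 2, 100
    locs = torch.rand(batch_size, n_particles, 3)   # BxNx3 particle positions
    locs[:, :, 2] += 1.0                            # keep particles in front of the camera (+Z)
    camera_pose = torch.zeros(batch_size, 3)        # Bx3 camera translation
    camera_rot = torch.zeros(batch_size, 4)         # Bx4 quaternion in xyzw format
    camera_rot[:, 3] = 1.0                          # identity rotation
    image = proj(locs, camera_pose, camera_rot)     # BxHxW, here 2x48x64

Passing the optional depth_mask argument (a BxHxW tensor of distances to the nearest surface) suppresses particles that lie behind that surface, as described in the forward docstring.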
-------------------------------------------------------------------------------- /python/SmoothParticleNets/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from os.path import dirname, basename, isfile 3 | import glob 4 | import sys 5 | sys.path.append(dirname(__file__)) 6 | modules = glob.glob(dirname(__file__)+"/*.py") 7 | __all__ = [basename(f)[:-3] for f in modules if isfile(f)] 8 | for f in modules: 9 | if isfile(f) and "__init__" not in f and "install" not in f: 10 | exec('from %s import *' % basename(f)[:-3]) 11 | -------------------------------------------------------------------------------- /python/SmoothParticleNets/error_checking.py: -------------------------------------------------------------------------------- 1 | 2 | import numbers 3 | import numpy as np 4 | 5 | import torch 6 | 7 | def check_nans(v, name): 8 | if (v != v).data.any(): 9 | raise ValueError("Found NaNs in %s" % name) 10 | 11 | def throws_exception(exception_type, func, *args, **kwargs): 12 | try: 13 | func(*args, **kwargs) 14 | return False 15 | except exception_type: 16 | return True 17 | 18 | def check_conditions(v, name, *conditions): 19 | for condition in conditions: 20 | if not eval(condition % "v"): 21 | raise ValueError(("%s must meet the following condition: " + condition) 22 | % (name, name)) 23 | return v 24 | 25 | def make_list(l, length, name, *conditions): 26 | if throws_exception(TypeError, list, l): 27 | l = [l]*length 28 | else: 29 | l = list(l) 30 | if len(l) != length: 31 | raise ValueError("%s must be a list of length %d." % (name, length)) 32 | for i, ll in enumerate(l): 33 | l[i] = check_conditions(ll, name, *conditions) 34 | return l 35 | 36 | def check_tensor_dims(t, name, dims): 37 | s = t.size() 38 | if len(s) != len(dims): 39 | raise ValueError("%s must be a %d-dimensional tensor." % (name, len(dims))) 40 | for i in range(len(dims)): 41 | if dims[i] >= 0 and s[i] != dims[i]: 42 | raise ValueError("The %dth dimension of %s must have size %d, not %d." 
43 | % (i, name, dims[i], s[i])) 44 | 45 | def list2tensor(l): 46 | return torch.from_numpy(np.array(l, dtype=np.float32)) -------------------------------------------------------------------------------- /python/SmoothParticleNets/kernels.py: -------------------------------------------------------------------------------- 1 | 2 | KERNELS = {} 3 | DKERNELS = {} 4 | 5 | 6 | """ DEFAULT: 7 | \eta * \sigma * max(0, H^2 - d^2)^3 8 | H = radius 9 | d = distance 10 | \sigma = 1/pi (dim norm) 11 | \eta = 315/(64*H^9) (norm) 12 | """ 13 | KERNELS["default"] = ( 14 | "(315.0f/(64.0f*M_PI*H*H*H*H*H*H*H*H*H))*(H*H-d*d)*(H*H-d*d)*(H*H-d*d)") 15 | 16 | """ DDEFAULT: 17 | \eta * \sigma * d * max(0, H^2 - d^2)^2 18 | H = radius 19 | d = distance 20 | \sigma = 1/pi (dim norm) 21 | \eta = -945/(32*H^9) (norm) 22 | """ 23 | KERNELS["ddefault"] = "(-945.0f/(32.0f*M_PI*H*H*H*H*H*H*H*H*H))*(H*H-d*d)*(H*H-d*d)*d" 24 | DKERNELS["default"] = KERNELS["ddefault"] 25 | 26 | """ DDEFAULT2: 27 | \eta * \sigma * (H^4 - 6*H^2*d^2 + 5d^4) 28 | H = radius 29 | d = distance 30 | \sigma = 1/pi (dim norm) 31 | \eta = -945/(32*H^9) (norm) 32 | """ 33 | KERNELS["ddefault2"] = "(-945.0f/(32.0f*M_PI*H*H*H*H*H*H*H*H*H))*(H*H*H*H - 6*H*H*d*d + 5*d*d*d*d)" 34 | DKERNELS["ddefault"] = KERNELS["ddefault2"] 35 | DKERNELS["ddefault2"] = "(-945.0f/(32.0f*M_PI*H*H*H*H*H*H*H*H*H))*(20*d*d*d - 12*H*H*d)" 36 | 37 | """ PRESSURE: 38 | \eta * \sigma * max(0, H - d)^3 39 | H = radius 40 | d = distance 41 | \sigma = 1/pi (dim norm) 42 | \eta = 15/(H^6) (norm) 43 | """ 44 | KERNELS["pressure"] = "(15.0f/(M_PI*H*H*H*H*H*H))*(H-d)*(H-d)*(H-d)" 45 | 46 | """ DPRESSURE: 47 | \eta * \sigma * max(0, H - d)^2 48 | H = radius 49 | d = distance 50 | \sigma = 1/pi (dim norm) 51 | \eta = -45/(H^6) (norm) 52 | """ 53 | KERNELS["dpressure"] = "(-45.0f/(M_PI*H*H*H*H*H*H))*(H-d)*(H-d)" 54 | DKERNELS["pressure"] = KERNELS["dpressure"] 55 | 56 | """ DPRESSURE2: 57 | \eta * \sigma * max(0, H - d) * (H - 2*d)/2 58 | H = radius 59 | d = distance 60 | \sigma = 1/pi (dim norm) 61 | \eta = -90/(H^6) (norm) 62 | """ 63 | KERNELS["dpressure2"] = "(90.0f/(M_PI*H*H*H*H*H*H))*(H-d)" 64 | DKERNELS["dpressure"] = KERNELS["dpressure2"] 65 | DKERNELS["dpressure2"] = "(-90.0f/(M_PI*H*H*H*H*H*H))" 66 | 67 | """ INDIRECT: 68 | H - d 69 | H = radius 70 | d = distance 71 | """ 72 | KERNELS["indirect"] = "H - d" 73 | DKERNELS["indirect"] = "-1.0f" 74 | 75 | """ CONSTANT: 76 | 1 77 | """ 78 | KERNELS["constant"] = "1.0f" 79 | DKERNELS["constant"] = "0.0f" 80 | 81 | """ SPIKY: 82 | \eta * \sigma * (1 - d/H)^2 83 | H = radius 84 | d = distance 85 | \sigma = 1/pi (dim norm) 86 | \eta = 15/(H^3) (norm) 87 | """ 88 | KERNELS["spiky"] = "15.0f/(M_PI*H*H*H)*(1.0f-d/H)*(1.0f-d/H)" 89 | 90 | """ DSPIKY: 91 | \eta * \sigma * 2 * (1 - d/H)/H 92 | H = radius 93 | d = distance 94 | \sigma = 1/pi (dim norm) 95 | \eta = 15/(H^3) (norm) 96 | """ 97 | KERNELS["dspiky"] = "-15.0f/(M_PI*H*H*H)*2.0f*(1.0f - d/H)/H" 98 | DKERNELS["spiky"] = KERNELS["dspiky"] 99 | DKERNELS["dspiky"] = "-15.0f/(M_PI*H*H*H)*2.0f*(-1.0f/H)/H" 100 | 101 | """ COHESION: 102 | -(1.0f + \eta)/\eta^2*(d/H)^3 + (\eta^2 + \eta + 1)/\eta^2*(d/H)^2 - 1 103 | \eta * \sigma * (1 - d/H)^2 104 | H = radius 105 | d = distance 106 | \eta = 0.5 (rest) 107 | """ 108 | KERNELS["cohesion"] = "-6.0f*(d/H)*(d/H)*(d/H) + 7*(d/H)*(d/H) - 1" 109 | DKERNELS["cohesion"] = "2.0f*d*(7.0f*H - 9.0f*d)/(H*H*H)" 110 | 111 | """ SIGMOID: 112 | 1/(1 + exp((d - C*H)*S/H)) 113 | H = radius 114 | d = distance 115 | S = 20 (sharpness) 116 | C = 0.2 (center ratio) 
117 | """ 118 | KERNELS["sigmoid"] = "1.0f/(1.0f + expf((d - 0.2f*H)*20.0f/H))" 119 | # -S*expf((d - C*H)*S/H)/(H*(expf((d - C*H)*S/H) + 1.0f)*(expf((d - C*H)*S/H) + 1.0f)) 120 | DKERNELS["sigmoid"] = ("-20.0f*expf((d - 0.2f*H)*20.0f/H)/" + 121 | "(H*(expf((d - 0.2f*H)*20.0f/H) + 1.0f)*(expf((d - 0.2f*H)*20.0f/H) + 1.0f))") 122 | 123 | KERNEL_NAMES = sorted(KERNELS.keys()) 124 | 125 | import math 126 | KERNEL_FN = {k : eval("lambda d, H: " + v 127 | .replace("M_PI", "math.pi") 128 | .replace("fmaxf", "max") 129 | .replace("expf", "math.exp") 130 | .replace("f", "")) 131 | for k,v in KERNELS.items()} -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | 5 | from setuptools import setup 6 | import torch 7 | from torch.utils.cpp_extension import CppExtension, BuildExtension, CUDAExtension 8 | 9 | # Parse command line args. 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--with_cuda', action="store_true", default=None) 12 | parser.add_argument('--without_cuda', action="store_true", default=None) 13 | args, unknown = parser.parse_known_args() 14 | sys.argv = sys.argv[:2] + unknown 15 | 16 | if args.with_cuda is None: 17 | if args.without_cuda is not None: 18 | args.with_cuda = not args.without_cuda 19 | else: 20 | print("--with_cuda or --without_cuda not specified, using PyTorch to decide...") 21 | args.with_cuda = torch.cuda.is_available() 22 | if args.with_cuda: 23 | print("torch.cuda.is_available says True, proceeding to build with cuda.") 24 | else: 25 | print("torch.cuda.is_available says False, proceeding to build without cuda.") 26 | 27 | 28 | # Setup global variables. 29 | root_dir = os.path.dirname(os.path.abspath(__file__)) 30 | test_dir = os.path.join(root_dir, "tests") 31 | py_dir = os.path.join(root_dir, "python", "SmoothParticleNets") 32 | src_dir = os.path.join(root_dir, "src") 33 | 34 | # Create pytest args. 35 | pytest_args = { 36 | 'with_cuda': args.with_cuda, 37 | } 38 | fp = open(os.path.join(test_dir, "pytest_args.py"), "w") 39 | for k, v in pytest_args.items(): 40 | if isinstance(v, str): 41 | v = "'" + v + "'" 42 | fp.write("%s = %s\n" % (k, str(v))) 43 | fp.close() 44 | 45 | # Build kernel_constants.h 46 | # Add path to python source to path. 47 | sys.path.append(py_dir) 48 | from kernels import KERNELS, KERNEL_NAMES, DKERNELS 49 | fp = open(os.path.join(src_dir, "kernel_constants.h"), "w") 50 | fp.write("// THIS FILE IS AUTOGENERATED. 
DO NOT ALTER.\n") 51 | fp.write("#ifndef __kernel_constants_h__\n") 52 | fp.write("#define __kernel_constants_h__\n") 53 | fp.write("#ifdef __cplusplus\n") 54 | fp.write("extern \"C\" {\n") 55 | fp.write("#endif\n") 56 | fp.write("\n") 57 | fp.write("#include \n") 58 | fp.write("#include \n") 59 | fp.write("\n") 60 | fp.write("#ifdef CUDA\n") 61 | fp.write("__host__ __device__\n") 62 | fp.write("#endif\n") 63 | fp.write("inline\n") 64 | fp.write("float KERNEL_W(float d, float H, int fn) {\n") 65 | fp.write(" float ret = 0.0f;\n") 66 | for i, k in enumerate(KERNEL_NAMES): 67 | fp.write(" if(fn == %d) { ret = (%s); }\n" % (i, KERNELS[k])) 68 | fp.write(" return ret;\n") 69 | fp.write("}\n\n") 70 | fp.write("#ifdef CUDA\n") 71 | fp.write("__host__ __device__\n") 72 | fp.write("#endif\n") 73 | fp.write("inline\n") 74 | fp.write("float KERNEL_DW(float d, float H, int fn) {\n") 75 | fp.write(" float ret = 0.0f;\n") 76 | for i, k in enumerate(KERNEL_NAMES): 77 | fp.write(" if(fn == %d) { ret = (%s); }\n" % (i, DKERNELS[k])) 78 | fp.write(" return ret;\n") 79 | fp.write("}\n\n") 80 | fp.write("#define VALIDATE_KERNEL_ID(fn) (fn >= 0 && fn < %d)" % len(KERNELS)) 81 | fp.write("\n") 82 | fp.write("#ifdef __cplusplus\n") 83 | fp.write("}\n") 84 | fp.write("#endif\n") 85 | fp.write("#endif\n") 86 | fp.flush() 87 | fp.close() 88 | 89 | # Define extensions. 90 | ext_modules = [ 91 | CppExtension('SmoothParticleNets._ext', [ 92 | os.path.join(src_dir, 'cpu_layer_funcs.cpp'), 93 | ]), 94 | ] 95 | if args.with_cuda: 96 | ext_modules.append(CUDAExtension('SmoothParticleNets._extc', [ 97 | os.path.join(src_dir, 'cuda_layer_funcs.cpp'), 98 | os.path.join(src_dir, 'gpu_kernels.cu'), 99 | ])) 100 | 101 | # The main setup call. 102 | setup( 103 | name='SmoothParticleNets', 104 | package_dir={'': 'python'}, 105 | packages=['SmoothParticleNets'], 106 | ext_modules=ext_modules, 107 | cmdclass={ 108 | 'build_ext': BuildExtension 109 | }) 110 | -------------------------------------------------------------------------------- /src/constants.h: -------------------------------------------------------------------------------- 1 | #ifndef __constants_h__ 2 | #define __constants_h__ 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | #define MAX_CARTESIAN_DIM 20 8 | 9 | #ifdef __cplusplus 10 | } 11 | #endif 12 | 13 | #endif -------------------------------------------------------------------------------- /src/gpu_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef __gpu_kernels_h__ 2 | #define __gpu_kernels_h__ 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | int cuda_convsp( 8 | const float* qlocs, 9 | const float* locs, 10 | const float* data, 11 | const float* neighbors, 12 | const float* weight, 13 | const float* bias, 14 | const int batch_size, 15 | const int M, 16 | const int N, 17 | const int nchannels, 18 | const int ndims, 19 | const int max_neighbors, 20 | const int nkernels, 21 | const int ncells, 22 | const float radius, 23 | const float* kernel_size, 24 | const float* dilation, 25 | const int dis_norm, 26 | const int kernel_fn, 27 | float* out, 28 | float* dqlocs, 29 | float* dlocs, 30 | float* ddata, 31 | float* dweight, 32 | cudaStream_t stream, 33 | const size_t nshared_device_mem); 34 | 35 | int cuda_convsdf( 36 | const float* locs, 37 | const int batch_size, 38 | const int N, 39 | const int ndims, 40 | const float* idxs, 41 | const float* poses, 42 | const float* scales, 43 | const int M, 44 | const int pose_len, 45 | const float* sdfs, 46 
| const float* sdf_offsets, 47 | const float* sdf_shapes, 48 | const float* weight, 49 | const float* bias, 50 | const int nkernels, 51 | const int ncells, 52 | const float* kernel_size, 53 | const float* dilation, 54 | const float max_distance, 55 | float* out, 56 | float* dlocs, 57 | float* dweight, 58 | float* dposes, 59 | cudaStream_t stream); 60 | 61 | size_t GetSharedMemPerBlock(int device); 62 | 63 | int cuda_hashgrid_order( 64 | float* locs, 65 | const float* low, 66 | const float* grid_dims, 67 | float* cellIDs, 68 | float* idxs, 69 | float* buffer, 70 | const int batch_size, 71 | const int N, 72 | const int ndims, 73 | const float cellEdge, 74 | cudaStream_t stream); 75 | 76 | int cuda_compute_collisions( 77 | const float* qlocs, 78 | const float* locs, 79 | const float* low, 80 | const float* grid_dims, 81 | const float* cellIDs, 82 | float* cellStarts, 83 | float* cellEnds, 84 | float* collisions, 85 | const int batch_size, 86 | const int M, 87 | const int N, 88 | const int ndims, 89 | const int max_collisions, 90 | const int ncells, 91 | const float cellEdge, 92 | const float radius, 93 | const int include_self, 94 | cudaStream_t stream); 95 | 96 | int cuda_reorder_data( 97 | float* locs, 98 | float* data, 99 | float* idxs, 100 | float* nlocs, 101 | float* ndata, 102 | const int batch_size, 103 | const int N, 104 | const int ndims, 105 | const int nchannels, 106 | const int reverse, 107 | cudaStream_t stream); 108 | 109 | size_t get_radixsort_buffer_size(cudaStream_t stream); 110 | 111 | int cuda_particleprojection( 112 | const float* locs, 113 | const float camera_fl, 114 | const float filter_std, 115 | const float filter_scale, 116 | const float* depth_mask, 117 | const int batch_size, 118 | const int N, 119 | const int width, 120 | const int height, 121 | float* out, 122 | float* dlocs, 123 | cudaStream_t stream); 124 | 125 | int cuda_imageprojection( 126 | const float* locs, 127 | const float* image, 128 | const float camera_fl, 129 | const float* depth_mask, 130 | const int batch_size, 131 | const int N, 132 | const int width, 133 | const int height, 134 | const int channels, 135 | float* out, 136 | float* dlocs, 137 | float* dimage, 138 | cudaStream_t stream); 139 | 140 | #ifdef __cplusplus 141 | } 142 | #endif 143 | 144 | #endif -------------------------------------------------------------------------------- /tests/test_convsp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | # Add path to python source to path. 
4 | sys.path.append(os.path.join(os.path.dirname( 5 | os.path.dirname(os.path.abspath(__file__))), "python")) 6 | import SmoothParticleNets as spn 7 | 8 | import itertools 9 | import numpy as np 10 | import torch 11 | import torch.autograd 12 | 13 | from gradcheck import gradcheck 14 | try: 15 | import pytest_args 16 | except ImportError: 17 | print("Make sure to compile SmoothParticleNets before running tests.") 18 | raise 19 | 20 | 21 | def pyconvsp(qlocs, locs, data, weights, biases, kernel_fn, KERNEL_SIZE, RADIUS, DILATION, NKERNELS): 22 | w = spn.KERNEL_FN[kernel_fn] 23 | 24 | BATCH_SIZE = locs.shape[0] 25 | M = qlocs.shape[1] 26 | N = locs.shape[1] 27 | NDIM = locs.shape[-1] 28 | 29 | kernel_centers = (np.array(KERNEL_SIZE) - 1)/2 30 | ground_truth = np.zeros((BATCH_SIZE, M, NKERNELS), dtype=data.dtype) 31 | for b in range(BATCH_SIZE): 32 | for i in range(M): 33 | for j in range(N): 34 | dd = np.square(qlocs[b, i, :] - locs[b, j, :]).sum() 35 | nr = DILATION*max(KERNEL_SIZE)/2 + RADIUS 36 | if dd > nr*nr: 37 | continue 38 | for k, idxs in enumerate(itertools.product(*[range(x) for x in KERNEL_SIZE[::-1]])): 39 | dd = np.square(qlocs[b, i, :] + (idxs[::-1] - kernel_centers)*DILATION 40 | - locs[b, j, :]).sum() 41 | if dd > RADIUS*RADIUS: 42 | continue 43 | ground_truth[b, i, :] += weights[:, :, k].dot( 44 | w(np.sqrt(dd), RADIUS)*data[b, j, :]) 45 | ground_truth += biases[np.newaxis, np.newaxis, :] 46 | return ground_truth 47 | 48 | 49 | def test_convsp(cpu=True, cuda=True): 50 | if cpu: 51 | print("Testing CPU implementation of ConvSP...") 52 | eval_convsp(cuda=False) 53 | print("CPU implementation passed!") 54 | print("") 55 | 56 | if cuda: 57 | if pytest_args.with_cuda: 58 | print("Testing CUDA implementation of ConvSP...") 59 | eval_convsp(cuda=True) 60 | print("CUDA implementation passed!") 61 | else: 62 | print("Not compiled with CUDA, skipping CUDA test.") 63 | 64 | 65 | def eval_convsp(cuda=False): 66 | BATCH_SIZE = 2 67 | N = 5 68 | M = 3 69 | NDIM = 2 70 | KERNEL_SIZE = (3, 1) 71 | RADIUS = 1.0 72 | DILATION = 0.05 73 | NCHANNELS = 2 74 | NKERNELS = 3 75 | 76 | np.random.seed(0) 77 | 78 | locs = np.random.rand(BATCH_SIZE, N, NDIM).astype(np.float32) 79 | qlocs = np.random.rand(BATCH_SIZE, M, NDIM).astype(np.float32) 80 | data = np.random.rand(BATCH_SIZE, N, NCHANNELS).astype(np.float32) 81 | weights = np.random.rand(NKERNELS, NCHANNELS, np.prod( 82 | KERNEL_SIZE)).astype(np.float32) 83 | biases = np.random.rand(NKERNELS).astype(np.float32) 84 | 85 | def use_cuda(x): 86 | if cuda: 87 | return x.cuda() 88 | else: 89 | return x 90 | 91 | def undo_cuda(x): 92 | if cuda: 93 | return x.cpu() 94 | else: 95 | return x 96 | 97 | for use_qlocs in (True, False): 98 | 99 | locs_t = torch.autograd.Variable( 100 | use_cuda(torch.FloatTensor(locs)), requires_grad=True) 101 | if use_qlocs: 102 | qlocs_t = torch.autograd.Variable( 103 | use_cuda(torch.FloatTensor(qlocs)), requires_grad=True) 104 | else: 105 | qlocs_t = None 106 | data_t = torch.autograd.Variable( 107 | use_cuda(torch.FloatTensor(data)), requires_grad=True) 108 | weights_t = torch.nn.Parameter( 109 | torch.FloatTensor(weights), requires_grad=True) 110 | biases_t = torch.nn.Parameter( 111 | torch.FloatTensor(biases), requires_grad=True) 112 | 113 | coll = use_cuda(spn.ParticleCollision(NDIM, 114 | RADIUS + DILATION*max((k - 1)/2 for k in KERNEL_SIZE))) 115 | locs_t, data_t, idxs_t, neighbors_t = coll( 116 | locs_t, data_t, (qlocs_t if use_qlocs else None)) 117 | 118 | for kernel_fn in spn.KERNEL_NAMES: 119 | print("\tTesting 
kernel %s (%s query locations)..." % 120 | (kernel_fn, "with" if use_qlocs else "without")) 121 | ground_truth = pyconvsp((qlocs if use_qlocs else locs), locs, data, weights, biases, 122 | kernel_fn, KERNEL_SIZE, RADIUS, DILATION, NKERNELS) 123 | 124 | convsp = spn.ConvSP(NCHANNELS, NKERNELS, NDIM, KERNEL_SIZE, DILATION, RADIUS, 125 | kernel_fn=kernel_fn) 126 | convsp.weight = weights_t 127 | convsp.bias = biases_t 128 | convsp = use_cuda(convsp) 129 | 130 | pred_t = undo_cuda(convsp(locs_t, data_t, neighbors_t, qlocs_t)) 131 | np.testing.assert_array_almost_equal( 132 | pred_t.data.numpy(), ground_truth, decimal=3) 133 | 134 | dt = torch.autograd.Variable(data_t.data, requires_grad=True) 135 | lt = torch.autograd.Variable(locs_t.data, requires_grad=True) 136 | if use_qlocs: 137 | qt = torch.autograd.Variable(qlocs_t.data, requires_grad=True) 138 | wt = torch.nn.Parameter(weights_t.data, requires_grad=True) 139 | bt = torch.nn.Parameter(biases_t.data, requires_grad=True) 140 | # Use pyconvsp to allow for double precision when computing numeric grads. 141 | 142 | def func_numerical(l, d, w, b, q=None): 143 | return (torch.autograd.Variable(torch.from_numpy( 144 | pyconvsp((q.data.cpu().numpy() if use_qlocs else l.data.cpu().numpy()), 145 | l.data.cpu().numpy(), 146 | d.data.cpu().numpy(), w.data.cpu().numpy(), b.data.cpu().numpy(), 147 | kernel_fn, KERNEL_SIZE, RADIUS, DILATION, NKERNELS))),) 148 | 149 | def func_analytical(l, d, w, b, q=None): 150 | convsp.weight = w 151 | convsp.bias = b 152 | return (convsp(l, d, neighbors_t, (q if use_qlocs else None)),) 153 | assert gradcheck(func_analytical, 154 | ((lt, dt, wt, bt, qt) 155 | if use_qlocs else (lt, dt, wt, bt,)), 156 | eps=1e-4, atol=1e-3, rtol=1e-1, func_numerical=func_numerical, use_double=True) 157 | 158 | 159 | if __name__ == '__main__': 160 | import argparse 161 | parser = argparse.ArgumentParser() 162 | parser.add_argument('--cpu', dest='cpu', action="store_true", default=True) 163 | parser.add_argument('--no-cpu', dest='cpu', action="store_false") 164 | parser.add_argument('--cuda', dest='cuda', 165 | action="store_true", default=True) 166 | parser.add_argument('--no-cuda', dest='cuda', action="store_false") 167 | args = parser.parse_args() 168 | test_convsp(cpu=args.cpu, cuda=args.cuda) 169 | -------------------------------------------------------------------------------- /tests/test_imageprojection.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | # Add path to python source to path. 
4 | sys.path.append(os.path.join(os.path.dirname( 5 | os.path.dirname(os.path.abspath(__file__))), "python")) 6 | import SmoothParticleNets as spn 7 | 8 | import itertools 9 | import numpy as np 10 | import torch 11 | import torch.autograd 12 | 13 | from gradcheck import gradcheck 14 | from test_convsdf import quaternionMult, quaternionConjugate 15 | from regular_grid_interpolater import RegularGridInterpolator 16 | try: 17 | import pytest_args 18 | except ImportError: 19 | print("Make sure to compile SmoothParticleNets before running tests.") 20 | raise 21 | 22 | 23 | def pyproject(locs, image, camera_fl, camera_pose, 24 | camera_rot, depth_mask=None, dtype=np.float32): 25 | batch_size = locs.shape[0] 26 | N = locs.shape[1] 27 | channels = image.shape[1] 28 | width = image.shape[3] 29 | height = image.shape[2] 30 | ret = np.zeros((batch_size, N, channels), dtype=dtype) 31 | if depth_mask is None: 32 | depth_mask = np.ones((batch_size, height, width), 33 | dtype=dtype)*np.finfo(np.float32).max 34 | depth_fns = [RegularGridInterpolator( 35 | [np.arange(0.5, width, 1), np.arange(0.5, height, 1)], 36 | depth_mask[b, ...].transpose(), bounds_error=False, fill_value=np.finfo(np.float32).max) 37 | for b in range(batch_size)] 38 | for b in range(batch_size): 39 | r = locs[b, ...] - camera_pose[b, ...] 40 | r = np.concatenate((r, np.zeros((N, 1), dtype=r.dtype)), axis=-1) 41 | r = np.array([quaternionMult(quaternionConjugate(camera_rot[b, :]), 42 | quaternionMult(r[i, ...], camera_rot[b, :])) for i in range(N)], dtype=dtype) 43 | ijs = np.concatenate(( 44 | r[:, 0:1]*camera_fl/r[:, 2:3] + width/2.0, 45 | r[:, 1:2]*camera_fl/r[:, 2:3] + height/2.0, 46 | ), axis=-1) 47 | depths = depth_fns[b](ijs) 48 | mask = (r[:, 2] <= depths)*(r[:, 2] > 0) 49 | for c in range(channels): 50 | fn = RegularGridInterpolator( 51 | [np.arange(0.5, width, 1), np.arange(0.5, height, 1)], 52 | image[b, c, ...].transpose(), bounds_error=False, fill_value=0) 53 | ret[b, :, c] = fn(ijs)*mask 54 | 55 | return ret 56 | 57 | 58 | def test_imageprojection(cpu=True, cuda=True): 59 | if cpu: 60 | print("Testing CPU implementation of ImageProjection...") 61 | eval_imageprojection(cuda=False) 62 | print("CPU implementation passed!") 63 | print("") 64 | 65 | if cuda: 66 | if pytest_args.with_cuda: 67 | print("Testing CUDA implementation of ImageProjection...") 68 | eval_imageprojection(cuda=True) 69 | print("CUDA implementation passed!") 70 | else: 71 | print("Not compiled with CUDA, skipping CUDA test.") 72 | 73 | 74 | def eval_imageprojection(cuda=False): 75 | np.random.seed(1) 76 | BATCH_SIZE = 2 77 | N = 5 78 | CHANNELS = 2 79 | CAMERA_FOV = 45.0/180.0*np.pi 80 | CAMERA_SIZE = (30, 30) 81 | CAMERA_FL = CAMERA_SIZE[0]/2/(CAMERA_FOV/2.0) 82 | CAMERA_POSE = 5.0*(np.random.rand(BATCH_SIZE, 3).astype(np.float32) - 0.5) 83 | CAMERA_TARGET = np.array([(0.0, 0.0, 0.0)]*BATCH_SIZE, dtype=np.float32) 84 | 85 | CAMERA_ROT = np.zeros((BATCH_SIZE, 4), dtype=np.float32) 86 | for b in range(BATCH_SIZE): 87 | CAMERA_ROT[b, :] = pointAt( 88 | CAMERA_POSE[b, :], np.array([0, 0, 0], dtype=np.float32)) 89 | 90 | locs = 2.0*(np.random.rand(BATCH_SIZE, N, 3).astype(np.float32) - 0.5) 91 | image = np.random.rand(BATCH_SIZE, CHANNELS, 92 | CAMERA_SIZE[1], CAMERA_SIZE[0]) 93 | depth_mask = np.ones((BATCH_SIZE, CAMERA_SIZE[1], CAMERA_SIZE[0]), 94 | dtype=np.float32)*np.finfo(np.float32).max 95 | ir = (int(CAMERA_SIZE[0]/2 - CAMERA_SIZE[0]*0.2), 96 | int(CAMERA_SIZE[0]/2 + CAMERA_SIZE[0]*0.2) + 1) 97 | jr = (int(CAMERA_SIZE[1]/2 - CAMERA_SIZE[1]*0.2), 98 | 
int(CAMERA_SIZE[1]/2 + CAMERA_SIZE[1]*0.2) + 1) 99 | ul = 0.0 100 | lr = 10.0 101 | ur = 5.0 102 | ll = 3.5 103 | for i in range(ir[0], ir[1]): 104 | for j in range(jr[0], jr[1]): 105 | ii = 1.0*(i - ir[0])/(ir[1] - ir[0]) 106 | jj = 1.0*(j - jr[0])/(jr[1] - jr[0]) 107 | l = ul*(1 - jj) + ll*jj 108 | r = ur*(1 - jj) + lr*jj 109 | depth_mask[0, j, i] = l*(1 - ii) + r*ii 110 | 111 | def use_cuda(x): 112 | if cuda: 113 | return x.cuda() 114 | else: 115 | return x 116 | 117 | def undo_cuda(x): 118 | if cuda: 119 | return x.cpu() 120 | else: 121 | return x 122 | 123 | def np2var(t): 124 | return torch.autograd.Variable(use_cuda(torch.from_numpy(t)), requires_grad=False) 125 | 126 | locs_t = torch.autograd.Variable( 127 | use_cuda(torch.FloatTensor(locs)), requires_grad=True) 128 | image_t = torch.autograd.Variable( 129 | use_cuda(torch.FloatTensor(image)), requires_grad=True) 130 | depth_mask_t = torch.autograd.Variable( 131 | use_cuda(torch.FloatTensor(depth_mask)), requires_grad=False) 132 | camera_pose_t = torch.autograd.Variable(use_cuda(torch.FloatTensor(CAMERA_POSE)), 133 | requires_grad=False) 134 | camera_rot_t = torch.autograd.Variable(use_cuda(torch.FloatTensor(CAMERA_ROT)), 135 | requires_grad=False) 136 | 137 | imageProjection = spn.ImageProjection(CAMERA_FL) 138 | 139 | ground_truth = pyproject(locs, image, CAMERA_FL, 140 | CAMERA_POSE, CAMERA_ROT, depth_mask) 141 | pred_t = imageProjection( 142 | locs_t, image_t, camera_pose_t, camera_rot_t, depth_mask_t) 143 | pred = undo_cuda(pred_t).data.numpy() 144 | np.testing.assert_array_almost_equal(pred, ground_truth, decimal=3) 145 | 146 | # Use pyproject to allow for double precision when computing numeric grads. 147 | def func_numerical(l, i): 148 | ll = undo_cuda(l).data.numpy() 149 | ii = undo_cuda(i).data.numpy() 150 | return torch.autograd.Variable(use_cuda(torch.from_numpy(pyproject(ll, ii, CAMERA_FL, CAMERA_POSE, 151 | CAMERA_ROT, dtype=np.float64))), requires_grad=False) 152 | 153 | def func_analytical(l, i): 154 | return imageProjection(l, i, camera_pose_t, camera_rot_t) 155 | assert torch.autograd.gradcheck(func_analytical, (locs_t, image_t,), 156 | eps=1e-3, atol=1e-3, rtol=1e-1) 157 | 158 | 159 | def quaternionFromMatrix(matrix): 160 | M = matrix 161 | m00 = M[0, 0] 162 | m01 = M[0, 1] 163 | m02 = M[0, 2] 164 | m10 = M[1, 0] 165 | m11 = M[1, 1] 166 | m12 = M[1, 2] 167 | m20 = M[2, 0] 168 | m21 = M[2, 1] 169 | m22 = M[2, 2] 170 | # symmetric matrix K 171 | K = np.array([[m00-m11-m22, 0.0, 0.0, 0.0], 172 | [m01+m10, m11-m00-m22, 0.0, 0.0], 173 | [m02+m20, m12+m21, m22-m00-m11, 0.0], 174 | [m21-m12, m02-m20, m10-m01, m00+m11+m22]]) 175 | K /= 3.0 176 | # quaternion is eigenvector of K that corresponds to largest eigenvalue 177 | w, V = np.linalg.eigh(K) 178 | q = V[[3, 0, 1, 2], np.argmax(w)] 179 | if q[0] < 0.0: 180 | np.negative(q, q) 181 | return [q[1], q[2], q[3], q[0]] 182 | 183 | 184 | def pointAt(pose, target): 185 | # Convention: +Z=out of camera, +Y=Down, +X=right 186 | z = target - pose 187 | z /= np.sqrt(np.sum(z**2)) 188 | y = np.array([0, -1, 0], dtype=np.float32) 189 | x = np.cross(y, z) 190 | x /= np.sqrt(np.sum(x**2)) 191 | y = np.cross(z, x) 192 | ret = quaternionFromMatrix(np.array([x, y, z]).transpose()) 193 | return ret 194 | 195 | 196 | if __name__ == '__main__': 197 | import argparse 198 | parser = argparse.ArgumentParser() 199 | parser.add_argument('--cpu', dest='cpu', action="store_true", default=True) 200 | parser.add_argument('--no-cpu', dest='cpu', action="store_false") 201 | 
parser.add_argument('--cuda', dest='cuda', 202 | action="store_true", default=True) 203 | parser.add_argument('--no-cuda', dest='cuda', action="store_false") 204 | args = parser.parse_args() 205 | test_imageprojection(cpu=args.cpu, cuda=args.cuda) 206 | -------------------------------------------------------------------------------- /tests/test_particlecollision.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | # Add path to python source to path. 4 | sys.path.append(os.path.join(os.path.dirname(os.path.dirname( 5 | os.path.abspath(__file__))), "python")) 6 | import SmoothParticleNets as spn 7 | 8 | import itertools 9 | import numpy as np 10 | import torch 11 | import torch.autograd 12 | 13 | from gradcheck import gradcheck 14 | try: 15 | import pytest_args 16 | except ImportError: 17 | print("Make sure to compile SmoothParticleNets before running tests.") 18 | raise 19 | 20 | 21 | def test_particlecollision(cpu=True, cuda=True): 22 | if cpu: 23 | print("Testing CPU implementation of ParticleCollision...") 24 | eval_particlecollision(cuda=False) 25 | print("CPU implementation passed!") 26 | print("") 27 | 28 | if cuda: 29 | if pytest_args.with_cuda: 30 | print("Testing CUDA implementation of ParticleCollision...") 31 | eval_particlecollision(cuda=True) 32 | print("CUDA implementation passed!") 33 | else: 34 | print("Not compiled with CUDA, skipping CUDA test.") 35 | 36 | def eval_particlecollision(cuda=False): 37 | BATCH_SIZE = 2 38 | N = 100 39 | M = 77 40 | NDIM = 2 41 | RADIUS = 0.2 42 | NCHANNELS = 2 43 | 44 | np.random.seed(0) 45 | 46 | locs = np.random.rand(BATCH_SIZE, N, NDIM).astype(np.float32) 47 | qlocs = np.random.rand(BATCH_SIZE, M, NDIM).astype(np.float32) 48 | data = np.random.rand(BATCH_SIZE, N, NCHANNELS).astype(np.float32) 49 | 50 | gt_neighbors = np.ones((BATCH_SIZE, M, N), dtype=int)*-1 51 | for b in range(BATCH_SIZE): 52 | for i in range(M): 53 | for j in range(N): 54 | d = np.square(qlocs[b, i, :] - locs[b, j, :]).sum() 55 | if d <= RADIUS*RADIUS: 56 | nc = min(np.where(gt_neighbors[b, i, :] < 0)[0]) 57 | gt_neighbors[b, i, nc] = j 58 | 59 | def use_cuda(x): 60 | if cuda: 61 | return x.cuda() 62 | else: 63 | return x 64 | def undo_cuda(x): 65 | if cuda: 66 | return x.cpu() 67 | else: 68 | return x 69 | 70 | olocs = locs 71 | oqlocs = qlocs 72 | odata = data 73 | locs = torch.autograd.Variable(use_cuda(torch.FloatTensor(locs.copy())), 74 | requires_grad=False) 75 | qlocs = torch.autograd.Variable(use_cuda(torch.FloatTensor(qlocs.copy())), 76 | requires_grad=False) 77 | data = torch.autograd.Variable(use_cuda(torch.FloatTensor(data.copy())), 78 | requires_grad=False) 79 | 80 | coll = spn.ParticleCollision(NDIM, RADIUS, max_collisions=N) 81 | convsp = use_cuda(coll) 82 | 83 | vlocs, vdata, vidxs, vneighbors = coll(locs, data, qlocs) 84 | 85 | idxs = undo_cuda(vidxs).data.numpy().astype(int) 86 | neighbors = undo_cuda(vneighbors).data.numpy().astype(int) 87 | nlocs = undo_cuda(vlocs).data.numpy() 88 | ndata = undo_cuda(vdata).data.numpy() 89 | 90 | # First make sure all the indexes are in idxs. 91 | for b in range(BATCH_SIZE): 92 | for i in range(N): 93 | assert i in idxs[b, :] 94 | 95 | # Next make sure locs and data are in the order idxs says they're in. 96 | for b in range(BATCH_SIZE): 97 | for i, j in enumerate(idxs[b, :]): 98 | assert all(olocs[b, j, :] == nlocs[b, i, :]) 99 | assert all(odata[b, j, :] == ndata[b, i, :]) 100 | 101 | # Make sure the input locs and data weren't altered. 
102 | assert np.all(undo_cuda(locs).data.numpy() == olocs) 103 | assert np.all(undo_cuda(data).data.numpy() == odata) 104 | 105 | # Check the neighbor list. 106 | for b in range(BATCH_SIZE): 107 | for i in range(M): 108 | for j in neighbors[b, i, :]: 109 | if j < 0: 110 | break 111 | assert idxs[b, j] in gt_neighbors[b, i, :] 112 | for j in gt_neighbors[b, i, :]: 113 | if j < 0: 114 | break 115 | jj = np.where(idxs[b, :] == j)[0][0] 116 | assert jj in neighbors[b, i, :] 117 | 118 | # Finally put the locations and data back in their original order. 119 | reorder = use_cuda(spn.ReorderData(reverse=True)) 120 | vlocs, vdata = reorder(vidxs, vlocs, vdata) 121 | assert np.all(undo_cuda(vlocs).data.numpy() == olocs) 122 | assert np.all(undo_cuda(vdata).data.numpy() == odata) 123 | 124 | # Test gradients. 125 | def func(l, d, q): 126 | return coll(l, d, q)[:2] 127 | assert gradcheck(func, (locs, data, qlocs), eps=1e-2, atol=1e-3) 128 | 129 | 130 | 131 | if __name__ == '__main__': 132 | import argparse 133 | parser = argparse.ArgumentParser() 134 | parser.add_argument('--cpu', dest='cpu', action="store_true", default=True) 135 | parser.add_argument('--no-cpu', dest='cpu', action="store_false") 136 | parser.add_argument('--cuda', dest='cuda', action="store_true", default=True) 137 | parser.add_argument('--no-cuda', dest='cuda', action="store_false") 138 | args = parser.parse_args() 139 | test_particlecollision(cpu=args.cpu, cuda=args.cuda) --------------------------------------------------------------------------------
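For reference, the neighbor-search round trip exercised by test_particlecollision.py above can be summarized with the sketch below. The constants mirror the test; it assumes the package has been built and installed via setup.py:

    import torch
    import SmoothParticleNets as spn

    batch_size, n, ndim, nchannels = 2, 100, 2, 2
    locs = torch.rand(batch_size, n, ndim)
    data = torch.rand(batch_size, n, nchannels)

    # Hash-grid neighbor search: returns the particles reordered by grid cell,
    # the permutation that was applied (idxs), and a padded neighbor list.
    coll = spn.ParticleCollision(ndim, 0.2, max_collisions=n)
    plocs, pdata, idxs, neighbors = coll(locs, data, None)

    # ... layers such as ConvSP consume (plocs, pdata, neighbors) here ...

    # Undo the reordering so results line up with the original particle order
    # (the test above asserts exact equality after the reverse reorder).
    reorder = spn.ReorderData(reverse=True)
    rlocs, rdata = reorder(idxs, plocs, pdata)
    assert torch.equal(rlocs, locs) and torch.equal(rdata, data)

As with the other tests, the script can also be run directly, e.g. python tests/test_particlecollision.py --no-cuda, once setup.py has generated tests/pytest_args.py.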