├── .gitignore
├── LICENSE
├── README.md
├── REQUIREMENTS.txt
├── _config.yml
├── docs
│   ├── convsdf
│   │   ├── README.md
│   │   └── diagram.png
│   ├── convsp
│   │   ├── README.md
│   │   ├── conv_diagram.png
│   │   └── kernel_diagram.png
│   ├── imageprojection
│   │   └── README.md
│   ├── particlecollision
│   │   └── README.md
│   ├── particleprojection
│   │   └── README.md
│   └── reorderdata
│       └── README.md
├── examples
│   ├── convsp_example.py
│   ├── fluid_sim.py
│   └── tblogger.py
├── external
│   └── cub-1.3.2
│       └── cub
│           ├── block
│           │   ├── block_discontinuity.cuh
│           │   ├── block_exchange.cuh
│           │   ├── block_histogram.cuh
│           │   ├── block_load.cuh
│           │   ├── block_radix_rank.cuh
│           │   ├── block_radix_sort.cuh
│           │   ├── block_raking_layout.cuh
│           │   ├── block_reduce.cuh
│           │   ├── block_scan.cuh
│           │   ├── block_shift.cuh
│           │   ├── block_store.cuh
│           │   └── specializations
│           │       ├── block_histogram_atomic.cuh
│           │       ├── block_histogram_sort.cuh
│           │       ├── block_reduce_raking.cuh
│           │       ├── block_reduce_raking_commutative_only.cuh
│           │       ├── block_reduce_warp_reductions.cuh
│           │       ├── block_scan_raking.cuh
│           │       └── block_scan_warp_scans.cuh
│           ├── block_range
│           │   ├── block_range_histo.cuh
│           │   ├── block_range_radix_sort_downsweep.cuh
│           │   ├── block_range_radix_sort_upsweep.cuh
│           │   ├── block_range_reduce.cuh
│           │   ├── block_range_reduce_by_key.cuh
│           │   ├── block_range_scan.cuh
│           │   ├── block_range_select.cuh
│           │   ├── block_scan_prefix_operators.cuh
│           │   └── specializations
│           │       ├── block_range_histo_gatomic.cuh
│           │       ├── block_range_histo_satomic.cuh
│           │       └── block_range_histo_sort.cuh
│           ├── cub.cuh
│           ├── device
│           │   ├── device_histogram.cuh
│           │   ├── device_partition.cuh
│           │   ├── device_radix_sort.cuh
│           │   ├── device_reduce.cuh
│           │   ├── device_scan.cuh
│           │   ├── device_select.cuh
│           │   └── dispatch
│           │       ├── device_histogram_dispatch.cuh
│           │       ├── device_radix_sort_dispatch.cuh
│           │       ├── device_reduce_by_key_dispatch.cuh
│           │       ├── device_reduce_dispatch.cuh
│           │       ├── device_scan_dispatch.cuh
│           │       └── device_select_dispatch.cuh
│           ├── grid
│           │   ├── grid_barrier.cuh
│           │   ├── grid_even_share.cuh
│           │   ├── grid_mapping.cuh
│           │   └── grid_queue.cuh
│           ├── host
│           │   └── spinlock.cuh
│           ├── iterator
│           │   ├── arg_index_input_iterator.cuh
│           │   ├── cache_modified_input_iterator.cuh
│           │   ├── cache_modified_output_iterator.cuh
│           │   ├── constant_input_iterator.cuh
│           │   ├── counting_input_iterator.cuh
│           │   ├── tex_obj_input_iterator.cuh
│           │   ├── tex_ref_input_iterator.cuh
│           │   └── transform_input_iterator.cuh
│           ├── thread
│           │   ├── thread_load.cuh
│           │   ├── thread_operators.cuh
│           │   ├── thread_reduce.cuh
│           │   ├── thread_scan.cuh
│           │   └── thread_store.cuh
│           ├── util_allocator.cuh
│           ├── util_arch.cuh
│           ├── util_debug.cuh
│           ├── util_device.cuh
│           ├── util_macro.cuh
│           ├── util_namespace.cuh
│           ├── util_ptx.cuh
│           ├── util_type.cuh
│           └── warp
│               ├── specializations
│               │   ├── warp_reduce_shfl.cuh
│               │   ├── warp_reduce_smem.cuh
│               │   ├── warp_scan_shfl.cuh
│               │   └── warp_scan_smem.cuh
│               ├── warp_reduce.cuh
│               └── warp_scan.cuh
├── python
│   └── SmoothParticleNets
│       ├── ImageProjection.py
│       ├── ParticleCollision.py
│       ├── ParticleProjection.py
│       ├── __init__.py
│       ├── convsdf.py
│       ├── convsp.py
│       ├── error_checking.py
│       └── kernels.py
├── setup.py
├── src
│   ├── common_funcs.h
│   ├── constants.h
│   ├── cpu_layer_funcs.cpp
│   ├── cuda_layer_funcs.cpp
│   ├── gpu_kernels.cu
│   └── gpu_kernels.h
└── tests
    ├── gradcheck.py
    ├── regular_grid_interpolater.py
    ├── test_convsdf.py
    ├── test_convsp.py
    ├── test_imageprojection.py
    ├── test_particlecollision.py
    └── test_particleprojection.py
/.gitignore:
--------------------------------------------------------------------------------
1 | lib/gpu_kernels.cu.o
2 | test/__pycache__/test_f_grid.cpython-27-PYTEST.pyc
3 | test/__pycache__/test_particles2grid.cpython-27-PYTEST.pyc
4 | python/SmoothParticleNets/_ext/_ext.so
5 | *.pyc
6 | python/SmoothParticleNets/_ext/__ext.so
7 | .cache/
8 | test/.cache/
9 | test/pytest_args.py
10 | ._timings_n2_shared.csv
11 | src/kernel_constants.h
12 | build
13 | *.so
14 | *.egg-info
15 | tests/pytest_args.py
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 cschenck
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SmoothParticleNets
2 |
3 | Smooth Particle Networks (SmoothParticleNets or SPNets) is a set of custom PyTorch layers to facilitate computation with unordered particle sets.
4 | They were created for the purpose of enabling particle-based fluid dynamics inside a deep network, but the layers can be used for other purposes.
5 | Broadly, the layers enable computing particle-particle interactions, particle-object interactions, and projections onto and out of a camera image.
6 | The interface to this library is in Python.
7 | This library contains 6 layers, listed below.
8 | Note that this library provides only the basic functionality and no additional utilities; e.g., it does not include a particle visualizer or a tool for processing 3D object mesh files into signed distance fields.
9 |
10 | ## Layers
11 |
12 | Below is the list of each layer contained in this library.
13 | Clicking on the layer's name will take you to a description of what that layer does and how to use it.
14 |
15 | * [ConvSP](https://cschenck.github.io/SmoothParticleNets/docs/convsp)
16 | * [ConvSDF](https://cschenck.github.io/SmoothParticleNets/docs/convsdf)
17 | * [ImageProjection](https://cschenck.github.io/SmoothParticleNets/docs/imageprojection)
18 | * [ParticleProjection](https://cschenck.github.io/SmoothParticleNets/docs/particleprojection)
19 | * [ParticleCollision](https://cschenck.github.io/SmoothParticleNets/docs/particlecollision)
20 | * [ReorderData](https://cschenck.github.io/SmoothParticleNets/docs/reorderdata)
21 |
22 | ## Requirements
23 |
24 | This library only requires PyTorch as a dependency.
25 | The current version of the library has been tested to work with PyTorch 0.4.1.
26 | Furthermore, this library only supports Python 3, and does not support Python 2.
27 |
28 | Note that this library was developed only under Linux and may not run on other platforms without modification.
29 | Specifically, this library is confirmed to work on Ubuntu 18.04 with PyTorch 0.4.1, CUDA 10.0, and the 410 NVIDIA drivers (although the driver version should not matter).
30 |
31 | ## Installation
32 |
33 | To install this library, download the source from github.
34 | Once downloaded, enter the root directory of the source and run
35 | ```bash
36 | sudo python3 setup.py install
37 | ```
38 |
39 | Once installed, you should be able to run 'import SmoothParticleNets' in Python to import the library.
40 |
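A quick way to verify the installation is to import the package and check that the layers documented above are exposed (this assumes the layers are exported at the package level, as the examples in the docs suggest):

```python
import SmoothParticleNets as spn

# The six layers listed above should all be accessible as package attributes.
print(spn.ConvSP, spn.ConvSDF, spn.ImageProjection,
      spn.ParticleProjection, spn.ParticleCollision, spn.ReorderData)
```
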
41 | ## Citation
42 |
43 | In published works please cite this as
44 | > C. Schenck and D. Fox, "SPNets: Differentiable Fluid Dynamics for Deep Neural Networks," in *Proceedings of the Second Conference on Robot Learning (CoRL),* Zurich, Switzerland, 2018.
45 |
46 | ```bibtex
47 | @inproceedings{spnets2018,
48 | title={SPNets: Differentiable Fluid Dynamics for Deep Neural Networks},
49 | author={Schenck, C. and Fox, D.},
50 | booktitle={Proceedings of the Second Conference on Robot Learning (CoRL)},
51 | year={2018},
52 | address={Zurich, Switzerland}
53 | }
54 | ```
55 |
--------------------------------------------------------------------------------
/REQUIREMENTS.txt:
--------------------------------------------------------------------------------
1 | torch 0.4.1
2 | torchvision
3 | CUDA 10
4 | nvidia drivers 410
5 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
--------------------------------------------------------------------------------
/docs/convsdf/README.md:
--------------------------------------------------------------------------------
1 | # ConvSDF
2 |
3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets)
4 |
5 | ## Description
6 |
7 | The ConvSDF layer is the other primary layer in addition to the ConvSP layer.
8 | ConvSDF stands for Signed Distance Field Convolution.
9 | The purpose of this layer is to enable particle-object interactions.
10 | The particles are represented as a list of coordinate locations.
11 | The objects are represented as signed distance fields (SDFs).
12 | SDFs are functions that take in a point in space relative to the object and return the signed distance to the closest point on the surface of the object, where the sign indicates if the query point is inside the object (negative) or outside (positive).
13 | For ConvSDF, this function is represented as a lookup table in the form of a grid.
14 | ConvSDF accepts a grid with the SDF values for each grid cell filled in, then performs linear interpolation when looking up the SDF value for a specific point.
15 |
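As a point of reference, this lookup can be thought of as standard multilinear interpolation on the grid. The sketch below shows the 2D case in plain PyTorch; it is only an illustration (it assumes the query point lies inside the grid and that values are stored at cell corners), not the library's implementation.

```python
import torch

def sdf_lookup(sdf, p, cell_size):
    # sdf: HxW tensor of SDF values, p: length-2 query point in the grid frame
    # (origin at the grid's lower corner), cell_size: side length of one cell.
    gx, gy = p[0] / cell_size, p[1] / cell_size
    x0, y0 = int(gx), int(gy)                  # cell containing p (assumed in-bounds)
    dx, dy = float(gx) - x0, float(gy) - y0
    return ((1 - dx) * (1 - dy) * sdf[y0, x0] + dx * (1 - dy) * sdf[y0, x0 + 1] +
            (1 - dx) * dy * sdf[y0 + 1, x0] + dx * dy * sdf[y0 + 1, x0 + 1])
```
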
16 | ConvSDF works as follows.
17 | ConvSDF operates on sets of query locations, but for simplicity the following describes a single query location.
18 | For a given query point, ConvSDF places a convolutional kernel around that point's location in space.
19 | Then it looks up the SDF values at the center of each of the kernel cells.
20 | These values are then convolved with a set of weights in the same manner as a standard convolutional layer: each value is multiplied by a kernel weight and the results are summed.
21 | The following diagram illustrates this process.
22 |
23 | 
24 |
25 | The SDF field is shown as a heatmap, with the object boundary shown in black.
26 | The large red dot is the query location, with the smaller red dots showing the kernel cell centers.
27 | The output of ConvSDF is the convolved value for the given query location.
28 |
29 | The ConvSDF layer is given the pre-computed SDF grids; it does not compute grids from mesh files.
30 | That must be done externally.
31 | SmoothParticleNets does not include any tools to do this (although some can be found by searching online).
32 | This was done intentionally to reduce the dependencies that this library requires.
33 | Furthermore, for simplicity, ConvSDF assumes the origin of every SDF grid is the grid's bottom corner.
34 | When generating SDF grids, note whether the origin in the mesh file differs from the bottom corner of the grid, and update all poses to account for any difference.
35 | SDFs are not well-defined in 1D or in 4+ dimensions, so for now ConvSDF only supports 2D and 3D.
36 |
37 | One common use case for ConvSDF is to compute when particles are inside objects and how to move them out of those objects.
38 | This can be done by using ConvSDF to first compute which particles have a negative SDF value, and then by using another ConvSDF layer with fixed +1/-1 weights to compute numerical gradients.
39 | Multiplying the gradients by the distance yields the vector to move the particle by.
40 |
41 | ConvSDF is implemented as a subclass of torch.nn.Module.
42 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d).
43 | ConvSDF is implemented with gradients for the query locations and the object poses so that it can be used during a backward call.
44 | ConvSDF is implemented in native code with Cuda support, so it can be evaluated efficiently.
45 |
46 | ## Example
47 |
48 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches.
49 | ```python
50 | # Let's make a simple SDF grid.
51 | sdf = torch.Tensor([[0.7, 0.5, 0.5, 0.7], [0.5, -0.5, -0.5, 0.5], [-0.5, 0.5, 0.5, -0.5], [0.7, 0.5, 0.5, 0.7]])
52 | # Construct a ConvSDF layer with 5 kernels.
53 | conv = ConvSDF(sdfs=[sdf], sdf_sizes=[1.0], out_channels=5, ndim=2, kernel_size=1, dilation=0.1, max_distance=1.0, with_params=True, compute_pose_grads=True)
54 | # Convolve at the particle locations. Put the object at the origin with no rotation.
55 | new_data = conv(locs, torch.Tensor([[0]]*locs.shape[0]), torch.Tensor([[0.0, 0.0, 0.0]]*locs.shape[0]), torch.Tensor([[1.0]]*locs.shape[0]))
56 | ```
57 |
58 |
59 | ## Documentation
60 |
61 | ConvSDF provides three functions: a constructor, SetSDFs, and forward.
62 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer).
63 |
64 | * ### ConvSDF(sdfs, sdf_sizes, out_channels, ndim, kernel_size, dilation, max_distance, with_params=True, compute_pose_grads=False):
65 | * Arguments
66 | * **sdfs**[list of torch.Tensor]: The pre-computed SDF grids for every object that may be encountered. During the forward call, specific objects can be selected. When there are multiple objects in a scene, the SDFs are combined using the MIN operator (e.g., when evaluating each at a specific query location, the SDF with the smallest value is used). Each value in the grids should be the distance to the surface of the object and negative iff it is inside the object.
67 | * **sdf_sizes**[list of float]: The size of one side of a grid cell for each SDF. The grid cells are assumed to be hypercubes.
68 | * **out_channels**[int]: Similar to standard convolutions, this is the number of convolutional kernels to create. The output is then a feature vector for each query location. Unlike ConvSP, the input is not an arbitrary feature vector but an SDF, so there is no corresponding in_channels argument.
69 | * **ndim**[int]: The dimensionality of the coordinate space.
70 | * **kernel_size**[int or tuple]: The size of the kernel. If a tuple, then len(kernel_size) == ndim must be True. If an integer, the same size is used for each dimension. Kernel sizes must be odd.
71 | * **dilation**[float or tuple]: The size of a kernel cell. If a tuple, then len(dilation) == ndim must be True. If a float, then the same size is used for each dimension. Unlike standard convolutions, where the size of a kernel cell is fixed by the size of a grid cell (e.g., size of a pixel), the unordered particle sets do not provide that structure, so this size must be specified.
72 | * **max_distance**[float]: When looking up the SDF value in an SDF grid, if it is larger than this value, this value is used instead. This is useful when query locations may fall outside of the pre-computed SDF grids.
73 | * **with_params**[boolean]: (optional) If True (default), the parameters of the layer (weights and bias) will be instantiated as torch.nn.Parameters so that they are treated as parameters by PyTorch's built-in operators. If False, then they are added as torch.autograd.Variables and will not be modified by PyTorch directly. This can be useful if desiring fixed (non-trainable) parameters or for debugging.
74 | * **compute_pose_grads**[boolean]: (optional) If False, will not compute gradients with respect to the poses of the objects during backpropagation. This can speed up the backward pass when these gradients are not desired.
75 |
76 | * ### SetSDFs(sdfs, sdf_sizes):
77 | * Arguments
78 | * **sdfs**[list of torch.Tensor]: The pre-computed SDF grids for every object that may be encountered. During the forward call, specific objects can be selected. When there are multiple objects in a scene, the SDFs are combined using the MIN operator (e.g., when evaluating each at a specific query location, the SDF with the smallest value is used). Each value in the grids should be the distance to the surface of the object and negative iff it is inside the object.
79 | * **sdf_sizes**[list of float]: The size of one side of a grid cell for each SDF. The grid cells are assumed to be hypercubes.
80 |
81 | * ### forward(locs, idxs, poses, scales):
82 | * Arguments
83 | * **locs**[BxNxD torch.autograd.Variable]: The batched list of query locations. D must match the ndim argument to the constructor.
84 | * **idxs**[BxM torch.autograd.Variable]: The indices of the objects to use, where M is the number of objects in the scene. The indices index into the sdfs passed into the constructor. Not every element in the batch must have M objects. Any element that has fewer than M objects may simply set the unused indices to -1.
85 | * **poses**[BxMxDD torch.autograd.Variable]: The pose of each object in the scene. The first D values are the translation, and the remaining values are the rotation. For 2D, the rotation is a single angle. For 3D, the rotation is a quaternion in xyzw format. Only 2D and 3D are supported. The origin of each object is the lower corner of its SDF grid.
86 | * **scales**[BxM torch.autograd.Variable]: The scale for each object, where 0.5 shrinks the object by half and 2.0 doubles the size of the object.
87 | * Returns
88 | * **new_data**[BxNxG torch.autograd.Variable]: The result of the convolutions. G is the out_channels argument passed to the constructor. This is a new feature vector for each of the N query locations.
89 |
90 |
91 |
--------------------------------------------------------------------------------
/docs/convsdf/diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cschenck/SmoothParticleNets/1bfde9bd6ce00dcb8750a48f49ce03f4400fb8cc/docs/convsdf/diagram.png
--------------------------------------------------------------------------------
/docs/convsp/README.md:
--------------------------------------------------------------------------------
1 | # ConvSP
2 |
3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets)
4 |
5 | ## Description
6 |
7 | The ConvSP layer is the main workhorse layer of SmoothParticleNets.
8 | ConvSP stands for Smooth Particle Convolution.
9 | The ConvSP layer operates on unordered particle sets.
10 | Each particle has a feature vector associated with it, and the ConvSP performs a convolution on these features, similar to how a Conv2D layer performs a convolution on the channels of a feature image.
11 | However, unlike in a standard convolution on a grid, the features associated with each particle here create a continuous vector field across space.
12 |
13 | More formally, a set of particles represents a continuous vector field in space.
14 | That is, at every point in space it is possible to evaluate the features represented by the particle set.
15 | This is illustrated in the following diagram and equation
16 |
17 | 
18 |
19 | Given an arbitrary query location (the red dot), the features of each nearby particle (x_j) are averaged together, weighted based on their distance to the query point using a kernel function W.
20 |
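The sketch below spells out that smoothing-kernel average for a single query location in plain PyTorch. The kernel function shown is just an example; the kernels actually provided by the library are listed in kernels.py.

```python
import torch

def field_at(q, locs, data, radius):
    # q: (D,) query location, locs: (N, D) particle positions,
    # data: (N, K) per-particle features, radius: kernel support radius.
    d = (locs - q).norm(dim=1)                       # distances |q - x_j|
    w = torch.clamp(1.0 - d / radius, min=0.0) ** 3  # example kernel W, zero beyond radius
    return (w.unsqueeze(1) * data).sum(dim=0)        # distance-weighted sum of features
```
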
21 | This is then used to perform convolutions.
22 | Unlike in the standard convolution, here there isn't a well-defined grid to convolve on.
23 | Instead, the ConvSP layer convolves in free space.
24 | This is illustrated in the following diagram.
25 |
26 | 
27 |
28 | In the above 2D case, the kernel used is 3x3.
29 | Given a query location (the large red dot), the kernel is placed on top of that location.
30 | Then the above field lookup equation is used to evaluate the continuous vector field at the center of each kernel cell (small red dots).
31 | The resulting values are then multiplied by kernel weights and summed in the same manner as a standard convolution.
32 | The key difference between ConvSP and a standard convolution is the use of the smoothing kernel average above to allow evaluating the feature field at any arbitrary point in space.
33 |
34 |
35 | ConvSP is implemented as a subclass of torch.nn.Module.
36 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d).
37 | ConvSP is implemented with gradients so that it can be used during a backward call.
38 | ConvSP is implemented in native code with Cuda support, so it can be evaluated efficiently.
39 |
40 | ## Example
41 |
42 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches and *data* is a BxNxK tensor containing a feature vector for each particle.
43 | ```python
44 | # Create a ConvSP layer with 5 output channels, 3 size kernel with dilation of 0.05, and a radius of 0.1.
45 | conv = ConvSP(in_channels=data.shape[2], out_channels=5, ndim=locs.shape[2], kernel_size=3, dilation=0.05, radius=0.1, dis_norm=False, with_params=True, kernel_fn='spiky')
46 | # The ConvSP layer requires a ParticleCollision layer to generate the neighbor list. The radius of the neighbor list should be the maximum distance a neighbor of any kernel cell can be from the center of the kernel, which is radius + (kernel_size//2)*dilation.
47 | coll = ParticleCollision(ndim=locs.shape[2], radius=(0.1 + 0.05))
48 | # ParticleCollision reorders locs and data.
49 | locs, data, idxs, neighbors = coll(locs, data)
50 | # Get the new features. We'll use the particle locations as the query locations, so we won't be passing anything for qlocs.
51 | new_data = conv(locs, data, neighbors)
52 | # new_data is still reordered according to the reordered locs, but we might want them in the original order.
53 | reorder = ReorderData(reverse=True)
54 | locs, new_data = reorder(idxs, locs, new_data)
55 | ```
56 |
57 |
58 | ## Documentation
59 |
60 | ConvSP provides two functions: a constructor and forward.
61 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer).
62 |
63 | * ### ConvSP(in_channels, out_channels, ndim, kernel_size, dilation, radius, dis_norm=False, kernel_fn='default', with_params=True):
64 | * Arguments
65 | * **in_channels**[int]: The dimensionality of the feature vectors associated with each particle.
66 | * **out_channels**[int]: Similar to standard convolutions, this is the number of convolutional kernels to create. The output is then a feature vector for each query location.
67 | * **ndim**[int]: The dimensionality of the particle's coordinate space.
68 | * **kernel_size**[int or tuple]: The size of the kernel. If a tuple, then len(kernel_size) == ndim must be True. If an integer, the same size is used for each dimension. Kernel sizes must be odd.
69 | * **dilation**[float or tuple]: The size of a kernel cell. If a tuple, then len(dilation) == ndim must be True. If a float, then the same size is used for each dimension. Unlike standard convolutions, where the size of a kernel cell is fixed by the size of a grid cell (e.g., size of a pixel), the unordered particle sets do not provide that structure, so this size must be specified.
70 | * **radius**[float]: The radius to use when computing the smoothing kernel average. Only particles within this distance of the query location are used in the average.
71 | * **dis_norm**[boolean]: (optional) If true, the features in the smoothing kernel average will be divided by the distance from the query location to the particle. This normalization can be useful for some computations.
72 | * **kernel_fn**[string]: (optional) The kernel function to use in the smoothing kernel average. SmoothParticleNets provides many options for the kernel. Refer to kernels.py for a complete list.
73 | * **with_params**[boolean]: (optional) If True (default), the parameters of the layer (weights and bias) will be instantiated as torch.nn.Parameters so that they are treated as parameters by PyTorch's built-in operators. If False, then they are added as torch.autograd.Variables and will not be modified by PyTorch directly. This can be useful if desiring fixed (non-trainable) parameters or for debugging.
74 |
75 | * ### forward(locs, data, neighbors, qlocs=None):
76 | * Arguments
77 | * **locs**[BxNxD torch.autograd.Variable]: The batched list of particle locations. D must match the ndim argument to the constructor.
78 | * **data**[BxNxK torch.autograd.Variable]: The feature vectors associated with each particle. K must be the same as the in_channels argument to the constructor.
79 | * **neighbors**[BxMxF torch.autograd.Variable]: The pre-computed neighbor list for each query location, where F is the maximum number of neighbors (the max_collisions value used by ParticleCollision). This can be generated using the ParticleCollision layer. This is necessary for evaluating the kernel smoothing average.
80 | * **qlocs**[BxMxD torch.autograd.Variable]: (optional) The set of locations to perform convolutions around. Usually this will be the same as the particle locations, but not always. If this argument is not provided, locs is used.
81 | * Returns
82 | * **new_data**[BxMxG torch.autograd.Variable]: The result of the convolutions. G is the out_channels argument passed to the constructor. This is a new feature vector for each of the query locations.
83 |
84 |
--------------------------------------------------------------------------------
/docs/convsp/conv_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cschenck/SmoothParticleNets/1bfde9bd6ce00dcb8750a48f49ce03f4400fb8cc/docs/convsp/conv_diagram.png
--------------------------------------------------------------------------------
/docs/convsp/kernel_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cschenck/SmoothParticleNets/1bfde9bd6ce00dcb8750a48f49ce03f4400fb8cc/docs/convsp/kernel_diagram.png
--------------------------------------------------------------------------------
/docs/imageprojection/README.md:
--------------------------------------------------------------------------------
1 | # ImageProjection
2 |
3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets)
4 |
5 | ## Description
6 |
7 | The ImageProjection layer projects an image feature map onto a set of particles in the view frame of the camera.
8 | That is, given an image of C channels, it first projects each particle onto the image using given camera intrinsics (focal length, etc.) and extrinsics (pose).
9 | Then it uses bilinear interpolation between the 4 adjacent pixels to generate a feature vector for the given particle.
10 | The output is a C-length feature vector for each particle.
11 | The ImageProjection layer currently only supports 3D coordinate spaces.
12 |
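For intuition, the sketch below shows what the projection does for a single particle that is already expressed in the camera frame: a pinhole projection to pixel coordinates followed by bilinear interpolation of the four surrounding pixels. It assumes a centered principal point and skips bounds checking, so it is an illustration rather than the library's implementation.

```python
import torch

def project_feature(p, image, camera_fl):
    # p: (3,) particle position in the camera frame, image: (H, W, C) feature image.
    H, W, C = image.shape
    u = camera_fl * p[0] / p[2] + W / 2.0  # pinhole projection to pixel coordinates
    v = camera_fl * p[1] / p[2] + H / 2.0
    x0, y0 = int(u), int(v)
    dx, dy = float(u) - x0, float(v) - y0
    return ((1 - dx) * (1 - dy) * image[y0, x0] + dx * (1 - dy) * image[y0, x0 + 1] +
            (1 - dx) * dy * image[y0 + 1, x0] + dx * dy * image[y0 + 1, x0 + 1])
```
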
13 | ImageProjection is implemented as a subclass of torch.nn.Module.
14 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d).
15 | ImageProjection can compute gradients with respect to the camera or particle poses and the image features, and is implemented with Cuda support for efficient computation.
16 |
17 | ## Example
18 |
19 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches and *image* is a BxHxWxC feature image.
20 | ```python
21 | # First create the ImageProjection layer.
22 | proj = ImageProjection(camera_fl=540)
23 | # Setup the camera pose.
24 | camera_pose = torch.Tensor([0.0, 0.0, 0.0])
25 | camera_rotation = torch.Tensor([0.0, 0.0, 0.0, 1.0])
26 | new_data = proj(locs, image, camera_pose, camera_rotation)
27 | ```
28 |
29 |
30 | ## Documentation
31 |
32 | ImageProjection provides two functions: a constructor and forward.
33 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer).
34 |
35 | * ### ImageProjection(camera_fl):
36 | * Arguments
37 | * **camera_fl**[float]: The focal length of the camera.
38 |
39 | * ### forward(locs, image, camera_pose, camera_rot, depth_mask=None):
40 | * Arguments
41 | * **locs**[BxNx3 torch.autograd.Variable]: The batched list of particle locations. Only 3D particle locations are supported.
42 | * **image**[BxHxWxC torch.autograd.Variable]: The image to project onto the particles. H and W are the height and width, respectively, and C is the number of channels.
43 | * **camera_pose**[Bx3 torch.autograd.Variable]: The camera translation in the environment.
44 | * **camera_rot**[Bx4 torch.autograd.Variable]: The camera rotation in the environment, represented as a quaternion in xyzw format.
45 | * **depth_mask**[BxHxW torch.autograd.Variable]: (optional) If passed, this is used to mask particles that are obscured by obstructions in the environment. If the depth of a pixel is less than the depth of the particle, nothing is projected onto that particle.
46 | * Returns
47 | * **new_data**[BxNxC torch.autograd.Variable]: The set of features for each particle after projecting the image features onto them.
48 |
49 |
50 |
--------------------------------------------------------------------------------
/docs/particlecollision/README.md:
--------------------------------------------------------------------------------
1 | # ParticleCollision
2 |
3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets)
4 |
5 | ## Description
6 |
7 | The ParticleCollision layer pre-computes neighbor lists (i.e., "colliding" particles) for each given particle.
8 | That is, given a list of particle positions and a fixed radius, this layer returns a short list for each particle with the indices of all other particles that are within that radius of it.
9 | To do this, internally the ParticleCollision layer creates a hashgrid and performs lookups based on that grid.
10 | The resulting neighbor list is designed to be used by the ConvSP layer to compute particle-particle interactions.
11 |
12 | An important operation that this layer does alongside computing collisions is to reorder the particle list.
13 | The reordering places particles falling in the same grid cell in the hash grid next to each other in memory.
14 | By doing so, cache hits are increased dramatically during the computation of particle-particle interactions in ConvSP, resulting in a large speedup.
15 | Due to this reordering, the returned list of colliding neighbor indices are indices in the *reordered* list, not in the original.
16 | The standard use of this layer is to compute collisions, make as many calls to ConvSP as are desired, then use the ReorderData layer to return the particle list to its original order.
17 | It is important to emphasize that reordering the data according to the hash grid is critical for performance of the ConvSP layer.
18 |
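To make the output format concrete, the following brute-force sketch computes an equivalent neighbor list in plain PyTorch, ignoring the hash grid, the reordering, and all of the performance concerns above.

```python
import torch

def brute_force_neighbors(locs, radius, max_collisions):
    # locs: BxNxD tensor of particle positions.
    # Returns a BxNxmax_collisions long tensor of neighbor indices, padded with -1.
    B, N, _ = locs.shape
    neighbors = torch.full((B, N, max_collisions), -1, dtype=torch.long)
    dists = (locs.unsqueeze(2) - locs.unsqueeze(1)).norm(dim=-1)  # BxNxN pairwise distances
    for b in range(B):
        for i in range(N):
            # A particle counts as its own neighbor here, matching include_self=True.
            idx = (dists[b, i] <= radius).nonzero().view(-1)[:max_collisions]
            neighbors[b, i, :idx.numel()] = idx
    return neighbors
```
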
19 | ParticleCollision is implemented as a subclass of torch.nn.Module.
20 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d).
21 | There are no gradients to compute for this layer, so it simply passes them through when calling backward.
22 |
23 | ## Example
24 |
25 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches and *vel* is a same-size tensor containing the particles' velocities.
26 | ```python
27 | coll = ParticleCollision(ndim, radius)
28 | # ParticleCollision reorders locs and vel.
29 | locs, vel, idxs, neighbors = coll(locs, vel)
30 | ```
31 |
32 |
33 | ## Documentation
34 |
35 | ParticleCollision provides two functions: a constructor and forward.
36 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer).
37 |
38 | * ### ParticleCollision(ndim, radius, max_grid_dim=96, max_collisions=128, include_self=True):
39 | * Arguments
40 | * **ndim**[int]: The dimensionality of the particle's coordinate space.
41 | * **radius**[float]: The maximum distance a particle can be from another and still be colliding.
42 | * **max_grid_dim**[int]: (optional) The maximum size of the hash grid in any dimension. This is useful for limiting memory consumption in cases where the particles are very spread out relative to the collision radius. Particles that don't fall in the hash grid are placed in the cell closest to them.
43 | * **max_collisions**[int]: (optional) The maximum number of neighbors to return. The returned neighbor list for each particle will always be this length (although not necessarily entirely filled in), so selecting this parameter is a balance between memory consumption and ensuring all colliding particles are included.
44 | * **include_self**[boolean]: (optional) If True, the particle will be in its own list of neighbors. If False it will not be.
45 |
46 | * ### forward(locs, data=None, qlocs=None):
47 | * Arguments
48 | * **locs**[BxNxD torch.autograd.Variable]: The batched list of particle locations. D must match the ndim argument to the constructor.
49 | * **data**[BxNxK torch.autograd.Variable]: (optional) Additional data associated with each particle. This data is not used during the forward call, however since the locs are reordered, any data associated with each particle must also be reordered. Technically this could also be accomplished instead by calling the ReorderData layer on the data after calling forward, but doing so here helps to prevent bugs when calling ConvSP with reordered locs but non-reordered data.
50 | * **qlocs**[BxMxD torch.autograd.Variable]: (optional) In the case where it is desired to compute collisions between two different particle sets, this is the second set. Rather than returning the neighbor list for particles in locs, if this argument is passed, the returned neighbor list is a list for each particle in qlocs of the indices of particles in locs (after reordering) that it collides with.
51 | * Returns
52 | * **locs**[BxNxD torch.autograd.Variable]: The reordered list of particle positions.
53 | * **data**[BxNxK torch.autograd.Variable]: (optional) If data was passed as an input, then the reordered data is returned.
54 | * **idxs**[BxN torch.autograd.Variable]: The index list for the reordered particle list. Each value gives that particle's original index in the original locs, i.e., idxs[b, i] = j, where i is the new index of the particle after reordering and j is its original index (b being the batch).
55 | * **neighbors**[Bx(N/M)xC torch.autograd.Variable]: The neighbor list for each particle. If qlocs was passed as an argument, then it is the neighbors of each particle in qlocs instead of locs. Each value indicates the index in locs (after reordering) of the neighboring particle. C is the value of max_collisions as passed to the constructor. Note that not all particles will have max_collisions neighbors. In that event, the values in each particle's list are filled sequentially, with unfilled values in the list being set to -1.
56 |
--------------------------------------------------------------------------------
/docs/particleprojection/README.md:
--------------------------------------------------------------------------------
1 | # ParticleProjection
2 |
3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets)
4 |
5 | ## Description
6 |
7 | The ParticleProjection layer is designed to allow comparison of the particle state with a camera image.
8 | It does this by projecting the particles onto a virtual camera image, which can then be compared to other camera images as desired.
9 | Each particle is projected onto the virtual image as a small Gaussian, which allows for smooth gradients with respect to the particle positions or camera pose.
10 | The layer computes the image coordinate of a given particle location using the pinhole camera model, not taking into account any distortions, e.g., radial distortion.
11 | ParticleProjection currently only supports 3D particle locations.
12 |
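The sketch below shows, for a single particle whose pinhole image coordinates (u, v) have already been computed, how a truncated Gaussian of the kind described above could be accumulated into the image. It is an illustration under those assumptions, not the library's code.

```python
import math
import torch

def splat(image, u, v, filter_std, filter_scale):
    # image: (H, W) tensor to accumulate into, (u, v): particle's pixel coordinates.
    H, W = image.shape
    r = int(2 * filter_std)  # the Gaussian is truncated at 2 standard deviations
    for y in range(max(0, int(v) - r), min(H, int(v) + r + 1)):
        for x in range(max(0, int(u) - r), min(W, int(u) + r + 1)):
            d2 = (x - u) ** 2 + (y - v) ** 2
            image[y, x] += filter_scale * math.exp(-d2 / (2.0 * filter_std ** 2))
    return image
```
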
13 | ParticleProjection is implemented as a subclass of torch.nn.Module.
14 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d).
15 | ParticleProjection can compute gradients with respect to the camera or particle poses, and is implemented with Cuda support for efficient computation.
16 |
17 | ## Example
18 |
19 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches.
20 | ```python
21 | # First create the ParticleProjection layer.
22 | proj = ParticleProjection(camera_fl=540, camera_size=(480, 640), filter_std=5.0, filter_scale=10.0)
23 | # Setup the camera pose.
24 | camera_pose = torch.Tensor([0.0, 0.0, 0.0])
25 | camera_rotation = torch.Tensor([0.0, 0.0, 0.0, 1.0])
26 | image = proj(locs, camera_pose, camera_rotation)
27 | ```
28 |
29 |
30 | ## Documentation
31 |
32 | ParticleProjection provides two functions: a constructor and forward.
33 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer).
34 |
35 | * ### ParticleProjection(camera_fl, camera_size, filter_std, filter_scale):
36 | * Arguments
37 | * **camera_fl**[float]: The focal length of the camera.
38 | * **camera_size**[tuple]: A tuple of the camera image height and width (in that order) in pixels.
39 | * **filter_std**[float]: The standard deviation (in pixels) of the Gaussian for each particle. The Gaussian is added to all pixels within 2 standard deviations of the particle's image coordinate.
40 | * **filter_scale**[float]: All values added to a pixel will be multiplied by this to allow control of the intensity of the Gaussians for each particle. This is equivalent to multiplying the output image by this value after the fact.
41 |
42 | * ### forward(locs, camera_pose, camera_rot, depth_mask=None):
43 | * Arguments
44 | * **locs**[BxNx3 torch.autograd.Variable]: The batched list of particle locations. Only 3D particle locations are supported.
45 | * **camera_pose**[Bx3 torch.autograd.Variable]: The camera translation in the environment.
46 | * **camera_rot**[Bx4 torch.autograd.Variable]: The camera rotation in the environment, represented as a quaternion in xyzw format.
47 | * **depth_mask**[BxHxW torch.autograd.Variable]: (optional) If passed, this is used to mask particles that are obscured by obstructions in the environment. If the depth of a pixel is less than the depth of the particle, the particle's contribution to that pixel is not added. H and W must match the camera image height and width passed to the constructor.
48 | * Returns
49 | * **image**[BxHxW torch.autograd.Variable]: The projected image. Particles appear as small Gaussians, and where particles overlap the Gaussians are added together.
50 |
51 |
--------------------------------------------------------------------------------
/docs/reorderdata/README.md:
--------------------------------------------------------------------------------
1 | # ReorderData
2 |
3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets)
4 |
5 | ## Description
6 |
7 | The ReorderData layer is fairly simple.
8 | The layer reorders a given tensor based on a tensor containing the indices for the data in the first tensor.
9 | More formally, assume that DATA is a BxNxD tensor containing N D-dimensional data points (e.g., XYZ particle locations) over B batches.
10 | Let IDXS be a BxN tensor, where each IDXS[i, :] contains the numbers 0 to N-1 in some arbitrary order.
11 | This layer then returns DATA where the second dimension has been rearranged according to IDXS.
12 | This is equivalent to
13 | ```python
14 | DATA[i, :, :] = DATA[i, IDXS[i, :], :]
15 | ```
16 | in PyTorch syntax; however, this layer is specialized for this specific kind of indexing, resulting in a faster implementation.
17 | This layer is designed as a helper layer for the ParticleCollision layer.
18 |
19 | ReorderData is implemented as a subclass of torch.nn.Module.
20 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d).
21 | Additionally, this layer computes gradients, so it can be used in a backward pass.
22 |
23 | ## Example
24 |
25 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches and *vel* is a same size tensor containing the particles' velocities.
26 | ```python
27 | # ReorderData is most commonly used in conjunction with ParticleCollision.
28 | coll = ParticleCollision(ndim, radius)
29 | # Set reverse=True. ParticleCollision calls ReorderData internally, so we want to undo that reordering when we're done.
30 | reorder = ReorderData(reverse=True)
32 | # ParticleCollision reorders locs and vel.
32 | locs, vel, idxs, neighbors = coll(locs, vel)
33 | # Perform desired operations with locs, vel, neighbors...
34 | # When we're done, return locs and vel to their original order using ReorderData.
35 | locs, vel = reorder(idxs, locs, vel)
36 | ```
37 |
38 |
39 | ## Documentation
40 |
41 | ReorderData provides two functions: a constructor and forward.
42 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer).
43 |
44 | * ### ReorderData(reverse=True):
45 | * Arguments
46 | * **reverse**[boolean]: (optional) When False, behaves as normal, using the given indices to reorder the data. When True, this layer assumes that the given data was already reordered according to the given indices, and so reverses that process and returns the data to its original order.
47 |
48 | * ### forward(idxs, locs, data=None):
49 | * Arguments
50 | * **idxs**[BxN torch.autograd.Variable]: The list of indices to reorder the input by.
51 | * **locs**[BxNxD torch.autograd.Variable]: The main data to be reordered. It is called *locs* because ReorderData is primarily a helper for ParticleCollision, which reorders the locations of the particles.
52 | * **data**[BxNxK torch.autograd.Variable]: (optional) Additional data to reorder alongside locs. Calling forward with both locs and data is equivalent to calling it twice in a row with each individually. This argument is provided as a convenience.
53 | * Returns
54 | * **locs**[BxNxD torch.autograd.Variable]: A new tensor with the same values as in the locs argument reordered based on idxs.
55 | * **data**[BxNxK torch.autograd.Variable]: (optional) If the data argument is passed, then forward will return a pair of tensors, where the second has the same values as data but reordered according to idxs.
--------------------------------------------------------------------------------
/external/cub-1.3.2/cub/block/block_raking_layout.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
32 | */
33 |
34 |
35 | #pragma once
36 |
37 | #include "../util_macro.cuh"
38 | #include "../util_arch.cuh"
39 | #include "../util_namespace.cuh"
40 |
41 | /// Optional outer namespace(s)
42 | CUB_NS_PREFIX
43 |
44 | /// CUB namespace
45 | namespace cub {
46 |
47 | /**
48 | * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. 
49 | * \ingroup BlockModule
50 | *
51 | * \par Overview
52 | * This type facilitates a shared memory usage pattern where a block of CUDA
53 | * threads places elements into shared memory and then reduces the active
54 | * parallelism to one "raking" warp of threads for serially aggregating consecutive
55 | * sequences of shared items. Padding is inserted to eliminate bank conflicts
56 | * (for most data types).
57 | *
58 | * \tparam T The data type to be exchanged.
59 | * \tparam BLOCK_THREADS The thread block size in threads.
60 | * \tparam PTX_ARCH [optional] \ptxversion
61 | */
62 | template <
63 | typename T,
64 | int BLOCK_THREADS,
65 | int PTX_ARCH = CUB_PTX_ARCH>
66 | struct BlockRakingLayout
67 | {
68 | //---------------------------------------------------------------------
69 | // Constants and type definitions
70 | //---------------------------------------------------------------------
71 |
72 | enum
73 | {
74 | /// The total number of elements that need to be cooperatively reduced
75 | SHARED_ELEMENTS = BLOCK_THREADS,
76 |
77 | /// Maximum number of warp-synchronous raking threads
78 | MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),
79 |
80 | /// Number of raking elements per warp-synchronous raking thread (rounded up)
81 | SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
82 |
83 | /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
84 | RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
85 |
86 | /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
87 | HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),
88 |
89 | /// Degree of bank conflicts (e.g., 4-way)
90 | CONFLICT_DEGREE = (HAS_CONFLICTS) ?
91 | (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) :
92 | 1,
93 |
94 | /// Pad each segment length with one element if degree of bank conflicts is greater than 4-way (heuristic)
95 | SEGMENT_PADDING = (CONFLICT_DEGREE > CUB_PREFER_CONFLICT_OVER_PADDING(PTX_ARCH)) ? 1 : 0,
96 | // SEGMENT_PADDING = (HAS_CONFLICTS) ? 1 : 0,
97 |
98 | /// Total number of elements in the raking grid
99 | GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + SEGMENT_PADDING),
100 |
101 | /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads)
102 | UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0),
103 | };
104 |
105 |
106 | /**
107 | * \brief Shared memory storage type
108 | */
109 | typedef T _TempStorage[BlockRakingLayout::GRID_ELEMENTS];
110 |
111 | /// Alias wrapper allowing storage to be unioned
112 | struct TempStorage : Uninitialized<_TempStorage> {};
113 |
114 |
115 | /**
116 | * \brief Returns the location for the calling thread to place data into the grid
117 | */
118 | static __device__ __forceinline__ T* PlacementPtr(
119 | TempStorage &temp_storage,
120 | int linear_tid)
121 | {
122 | // Offset for partial
123 | unsigned int offset = linear_tid;
124 |
125 | // Add in one padding element for every segment
126 | if (SEGMENT_PADDING > 0)
127 | {
128 | offset += offset / SEGMENT_LENGTH;
129 | }
130 |
131 | // Incorporating a block of padding partials every shared memory segment
132 | return temp_storage.Alias() + offset;
133 | }
134 |
135 |
136 | /**
137 | * \brief Returns the location for the calling thread to begin sequential raking
138 | */
139 | static __device__ __forceinline__ T* RakingPtr(
140 | TempStorage &temp_storage,
141 | int linear_tid)
142 | {
143 | return temp_storage.Alias() + (linear_tid * (SEGMENT_LENGTH + SEGMENT_PADDING));
144 | }
145 | };
146 |
147 | } // CUB namespace
148 | CUB_NS_POSTFIX // Optional outer namespace(s)
149 |
150 |
--------------------------------------------------------------------------------
/external/cub-1.3.2/cub/block/specializations/block_histogram_atomic.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
32 | */
33 |
34 | #pragma once
35 |
36 | #include "../../util_namespace.cuh"
37 |
38 | /// Optional outer namespace(s)
39 | CUB_NS_PREFIX
40 |
41 | /// CUB namespace
42 | namespace cub {
43 |
44 |
45 | /**
46 | * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
47 | */
48 | template
49 | struct BlockHistogramAtomic
50 | {
51 | /// Shared memory storage layout type
52 | struct TempStorage {};
53 |
54 |
55 | /// Constructor
56 | __device__ __forceinline__ BlockHistogramAtomic(
57 | TempStorage &temp_storage)
58 | {}
59 |
60 |
61 | /// Composite data onto an existing histogram
62 | template <
63 | typename T,
64 | typename HistoCounter,
65 | int ITEMS_PER_THREAD>
66 | __device__ __forceinline__ void Composite(
67 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram
68 | HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram
69 | {
70 | // Update histogram
71 | #pragma unroll
72 | for (int i = 0; i < ITEMS_PER_THREAD; ++i)
73 | {
74 | atomicAdd(histogram + items[i], 1);
75 | }
76 | }
77 |
78 | };
79 |
80 | } // CUB namespace
81 | CUB_NS_POSTFIX // Optional outer namespace(s)
82 |
83 |
--------------------------------------------------------------------------------
/external/cub-1.3.2/cub/block/specializations/block_histogram_sort.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
32 | */
33 |
34 | #pragma once
35 |
36 | #include "../../block/block_radix_sort.cuh"
37 | #include "../../block/block_discontinuity.cuh"
38 | #include "../../util_ptx.cuh"
39 | #include "../../util_namespace.cuh"
40 |
41 | /// Optional outer namespace(s)
42 | CUB_NS_PREFIX
43 |
44 | /// CUB namespace
45 | namespace cub {
46 |
47 |
48 |
49 | /**
50 | * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
51 | */
52 | template <
53 | typename T, ///< Sample type
54 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension
55 | int ITEMS_PER_THREAD, ///< The number of samples per thread
56 | int BINS, ///< The number of bins into which histogram samples may fall
57 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension
58 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension
59 | int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective
60 | struct BlockHistogramSort
61 | {
62 | /// Constants
63 | enum
64 | {
65 | /// The thread block size in threads
66 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
67 | };
68 |
69 | // Parameterize BlockRadixSort type for our thread block
70 | typedef BlockRadixSort<
71 | T,
72 | BLOCK_DIM_X,
73 | ITEMS_PER_THREAD,
74 | NullType,
75 | 4,
76 | (PTX_ARCH >= 350) ? true : false,
77 | BLOCK_SCAN_WARP_SCANS,
78 | (PTX_ARCH >= 350) ? cudaSharedMemBankSizeEightByte : cudaSharedMemBankSizeFourByte,
79 | BLOCK_DIM_Y,
80 | BLOCK_DIM_Z,
81 | PTX_ARCH>
82 | BlockRadixSortT;
83 |
84 | // Parameterize BlockDiscontinuity type for our thread block
85 | typedef BlockDiscontinuity<
86 | T,
87 | BLOCK_DIM_X,
88 | BLOCK_DIM_Y,
89 | BLOCK_DIM_Z,
90 | PTX_ARCH>
91 | BlockDiscontinuityT;
92 |
93 | /// Shared memory
94 | union _TempStorage
95 | {
96 | // Storage for sorting bin values
97 | typename BlockRadixSortT::TempStorage sort;
98 |
99 | struct
100 | {
101 | // Storage for detecting discontinuities in the tile of sorted bin values
102 | typename BlockDiscontinuityT::TempStorage flag;
103 |
104 | // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
105 | unsigned int run_begin[BINS];
106 | unsigned int run_end[BINS];
107 | };
108 | };
109 |
110 |
111 | /// Alias wrapper allowing storage to be unioned
112 | struct TempStorage : Uninitialized<_TempStorage> {};
113 |
114 |
115 | // Thread fields
116 | _TempStorage &temp_storage;
117 | int linear_tid;
118 |
119 |
120 | /// Constructor
121 | __device__ __forceinline__ BlockHistogramSort(
122 | TempStorage &temp_storage)
123 | :
124 | temp_storage(temp_storage.Alias()),
125 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
126 | {}
127 |
128 |
129 | // Discontinuity functor
130 | struct DiscontinuityOp
131 | {
132 | // Reference to temp_storage
133 | _TempStorage &temp_storage;
134 |
135 | // Constructor
136 | __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
137 | temp_storage(temp_storage)
138 | {}
139 |
140 | // Discontinuity predicate
141 | __device__ __forceinline__ bool operator()(const T &a, const T &b, unsigned int b_index)
142 | {
143 | if (a != b)
144 | {
145 | // Note the begin/end offsets in shared storage
146 | temp_storage.run_begin[b] = b_index;
147 | temp_storage.run_end[a] = b_index;
148 |
149 | return true;
150 | }
151 | else
152 | {
153 | return false;
154 | }
155 | }
156 | };
157 |
158 |
159 | // Composite data onto an existing histogram
160 | template <
161 | typename HistoCounter>
162 | __device__ __forceinline__ void Composite(
163 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram
164 | HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram
165 | {
166 | enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
167 |
168 | // Sort bytes in blocked arrangement
169 | BlockRadixSortT(temp_storage.sort).Sort(items);
170 |
171 | __syncthreads();
172 |
173 | // Initialize the shared memory's run_begin and run_end for each bin
174 | int histo_offset = 0;
175 |
176 | #pragma unroll
177 | for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
178 | {
179 | temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
180 | temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
181 | }
182 | // Finish up with guarded initialization if necessary
183 | if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
184 | {
185 | temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
186 | temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
187 | }
188 |
189 | __syncthreads();
190 |
191 | int flags[ITEMS_PER_THREAD]; // unused
192 |
193 | // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile
194 | DiscontinuityOp flag_op(temp_storage);
195 | BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
196 |
197 | // Update begin for first item
198 | if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0;
199 |
200 | __syncthreads();
201 |
202 | // Composite into histogram
203 | histo_offset = 0;
204 |
205 | #pragma unroll
206 | for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
207 | {
208 | int thread_offset = histo_offset + linear_tid;
209 | HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
210 | histogram[thread_offset] += count;
211 | }
212 |
213 | // Finish up with guarded composition if necessary
214 | if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
215 | {
216 | int thread_offset = histo_offset + linear_tid;
217 | HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
218 | histogram[thread_offset] += count;
219 | }
220 | }
221 |
222 | };
223 |
224 | } // CUB namespace
225 | CUB_NS_POSTFIX // Optional outer namespace(s)
226 |
227 |
--------------------------------------------------------------------------------
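A minimal, hypothetical kernel sketch of how the Composite() method above might be driven directly (assumed setup: a 128-thread block, 4 samples per thread, 256 bins; in normal use this specialization is reached through the public cub::BlockHistogram front-end with the BLOCK_HISTO_SORT algorithm rather than instantiated by hand):

#include <cub/cub.cuh>

// Hypothetical sketch: one 128-thread block builds a 256-bin shared-memory histogram
// from 4 samples per thread, then copies it to global memory.
__global__ void BlockHistoSortSketch(const unsigned char *d_samples, unsigned int *d_histogram)
{
    typedef cub::BlockHistogramSort<unsigned char, 128, 4, 256, 1, 1, CUB_PTX_ARCH> BlockHistoT;
    __shared__ typename BlockHistoT::TempStorage temp_storage;
    __shared__ unsigned int smem_histo[256];

    // Composite() accumulates onto existing counts, so zero the bins first
    for (int bin = threadIdx.x; bin < 256; bin += 128)
        smem_histo[bin] = 0;
    __syncthreads();

    // Blocked arrangement: thread i owns samples [4*i, 4*i + 4)
    unsigned char items[4];
    for (int i = 0; i < 4; ++i)
        items[i] = d_samples[(threadIdx.x * 4) + i];

    BlockHistoT(temp_storage).Composite(items, smem_histo);
    __syncthreads();

    // Write the block-wide histogram back to global memory
    for (int bin = threadIdx.x; bin < 256; bin += 128)
        d_histogram[bin] = smem_histo[bin];
}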
/external/cub-1.3.2/cub/block/specializations/block_reduce_raking_commutative_only.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators.
32 | */
33 |
34 | #pragma once
35 |
36 | #include "block_reduce_raking.cuh"
37 | #include "../../warp/warp_reduce.cuh"
38 | #include "../../thread/thread_reduce.cuh"
39 | #include "../../util_ptx.cuh"
40 | #include "../../util_namespace.cuh"
41 |
42 | /// Optional outer namespace(s)
43 | CUB_NS_PREFIX
44 |
45 | /// CUB namespace
46 | namespace cub {
47 |
48 |
49 | /**
50 | * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size.
51 | */
52 | template <
53 | typename T, ///< Data type being reduced
54 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension
55 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension
56 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension
57 | int PTX_ARCH> ///< The PTX compute capability for which to specialize this collective
58 | struct BlockReduceRakingCommutativeOnly
59 | {
60 | /// Constants
61 | enum
62 | {
63 | /// The thread block size in threads
64 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
65 | };
66 |
67 | // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values
68 | typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> FallBack;
69 |
70 | /// Constants
71 | enum
72 | {
73 | /// Number of warp threads
74 | WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
75 |
76 | /// Whether or not to use fall-back
77 | USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)),
78 |
79 | /// Number of raking threads
80 | RAKING_THREADS = WARP_THREADS,
81 |
82 | /// Number of threads actually sharing items with the raking threads
83 | SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS),
84 |
85 | /// Number of raking elements per warp synchronous raking thread
86 | SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS,
87 | };
88 |
89 | /// WarpReduce utility type
90 | typedef WarpReduce<T, RAKING_THREADS, PTX_ARCH> WarpReduce;
91 |
92 | /// Layout type for padded thread block raking grid
93 | typedef BlockRakingLayout<T, SHARING_THREADS, PTX_ARCH> BlockRakingLayout;
94 |
95 | /// Shared memory storage layout type
96 | struct _TempStorage
97 | {
98 | union
99 | {
100 | struct
101 | {
102 | typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction
103 | typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid
104 | };
105 | typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block reduction
106 | };
107 | };
108 |
109 |
110 | /// Alias wrapper allowing storage to be unioned
111 | struct TempStorage : Uninitialized<_TempStorage> {};
112 |
113 |
114 | // Thread fields
115 | _TempStorage &temp_storage;
116 | int linear_tid;
117 |
118 |
119 | /// Constructor
120 | __device__ __forceinline__ BlockReduceRakingCommutativeOnly(
121 | TempStorage &temp_storage)
122 | :
123 | temp_storage(temp_storage.Alias()),
124 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
125 | {}
126 |
127 |
128 | /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.
129 | template <bool FULL_TILE>
130 | __device__ __forceinline__ T Sum(
131 | T partial, ///< [in] Calling thread's input partial reductions
132 | int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
133 | {
134 | if (USE_FALLBACK || !FULL_TILE)
135 | {
136 | return FallBack(temp_storage.fallback_storage).template Sum<FULL_TILE>(partial, num_valid);
137 | }
138 | else
139 | {
140 | // Place partial into shared memory grid
141 | if (linear_tid >= RAKING_THREADS)
142 | *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
143 |
144 | __syncthreads();
145 |
146 | // Reduce parallelism to one warp
147 | if (linear_tid < RAKING_THREADS)
148 | {
149 | // Raking reduction in grid
150 | T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
151 | partial = ThreadReduce<SEGMENT_LENGTH>(raking_segment, cub::Sum(), partial);
152 |
153 | // Warpscan
154 | partial = WarpReduce(temp_storage.warp_storage).Sum(partial);
155 | }
156 | }
157 |
158 | return partial;
159 | }
160 |
161 |
162 | /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.
163 | template <
164 | bool FULL_TILE,
165 | typename ReductionOp>
166 | __device__ __forceinline__ T Reduce(
167 | T partial, ///< [in] Calling thread's input partial reductions
168 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
169 | ReductionOp reduction_op) ///< [in] Binary reduction operator
170 | {
171 | if (USE_FALLBACK || !FULL_TILE)
172 | {
173 | return FallBack(temp_storage.fallback_storage).template Reduce<FULL_TILE>(partial, num_valid, reduction_op);
174 | }
175 | else
176 | {
177 | // Place partial into shared memory grid
178 | if (linear_tid >= RAKING_THREADS)
179 | *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
180 |
181 | __syncthreads();
182 |
183 | // Reduce parallelism to one warp
184 | if (linear_tid < RAKING_THREADS)
185 | {
186 | // Raking reduction in grid
187 | T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
188 | partial = ThreadReduce<SEGMENT_LENGTH>(raking_segment, reduction_op, partial);
189 |
190 | // Warpscan
191 | partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op);
192 | }
193 | }
194 |
195 | return partial;
196 | }
197 |
198 | };
199 |
200 | } // CUB namespace
201 | CUB_NS_POSTFIX // Optional outer namespace(s)
202 |
203 |
--------------------------------------------------------------------------------
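A hedged usage sketch of the reduction above (assuming a full tile of 256 valid partials per block; in practice this specialization is selected internally by the public cub::BlockReduce front-end):

#include <cub/cub.cuh>

// Hypothetical sketch: each 256-thread block sums one int per thread.
__global__ void BlockSumSketch(const int *d_in, int *d_block_sums)
{
    typedef cub::BlockReduceRakingCommutativeOnly<int, 256, 1, 1, CUB_PTX_ARCH> BlockReduceT;
    __shared__ typename BlockReduceT::TempStorage temp_storage;

    int partial = d_in[(blockIdx.x * 256) + threadIdx.x];

    // FULL_TILE = true: all 256 partials are valid
    int block_sum = BlockReduceT(temp_storage).Sum<true>(partial, 256);

    // The reduction result is only valid for thread 0
    if (threadIdx.x == 0)
        d_block_sums[blockIdx.x] = block_sum;
}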
/external/cub-1.3.2/cub/block_range/specializations/block_range_histo_gatomic.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * cub::BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram.
32 | */
33 |
34 | #pragma once
35 |
36 | #include <iterator>
37 |
38 | #include "../../util_type.cuh"
39 | #include "../../util_namespace.cuh"
40 |
41 | /// Optional outer namespace(s)
42 | CUB_NS_PREFIX
43 |
44 | /// CUB namespace
45 | namespace cub {
46 |
47 |
48 |
49 | /**
50 | * BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics
51 | */
52 | template <
53 | typename BlockRangeHistogramPolicy, ///< Tuning policy
54 | int BINS, ///< Number of histogram bins per channel
55 | int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
56 | int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
57 | typename InputIterator, ///< The input iterator type \iterator. Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
58 | typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin
59 | typename Offset> ///< Signed integer type for global offsets
60 | struct BlockRangeHistogramGlobalAtomic
61 | {
62 | //---------------------------------------------------------------------
63 | // Types and constants
64 | //---------------------------------------------------------------------
65 |
66 | // Sample type
67 | typedef typename std::iterator_traits<InputIterator>::value_type SampleT;
68 |
69 | // Constants
70 | enum
71 | {
72 | BLOCK_THREADS = BlockRangeHistogramPolicy::BLOCK_THREADS,
73 | ITEMS_PER_THREAD = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
74 | TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
75 | TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS,
76 | };
77 |
78 | // Shared memory type required by this thread block
79 | typedef NullType TempStorage;
80 |
81 |
82 | //---------------------------------------------------------------------
83 | // Per-thread fields
84 | //---------------------------------------------------------------------
85 |
86 | /// Reference to output histograms
87 | HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
88 |
89 | /// Input data to reduce
90 | InputIterator d_in;
91 |
92 |
93 | //---------------------------------------------------------------------
94 | // Interface
95 | //---------------------------------------------------------------------
96 |
97 | /**
98 | * Constructor
99 | */
100 | __device__ __forceinline__ BlockRangeHistogramGlobalAtomic(
101 | TempStorage &temp_storage, ///< Reference to temp_storage
102 | InputIterator d_in, ///< Input data to reduce
103 | HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms
104 | :
105 | d_in(d_in),
106 | d_out_histograms(d_out_histograms)
107 | {}
108 |
109 |
110 | /**
111 | * Process a single tile of input
112 | */
113 | template <bool FULL_TILE>
114 | __device__ __forceinline__ void ConsumeTile(
115 | Offset block_offset, ///< The offset of the tile to consume
116 | int valid_items = TILE_ITEMS) ///< The number of valid items in the tile
117 | {
118 | if (FULL_TILE)
119 | {
120 | // Full tile of samples to read and composite
121 | SampleT items[ITEMS_PER_THREAD][CHANNELS];
122 |
123 | #pragma unroll
124 | for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
125 | {
126 | #pragma unroll
127 | for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
128 | {
129 | if (CHANNEL < ACTIVE_CHANNELS)
130 | {
131 | items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
132 | }
133 | }
134 | }
135 |
136 | __threadfence_block();
137 |
138 | #pragma unroll
139 | for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
140 | {
141 | #pragma unroll
142 | for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
143 | {
144 | if (CHANNEL < ACTIVE_CHANNELS)
145 | {
146 | atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1);
147 | }
148 | }
149 | }
150 | }
151 | else
152 | {
153 | // Only a partially-full tile of samples to read and composite
154 | int bounds = valid_items - (threadIdx.x * CHANNELS);
155 |
156 | #pragma unroll
157 | for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
158 | {
159 | #pragma unroll
160 | for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
161 | {
162 | if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds))
163 | {
164 | SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
165 | atomicAdd(d_out_histograms[CHANNEL] + item, 1);
166 | }
167 | }
168 | }
169 |
170 | }
171 | }
172 |
173 |
174 | /**
175 | * Aggregate results into output
176 | */
177 | __device__ __forceinline__ void AggregateOutput()
178 | {}
179 | };
180 |
181 |
182 | } // CUB namespace
183 | CUB_NS_POSTFIX // Optional outer namespace(s)
184 |
185 |
--------------------------------------------------------------------------------
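The global-atomic strategy above boils down to one atomicAdd per sample per active channel. A stripped-down, hypothetical standalone kernel illustrating the same idea (single channel, 256 bins, zero-initialized output; not part of CUB's public interface):

// Hypothetical sketch of the global-atomic histogram strategy.
__global__ void GlobalAtomicHistoSketch(const unsigned char *d_samples,
                                        int num_samples,
                                        unsigned int *d_histogram) // 256 bins, zeroed beforehand
{
    // Grid-stride loop: each thread bumps the bin matching its sample with a global atomic
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < num_samples;
         i += gridDim.x * blockDim.x)
    {
        atomicAdd(d_histogram + d_samples[i], 1u);
    }
}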
/external/cub-1.3.2/cub/cub.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * CUB umbrella include file
32 | */
33 |
34 | #pragma once
35 |
36 |
37 | // Block
38 | #include "block/block_histogram.cuh"
39 | #include "block/block_discontinuity.cuh"
40 | #include "block/block_exchange.cuh"
41 | #include "block/block_load.cuh"
42 | #include "block/block_radix_rank.cuh"
43 | #include "block/block_radix_sort.cuh"
44 | #include "block/block_reduce.cuh"
45 | #include "block/block_scan.cuh"
46 | #include "block/block_store.cuh"
47 | #include "block/block_shift.cuh"
48 |
49 | // Device
50 | #include "device/device_histogram.cuh"
51 | #include "device/device_partition.cuh"
52 | #include "device/device_radix_sort.cuh"
53 | #include "device/device_reduce.cuh"
54 | #include "device/device_scan.cuh"
55 | #include "device/device_select.cuh"
56 |
57 | // Grid
58 | //#include "grid/grid_barrier.cuh"
59 | #include "grid/grid_even_share.cuh"
60 | #include "grid/grid_mapping.cuh"
61 | #include "grid/grid_queue.cuh"
62 |
63 | // Host
64 | #include "host/spinlock.cuh"
65 |
66 | // Thread
67 | #include "thread/thread_load.cuh"
68 | #include "thread/thread_operators.cuh"
69 | #include "thread/thread_reduce.cuh"
70 | #include "thread/thread_scan.cuh"
71 | #include "thread/thread_store.cuh"
72 |
73 | // Warp
74 | #include "warp/warp_reduce.cuh"
75 | #include "warp/warp_scan.cuh"
76 |
77 | // Iterator
78 | #include "iterator/arg_index_input_iterator.cuh"
79 | #include "iterator/cache_modified_input_iterator.cuh"
80 | #include "iterator/cache_modified_output_iterator.cuh"
81 | #include "iterator/constant_input_iterator.cuh"
82 | #include "iterator/counting_input_iterator.cuh"
83 | #include "iterator/tex_obj_input_iterator.cuh"
84 | #include "iterator/tex_ref_input_iterator.cuh"
85 | #include "iterator/transform_input_iterator.cuh"
86 |
87 | // Util
88 | #include "util_allocator.cuh"
89 | #include "util_arch.cuh"
90 | #include "util_debug.cuh"
91 | #include "util_device.cuh"
92 | #include "util_macro.cuh"
93 | #include "util_ptx.cuh"
94 | #include "util_type.cuh"
95 |
96 |
--------------------------------------------------------------------------------
/external/cub-1.3.2/cub/grid/grid_barrier.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid
32 | */
33 |
34 | #pragma once
35 |
36 | #include "../util_debug.cuh"
37 | #include "../util_namespace.cuh"
38 | #include "../thread/thread_load.cuh"
39 |
40 | /// Optional outer namespace(s)
41 | CUB_NS_PREFIX
42 |
43 | /// CUB namespace
44 | namespace cub {
45 |
46 |
47 | /**
48 | * \addtogroup GridModule
49 | * @{
50 | */
51 |
52 |
53 | /**
54 | * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid
55 | */
56 | class GridBarrier
57 | {
58 | protected :
59 |
60 | typedef unsigned int SyncFlag;
61 |
62 | // Counters in global device memory
63 | SyncFlag* d_sync;
64 |
65 | public:
66 |
67 | /**
68 | * Constructor
69 | */
70 | GridBarrier() : d_sync(NULL) {}
71 |
72 |
73 | /**
74 | * Synchronize
75 | */
76 | __device__ __forceinline__ void Sync() const
77 | {
78 | volatile SyncFlag *d_vol_sync = d_sync;
79 |
80 | // Threadfence and syncthreads to make sure global writes are visible before
81 | // thread-0 reports in with its sync counter
82 | __threadfence();
83 | __syncthreads();
84 |
85 | if (blockIdx.x == 0)
86 | {
87 | // Report in ourselves
88 | if (threadIdx.x == 0)
89 | {
90 | d_vol_sync[blockIdx.x] = 1;
91 | }
92 |
93 | __syncthreads();
94 |
95 | // Wait for everyone else to report in
96 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
97 | {
98 | while (ThreadLoad(d_sync + peer_block) == 0)
99 | {
100 | __threadfence_block();
101 | }
102 | }
103 |
104 | __syncthreads();
105 |
106 | // Let everyone know it's safe to proceed
107 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
108 | {
109 | d_vol_sync[peer_block] = 0;
110 | }
111 | }
112 | else
113 | {
114 | if (threadIdx.x == 0)
115 | {
116 | // Report in
117 | d_vol_sync[blockIdx.x] = 1;
118 |
119 | // Wait for acknowledgment
120 | while (ThreadLoad(d_sync + blockIdx.x) == 1)
121 | {
122 | __threadfence_block();
123 | }
124 | }
125 |
126 | __syncthreads();
127 | }
128 | }
129 | };
130 |
131 |
132 | /**
133 | * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
134 | *
135 | * Uses RAII for lifetime, i.e., device resources are reclaimed when
136 | * the destructor is called.
137 | */
138 | class GridBarrierLifetime : public GridBarrier
139 | {
140 | protected:
141 |
142 | // Number of bytes backed by d_sync
143 | size_t sync_bytes;
144 |
145 | public:
146 |
147 | /**
148 | * Constructor
149 | */
150 | GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}
151 |
152 |
153 | /**
154 | * Frees device storage and resets the progress counters
155 | */
156 | cudaError_t HostReset()
157 | {
158 | cudaError_t retval = cudaSuccess;
159 | if (d_sync)
160 | {
161 | CubDebug(retval = cudaFree(d_sync));
162 | d_sync = NULL;
163 | }
164 | sync_bytes = 0;
165 | return retval;
166 | }
167 |
168 |
169 | /**
170 | * Destructor
171 | */
172 | virtual ~GridBarrierLifetime()
173 | {
174 | HostReset();
175 | }
176 |
177 |
178 | /**
179 | * Sets up the progress counters for the next kernel launch (lazily
180 | * allocating and initializing them if necessary)
181 | */
182 | cudaError_t Setup(int sweep_grid_size)
183 | {
184 | cudaError_t retval = cudaSuccess;
185 | do {
186 | size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
187 | if (new_sync_bytes > sync_bytes)
188 | {
189 | if (d_sync)
190 | {
191 | if (CubDebug(retval = cudaFree(d_sync))) break;
192 | }
193 |
194 | sync_bytes = new_sync_bytes;
195 |
196 | // Allocate and initialize to zero
197 | if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break;
198 | if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
199 | }
200 | } while (0);
201 |
202 | return retval;
203 | }
204 | };
205 |
206 |
207 | /** @} */ // end group GridModule
208 |
209 | } // CUB namespace
210 | CUB_NS_POSTFIX // Optional outer namespace(s)
211 |
212 |
--------------------------------------------------------------------------------
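A hedged sketch of how GridBarrier above might separate two phases of a resident grid (assumed kernel and data names; the grid must be small enough that all blocks are co-resident, and note that cub.cuh comments this header out, so it is included directly):

#include <cub/grid/grid_barrier.cuh>

// Hypothetical sketch: a grid-wide sync between two phases of the same kernel.
__global__ void TwoPhaseSketch(cub::GridBarrier barrier, float *d_data)
{
    // Phase 1: each block writes its portion of d_data ...

    barrier.Sync();   // every block waits here before any block continues

    // Phase 2: safe to read data produced by other blocks in phase 1 ...
}

// Host side (sketch):
//   cub::GridBarrierLifetime barrier;
//   barrier.Setup(grid_size);                          // lazily allocates and zeroes the counters
//   TwoPhaseSketch<<<grid_size, block_size>>>(barrier, d_data);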
/external/cub-1.3.2/cub/grid/grid_even_share.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains).
32 | */
33 |
34 |
35 | #pragma once
36 |
37 | #include "../util_namespace.cuh"
38 | #include "../util_macro.cuh"
39 |
40 | /// Optional outer namespace(s)
41 | CUB_NS_PREFIX
42 |
43 | /// CUB namespace
44 | namespace cub {
45 |
46 |
47 | /**
48 | * \addtogroup GridModule
49 | * @{
50 | */
51 |
52 |
53 | /**
54 | * \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains).
55 | *
56 | * \par Overview
57 | * GridEvenShare indicates which sections of input are to be mapped onto which threadblocks.
58 | * Threadblocks may receive one of three different amounts of work: "big", "normal",
59 | * and "last". The "big" workloads are one scheduling grain larger than "normal". The "last" work unit
60 | * for the last threadblock may be partially-full if the input is not an even multiple of
61 | * the scheduling grain size.
62 | *
63 | * \par
64 | * Before invoking a child grid, a parent thread will typically construct an instance of
65 | * GridEvenShare. The instance can be passed to child threadblocks which can
66 | * initialize their per-threadblock offsets using \p BlockInit().
67 | *
68 | * \tparam Offset Signed integer type for global offsets
69 | */
70 | template <typename Offset>
71 | struct GridEvenShare
72 | {
73 | Offset total_grains;
74 | int big_blocks;
75 | Offset big_share;
76 | Offset normal_share;
77 | Offset normal_base_offset;
78 |
79 | /// Total number of input items
80 | Offset num_items;
81 |
82 | /// Grid size in threadblocks
83 | int grid_size;
84 |
85 | /// Offset into input marking the beginning of the owning thread block's segment of input tiles
86 | Offset block_offset;
87 |
88 | /// Offset into input marking the end (one-past) of the owning thread block's segment of input tiles
89 | Offset block_end;
90 |
91 | /**
92 | * \brief Default constructor. Zero-initializes block-specific fields.
93 | */
94 | __host__ __device__ __forceinline__ GridEvenShare() :
95 | num_items(0),
96 | grid_size(0),
97 | block_offset(0),
98 | block_end(0) {}
99 |
100 | /**
101 | * \brief Constructor. Initializes the grid-specific members \p num_items and \p grid_size. To be called prior to kernel launch.
102 | */
103 | __host__ __device__ __forceinline__ GridEvenShare(
104 | Offset num_items, ///< Total number of input items
105 | int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
106 | int schedule_granularity) ///< Granularity by which the input can be parcelled into and distributed among threadblocks. Usually the thread block's native tile size (or a multiple thereof).
107 | {
108 | this->num_items = num_items;
109 | this->block_offset = num_items;
110 | this->block_end = num_items;
111 | this->total_grains = (num_items + schedule_granularity - 1) / schedule_granularity;
112 | this->grid_size = CUB_MIN(total_grains, max_grid_size);
113 | Offset grains_per_block = total_grains / grid_size;
114 | this->big_blocks = total_grains - (grains_per_block * grid_size); // leftover grains go to big blocks
115 | this->normal_share = grains_per_block * schedule_granularity;
116 | this->normal_base_offset = big_blocks * schedule_granularity;
117 | this->big_share = normal_share + schedule_granularity;
118 | }
119 |
120 |
121 |
122 | /**
123 | * \brief Initializes ranges for the specified partition index
124 | */
125 | __device__ __forceinline__ void Init(int partition_id)
126 | {
127 | if (partition_id < big_blocks)
128 | {
129 | // This threadblock gets a big share of grains (grains_per_block + 1)
130 | block_offset = (partition_id * big_share);
131 | block_end = block_offset + big_share;
132 | }
133 | else if (partition_id < total_grains)
134 | {
135 | // This threadblock gets a normal share of grains (grains_per_block)
136 | block_offset = normal_base_offset + (partition_id * normal_share);
137 | block_end = CUB_MIN(num_items, block_offset + normal_share);
138 | }
139 | }
140 |
141 |
142 | /**
143 | * \brief Initializes ranges for the current thread block (e.g., to be called by each threadblock after startup)
144 | */
145 | __device__ __forceinline__ void BlockInit()
146 | {
147 | Init(blockIdx.x);
148 | }
149 |
150 |
151 | /**
152 | * Print to stdout
153 | */
154 | __host__ __device__ __forceinline__ void Print()
155 | {
156 | printf(
157 | #if (CUB_PTX_ARCH > 0)
158 | "\tthreadblock(%d) "
159 | "block_offset(%lu) "
160 | "block_end(%lu) "
161 | #endif
162 | "num_items(%lu) "
163 | "total_grains(%lu) "
164 | "big_blocks(%lu) "
165 | "big_share(%lu) "
166 | "normal_share(%lu)\n",
167 | #if (CUB_PTX_ARCH > 0)
168 | blockIdx.x,
169 | (unsigned long) block_offset,
170 | (unsigned long) block_end,
171 | #endif
172 | (unsigned long) num_items,
173 | (unsigned long) total_grains,
174 | (unsigned long) big_blocks,
175 | (unsigned long) big_share,
176 | (unsigned long) normal_share);
177 | }
178 | };
179 |
180 |
181 |
182 | /** @} */ // end group GridModule
183 |
184 | } // CUB namespace
185 | CUB_NS_POSTFIX // Optional outer namespace(s)
186 |
--------------------------------------------------------------------------------
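A hedged sketch of the even-share pattern described above (assumed kernel and parameter names; TILE_ITEMS here matches the schedule_granularity passed to the host-side constructor):

#include <cub/grid/grid_even_share.cuh>

// Hypothetical sketch: each block consumes its even-share segment in tile-sized steps.
// Host: cub::GridEvenShare<int> even_share(num_items, max_grid_size, TILE_ITEMS);
//       EvenShareSketch<TILE_ITEMS><<<even_share.grid_size, BLOCK_THREADS>>>(even_share, d_in, num_items);
template <int TILE_ITEMS>
__global__ void EvenShareSketch(cub::GridEvenShare<int> even_share, const float *d_in, int num_items)
{
    even_share.BlockInit();   // fill in this block's [block_offset, block_end) range

    for (int tile = even_share.block_offset; tile < even_share.block_end; tile += TILE_ITEMS)
    {
        // The final tile may be partial, so clamp against num_items
        int valid_items = min(TILE_ITEMS, num_items - tile);
        // ... consume d_in[tile .. tile + valid_items) ...
        (void) valid_items;   // placeholder for the omitted tile-consumption code
    }
}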
/external/cub-1.3.2/cub/grid/grid_mapping.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
32 | */
33 |
34 | #pragma once
35 |
36 | #include "../util_namespace.cuh"
37 |
38 | /// Optional outer namespace(s)
39 | CUB_NS_PREFIX
40 |
41 | /// CUB namespace
42 | namespace cub {
43 |
44 |
45 | /**
46 | * \addtogroup GridModule
47 | * @{
48 | */
49 |
50 |
51 | /******************************************************************************
52 | * Mapping policies
53 | *****************************************************************************/
54 |
55 |
56 | /**
57 | * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
58 | */
59 | enum GridMappingStrategy
60 | {
61 | /**
62 | * \brief An "even-share" strategy for assigning input tiles to thread blocks.
63 | *
64 | * \par Overview
65 | * The input is evenly partitioned into \p p segments, where \p p is
66 | * constant and corresponds loosely to the number of thread blocks that may
67 | * actively reside on the target device. Each segment is comprised of
68 | * consecutive tiles, where a tile is a small, constant-sized unit of input
69 | * to be processed to completion before the thread block terminates or
70 | * obtains more work. The kernel invokes \p p thread blocks, each
71 | * of which iteratively consumes a segment of n/p elements
72 | * in tile-size increments.
73 | */
74 | GRID_MAPPING_EVEN_SHARE,
75 |
76 | /**
77 | * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
78 | *
79 | * \par Overview
80 | * The input is treated as a queue to be dynamically consumed by a grid of
81 | * thread blocks. Work is atomically dequeued in tiles, where a tile is a
82 | * unit of input to be processed to completion before the thread block
83 | * terminates or obtains more work. The grid size \p p is constant,
84 | * loosely corresponding to the number of thread blocks that may actively
85 | * reside on the target device.
86 | */
87 | GRID_MAPPING_DYNAMIC,
88 | };
89 |
90 |
91 | /** @} */ // end group GridModule
92 |
93 | } // CUB namespace
94 | CUB_NS_POSTFIX // Optional outer namespace(s)
95 |
96 |
--------------------------------------------------------------------------------
/external/cub-1.3.2/cub/grid/grid_queue.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * cub::GridQueue is a descriptor utility for dynamic queue management.
32 | */
33 |
34 | #pragma once
35 |
36 | #include "../util_namespace.cuh"
37 | #include "../util_debug.cuh"
38 |
39 | /// Optional outer namespace(s)
40 | CUB_NS_PREFIX
41 |
42 | /// CUB namespace
43 | namespace cub {
44 |
45 |
46 | /**
47 | * \addtogroup GridModule
48 | * @{
49 | */
50 |
51 |
52 | /**
53 | * \brief GridQueue is a descriptor utility for dynamic queue management.
54 | *
55 | * \par Overview
56 | * GridQueue descriptors provide abstractions for "filling" or
57 | * "draining" globally-shared vectors.
58 | *
59 | * \par
60 | * A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
61 | * returning a unique offset for the calling thread to write its items.
62 | * The GridQueue maintains the total "fill-size". The fill counter must be reset
63 | * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
64 | * will be filling.
65 | *
66 | * \par
67 | * Similarly, a "draining" GridQueue works by atomically-incrementing a
68 | * zero-initialized counter, returning a unique offset for the calling thread to
69 | * read its items. Threads can safely drain until the array's logical fill-size is
70 | * exceeded. The drain counter must be reset using GridQueue::ResetDrain or
71 | * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that
72 | * will be draining. (For dynamic work distribution of existing data, the corresponding fill-size
73 | * is simply the number of elements in the array.)
74 | *
75 | * \par
76 | * Iterative work management can be implemented simply with a pair of flip-flopping
77 | * work buffers, each with an associated set of fill and drain GridQueue descriptors.
78 | *
79 | * \tparam Offset Signed integer type for global offsets
80 | */
81 | template <typename Offset>
82 | class GridQueue
83 | {
84 | private:
85 |
86 | /// Counter indices
87 | enum
88 | {
89 | FILL = 0,
90 | DRAIN = 1,
91 | };
92 |
93 | /// Pair of counters
94 | Offset *d_counters;
95 |
96 | public:
97 |
98 | /// Returns the device allocation size in bytes needed to construct a GridQueue instance
99 | __host__ __device__ __forceinline__
100 | static size_t AllocationSize()
101 | {
102 | return sizeof(Offset) * 2;
103 | }
104 |
105 |
106 | /// Constructs an invalid GridQueue descriptor
107 | __host__ __device__ __forceinline__ GridQueue()
108 | :
109 | d_counters(NULL)
110 | {}
111 |
112 |
113 | /// Constructs a GridQueue descriptor around the device storage allocation
114 | __host__ __device__ __forceinline__ GridQueue(
115 | void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize().
116 | :
117 | d_counters((Offset*) d_storage)
118 | {}
119 |
120 |
121 | /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining.
122 | __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain(
123 | Offset fill_size,
124 | cudaStream_t stream = 0)
125 | {
126 | #if (CUB_PTX_ARCH > 0)
127 | d_counters[FILL] = fill_size;
128 | d_counters[DRAIN] = 0;
129 | return cudaSuccess;
130 | #else
131 | Offset counters[2];
132 | counters[FILL] = fill_size;
133 | counters[DRAIN] = 0;
134 | return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(Offset) * 2, cudaMemcpyHostToDevice, stream));
135 | #endif
136 | }
137 |
138 |
139 | /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining.
140 | __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0)
141 | {
142 | #if (CUB_PTX_ARCH > 0)
143 | d_counters[DRAIN] = 0;
144 | return cudaSuccess;
145 | #else
146 | return FillAndResetDrain(0, stream);
147 | #endif
148 | }
149 |
150 |
151 | /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling.
152 | __host__ __device__ __forceinline__ cudaError_t ResetFill()
153 | {
154 | #if (CUB_PTX_ARCH > 0)
155 | d_counters[FILL] = 0;
156 | return cudaSuccess;
157 | #else
158 | return CubDebug(cudaMemset(d_counters + FILL, 0, sizeof(Offset)));
159 | #endif
160 | }
161 |
162 |
163 | /// Returns the fill-size established by the parent or by the previous kernel.
164 | __host__ __device__ __forceinline__ cudaError_t FillSize(
165 | Offset &fill_size,
166 | cudaStream_t stream = 0)
167 | {
168 | #if (CUB_PTX_ARCH > 0)
169 | fill_size = d_counters[FILL];
170 | return cudaSuccess;
171 | #else
172 | return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(Offset), cudaMemcpyDeviceToHost, stream));
173 | #endif
174 | }
175 |
176 |
177 | /// Drain num_items. Returns offset from which to read items.
178 | __device__ __forceinline__ Offset Drain(Offset num_items)
179 | {
180 | return atomicAdd(d_counters + DRAIN, num_items);
181 | }
182 |
183 |
184 | /// Fill num_items. Returns offset from which to write items.
185 | __device__ __forceinline__ Offset Fill(Offset num_items)
186 | {
187 | return atomicAdd(d_counters + FILL, num_items);
188 | }
189 | };
190 |
191 |
192 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
193 |
194 |
195 | /**
196 | * Reset grid queue (call with 1 block of 1 thread)
197 | */
198 | template <typename Offset>
199 | __global__ void FillAndResetDrainKernel(
200 | GridQueue<Offset> grid_queue,
201 | Offset num_items)
202 | {
203 | grid_queue.FillAndResetDrain(num_items);
204 | }
205 |
206 |
207 |
208 | #endif // DOXYGEN_SHOULD_SKIP_THIS
209 |
210 |
211 | /** @} */ // end group GridModule
212 |
213 | } // CUB namespace
214 | CUB_NS_POSTFIX // Optional outer namespace(s)
215 |
216 |
217 |
--------------------------------------------------------------------------------
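A hedged sketch of the "draining" pattern described above (assumed names; the host, or the FillAndResetDrainKernel above, must set the fill-size to num_items before this kernel runs):

#include <cub/grid/grid_queue.cuh>

// Hypothetical sketch: blocks dynamically dequeue tiles of work until the queue is exhausted.
template <int TILE_ITEMS>
__global__ void DrainSketch(cub::GridQueue<int> queue, int num_items, const float *d_in)
{
    __shared__ int tile_offset;

    while (true)
    {
        // One thread reserves the next tile on behalf of the whole block
        if (threadIdx.x == 0)
            tile_offset = queue.Drain(TILE_ITEMS);
        __syncthreads();

        if (tile_offset >= num_items)
            break;   // queue exhausted

        // ... consume d_in[tile_offset .. min(tile_offset + TILE_ITEMS, num_items)) ...

        __syncthreads();   // keep tile_offset stable until all threads are done with it
    }
}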
/external/cub-1.3.2/cub/host/spinlock.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Simple x86/x64 atomic spinlock, portable across MS Windows (cl.exe) & Linux (g++)
32 | */
33 |
34 |
35 | #pragma once
36 |
37 | #if defined(_WIN32) || defined(_WIN64)
38 | #include <intrin.h>
39 | #include <windows.h>
40 | #undef small // Windows is terrible for polluting macro namespace
41 |
42 | /**
43 | * Compiler read/write barrier
44 | */
45 | #pragma intrinsic(_ReadWriteBarrier)
46 |
47 | #endif
48 |
49 | #include "../util_namespace.cuh"
50 |
51 | /// Optional outer namespace(s)
52 | CUB_NS_PREFIX
53 |
54 | /// CUB namespace
55 | namespace cub {
56 |
57 |
58 | #if defined(_MSC_VER)
59 |
60 | // Microsoft VC++
61 | typedef long Spinlock;
62 |
63 | #else
64 |
65 | // GNU g++
66 | typedef int Spinlock;
67 |
68 | /**
69 | * Compiler read/write barrier
70 | */
71 | __forceinline__ void _ReadWriteBarrier()
72 | {
73 | __sync_synchronize();
74 | }
75 |
76 | /**
77 | * Atomic exchange
78 | */
79 | __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value)
80 | {
81 | // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier
82 | _ReadWriteBarrier();
83 | return __sync_lock_test_and_set(Target, Value);
84 | }
85 |
86 | /**
87 | * Pause instruction to prevent excess processor bus usage
88 | */
89 | __forceinline__ void YieldProcessor()
90 | {
91 | #ifndef __arm__
92 | asm volatile("pause\n": : :"memory");
93 | #endif // __arm__
94 | }
95 |
96 | #endif // defined(_MSC_VER)
97 |
98 | /**
99 | * Return when the specified spinlock has been acquired
100 | */
101 | __forceinline__ void Lock(volatile Spinlock *lock)
102 | {
103 | while (1)
104 | {
105 | if (!_InterlockedExchange(lock, 1)) return;
106 | while (*lock) YieldProcessor();
107 | }
108 | }
109 |
110 |
111 | /**
112 | * Release the specified spinlock
113 | */
114 | __forceinline__ void Unlock(volatile Spinlock *lock)
115 | {
116 | _ReadWriteBarrier();
117 | *lock = 0;
118 | }
119 |
120 |
121 | } // CUB namespace
122 | CUB_NS_POSTFIX // Optional outer namespace(s)
123 |
124 |
--------------------------------------------------------------------------------
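A hedged host-side sketch of the spinlock above protecting a shared counter (assumed names; on non-MSVC builds cub::Spinlock is a plain int, so zero-initialization leaves it unlocked):

#include <cub/host/spinlock.cuh>

// Hypothetical sketch: multiple host threads increment a shared counter under the spinlock.
static cub::Spinlock g_lock = 0;      // 0 == unlocked
static long long g_counter = 0;

void IncrementSharedCounter()
{
    cub::Lock(&g_lock);               // spins (with YieldProcessor) until acquired
    ++g_counter;                      // critical section
    cub::Unlock(&g_lock);             // compiler barrier, then release
}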
/external/cub-1.3.2/cub/iterator/cache_modified_input_iterator.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Random-access iterator types
32 | */
33 |
34 | #pragma once
35 |
36 | #include <iterator>
37 | #include <iostream>
38 |
39 | #include "../thread/thread_load.cuh"
40 | #include "../thread/thread_store.cuh"
41 | #include "../util_device.cuh"
42 | #include "../util_namespace.cuh"
43 |
44 | #if (THRUST_VERSION >= 100700)
45 | // This iterator is compatible with Thrust API 1.7 and newer
46 | #include <thrust/iterator/iterator_facade.h>
47 | #include <thrust/iterator/iterator_traits.h>
48 | #endif // THRUST_VERSION
49 |
50 |
51 | /// Optional outer namespace(s)
52 | CUB_NS_PREFIX
53 |
54 | /// CUB namespace
55 | namespace cub {
56 |
57 |
58 |
59 | /**
60 | * \addtogroup UtilIterator
61 | * @{
62 | */
63 |
64 |
65 | /**
66 | * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier.
67 | *
68 | * \par Overview
69 | * - CacheModifiedInputIterator is a random-access input iterator that wraps a native
70 | * device pointer of type ValueType*. \p ValueType references are
71 | * made by reading \p ValueType values through loads modified by \p MODIFIER.
72 | * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG",
73 | * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.).
74 | * - Can be constructed, manipulated, and exchanged within and between host and device
75 | * functions, but can only be dereferenced within device functions.
76 | * - Compatible with Thrust API v1.7 or newer.
77 | *
78 | * \par Snippet
79 | * The code snippet below illustrates the use of \p CacheModifiedInputIterator to
80 | * dereference a device array of double using the "ldg" PTX load modifier
81 | * (i.e., load values through texture cache).
82 | * \par
83 | * \code
84 | * #include <cub/cub.cuh>   // or equivalently <cub/iterator/cache_modified_input_iterator.cuh>
85 | *
86 | * // Declare, allocate, and initialize a device array
87 | * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
88 | *
89 | * // Create an iterator wrapper
90 | * cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> itr(d_in);
91 | *
92 | * // Within device code:
93 | * printf("%f\n", itr[0]); // 8.0
94 | * printf("%f\n", itr[1]); // 6.0
95 | * printf("%f\n", itr[6]); // 9.0
96 | *
97 | * \endcode
98 | *
99 | * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data
100 | * \tparam ValueType The value type of this iterator
101 | * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t)
102 | */
103 | template <
104 | CacheLoadModifier MODIFIER,
105 | typename ValueType,
106 | typename Offset = ptrdiff_t>
107 | class CacheModifiedInputIterator
108 | {
109 | public:
110 |
111 | // Required iterator traits
112 | typedef CacheModifiedInputIterator self_type; ///< My own type
113 | typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another
114 | typedef ValueType value_type; ///< The type of the element the iterator can point to
115 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to
116 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to
117 |
118 | #if (THRUST_VERSION >= 100700)
119 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
120 | typedef typename thrust::detail::iterator_facade_category<
121 | thrust::device_system_tag,
122 | thrust::random_access_traversal_tag,
123 | value_type,
124 | reference
125 | >::type iterator_category; ///< The iterator category
126 | #else
127 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
128 | #endif // THRUST_VERSION
129 |
130 |
131 | private:
132 |
133 | ValueType* ptr;
134 |
135 | public:
136 |
137 | /// Constructor
138 | __host__ __device__ __forceinline__ CacheModifiedInputIterator(
139 | ValueType* ptr) ///< Native pointer to wrap
140 | :
141 | ptr(ptr)
142 | {}
143 |
144 | /// Postfix increment
145 | __host__ __device__ __forceinline__ self_type operator++(int)
146 | {
147 | self_type retval = *this;
148 | ptr++;
149 | return retval;
150 | }
151 |
152 | /// Prefix increment
153 | __host__ __device__ __forceinline__ self_type operator++()
154 | {
155 | ptr++;
156 | return *this;
157 | }
158 |
159 | /// Indirection
160 | __host__ __device__ __forceinline__ reference operator*() const
161 | {
162 | return ThreadLoad<MODIFIER>(ptr);
163 | }
164 |
165 | /// Addition
166 | template <typename Distance>
167 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const
168 | {
169 | self_type retval(ptr + n);
170 | return retval;
171 | }
172 |
173 | /// Addition assignment
174 | template <typename Distance>
175 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
176 | {
177 | ptr += n;
178 | return *this;
179 | }
180 |
181 | /// Subtraction
182 | template <typename Distance>
183 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const
184 | {
185 | self_type retval(ptr - n);
186 | return retval;
187 | }
188 |
189 | /// Subtraction assignment
190 | template <typename Distance>
191 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
192 | {
193 | ptr -= n;
194 | return *this;
195 | }
196 |
197 | /// Distance
198 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
199 | {
200 | return ptr - other.ptr;
201 | }
202 |
203 | /// Array subscript
204 | template <typename Distance>
205 | __host__ __device__ __forceinline__ reference operator[](Distance n) const
206 | {
207 | return ThreadLoad<MODIFIER>(ptr + n);
208 | }
209 |
210 | /// Structure dereference
211 | __host__ __device__ __forceinline__ pointer operator->()
212 | {
213 | return &ThreadLoad<MODIFIER>(ptr);
214 | }
215 |
216 | /// Equal to
217 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
218 | {
219 | return (ptr == rhs.ptr);
220 | }
221 |
222 | /// Not equal to
223 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
224 | {
225 | return (ptr != rhs.ptr);
226 | }
227 |
228 | /// ostream operator
229 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
230 | {
231 | return os;
232 | }
233 | };
234 |
235 |
236 |
237 | /** @} */ // end group UtilIterator
238 |
239 | } // CUB namespace
240 | CUB_NS_POSTFIX // Optional outer namespace(s)
241 |
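Usage sketch (editor's addition, not part of the vendored cub-1.3.2 sources): the doc snippet above only shows host-side construction, so the example below also shows the wrapper being dereferenced inside a kernel, which is the only place it may be dereferenced. The kernel and variable names (ScaleKernel, d_in, d_out) are illustrative, and an nvcc build with this external/cub-1.3.2 directory on the include path is assumed.

// Sketch: every read through 'in' is issued with the LOAD_LDG modifier.
#include <cstdio>
#include <cuda_runtime.h>
#include <cub/iterator/cache_modified_input_iterator.cuh>

__global__ void ScaleKernel(cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> in,
                            double *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = 2.0 * in[i];   // dereference happens in device code only
}

int main()
{
    const int n = 7;
    const double h_in[n] = {8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0};
    double *d_in, *d_out;
    cudaMalloc(&d_in, n * sizeof(double));
    cudaMalloc(&d_out, n * sizeof(double));
    cudaMemcpy(d_in, h_in, n * sizeof(double), cudaMemcpyHostToDevice);

    // The iterator itself is built (and may be manipulated) on the host.
    cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> itr(d_in);
    ScaleKernel<<<1, 32>>>(itr, d_out, n);

    double h_out[n];
    cudaMemcpy(h_out, d_out, n * sizeof(double), cudaMemcpyDeviceToHost);
    printf("%f\n", h_out[0]);   // 16.0
    cudaFree(d_in); cudaFree(d_out);
    return 0;
}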
--------------------------------------------------------------------------------
/external/cub-1.3.2/cub/iterator/constant_input_iterator.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Random-access iterator types
32 | */
33 |
34 | #pragma once
35 |
36 | #include <iterator>
37 | #include <iostream>
38 |
39 | #include "../thread/thread_load.cuh"
40 | #include "../thread/thread_store.cuh"
41 | #include "../util_namespace.cuh"
42 |
43 | #if (THRUST_VERSION >= 100700)
44 | // This iterator is compatible with Thrust API 1.7 and newer
45 |     #include <thrust/iterator/iterator_facade.h>
46 |     #include <thrust/iterator/iterator_traits.h>
47 | #endif // THRUST_VERSION
48 |
49 |
50 | /// Optional outer namespace(s)
51 | CUB_NS_PREFIX
52 |
53 | /// CUB namespace
54 | namespace cub {
55 |
56 |
57 | /**
58 | * \addtogroup UtilIterator
59 | * @{
60 | */
61 |
62 |
63 | /**
64 | * \brief A random-access input generator for dereferencing a sequence of homogeneous values
65 | *
66 | * \par Overview
67 | * - Read references to a ConstantInputIterator iterator always return the supplied constant
68 | * of type \p ValueType.
69 | * - Can be used with any data type.
70 | * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device
71 | * functions.
72 | * - Compatible with Thrust API v1.7 or newer.
73 | *
74 | * \par Snippet
75 | * The code snippet below illustrates the use of \p ConstantInputIterator to
76 | * dereference a sequence of homogeneous doubles.
77 | * \par
78 | * \code
79 | * #include <cub/cub.cuh>   // or equivalently <cub/iterator/constant_input_iterator.cuh>
80 | *
81 | * cub::ConstantInputIterator<double> itr(5.0);
82 | *
83 | * printf("%f\n", itr[0]); // 5.0
84 | * printf("%f\n", itr[1]); // 5.0
85 | * printf("%f\n", itr[2]); // 5.0
86 | * printf("%f\n", itr[50]); // 5.0
87 | *
88 | * \endcode
89 | *
90 | * \tparam ValueType The value type of this iterator
91 | * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t)
92 | */
93 | template <
94 | typename ValueType,
95 | typename Offset = ptrdiff_t>
96 | class ConstantInputIterator
97 | {
98 | public:
99 |
100 | // Required iterator traits
101 | typedef ConstantInputIterator self_type; ///< My own type
102 | typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another
103 | typedef ValueType value_type; ///< The type of the element the iterator can point to
104 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to
105 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to
106 |
107 | #if (THRUST_VERSION >= 100700)
108 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
109 | typedef typename thrust::detail::iterator_facade_category<
110 | thrust::any_system_tag,
111 | thrust::random_access_traversal_tag,
112 | value_type,
113 | reference
114 | >::type iterator_category; ///< The iterator category
115 | #else
116 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
117 | #endif // THRUST_VERSION
118 |
119 | private:
120 |
121 | ValueType val;
122 | Offset offset;
123 | #ifdef _WIN32
124 | Offset pad[CUB_MAX(1, (16 / sizeof(Offset) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce)
125 | #endif
126 |
127 | public:
128 |
129 | /// Constructor
130 | __host__ __device__ __forceinline__ ConstantInputIterator(
131 | ValueType val, ///< Starting value for the iterator instance to report
132 | Offset offset = 0) ///< Base offset
133 | :
134 | val(val),
135 | offset(offset)
136 | {}
137 |
138 | /// Postfix increment
139 | __host__ __device__ __forceinline__ self_type operator++(int)
140 | {
141 | self_type retval = *this;
142 | offset++;
143 | return retval;
144 | }
145 |
146 | /// Prefix increment
147 | __host__ __device__ __forceinline__ self_type operator++()
148 | {
149 | offset++;
150 | return *this;
151 | }
152 |
153 | /// Indirection
154 | __host__ __device__ __forceinline__ reference operator*() const
155 | {
156 | return val;
157 | }
158 |
159 | /// Addition
160 | template <typename Distance>
161 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const
162 | {
163 | self_type retval(val, offset + n);
164 | return retval;
165 | }
166 |
167 | /// Addition assignment
168 | template <typename Distance>
169 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
170 | {
171 | offset += n;
172 | return *this;
173 | }
174 |
175 | /// Subtraction
176 | template <typename Distance>
177 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const
178 | {
179 | self_type retval(val, offset - n);
180 | return retval;
181 | }
182 |
183 | /// Subtraction assignment
184 | template <typename Distance>
185 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
186 | {
187 | offset -= n;
188 | return *this;
189 | }
190 |
191 | /// Distance
192 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
193 | {
194 | return offset - other.offset;
195 | }
196 |
197 | /// Array subscript
198 | template <typename Distance>
199 | __host__ __device__ __forceinline__ reference operator[](Distance n) const
200 | {
201 | return val;
202 | }
203 |
204 | /// Structure dereference
205 | __host__ __device__ __forceinline__ pointer operator->()
206 | {
207 | return &val;
208 | }
209 |
210 | /// Equal to
211 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
212 | {
213 | return (offset == rhs.offset) && ((val == rhs.val));
214 | }
215 |
216 | /// Not equal to
217 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
218 | {
219 | return (offset != rhs.offset) || (val != rhs.val);
220 | }
221 |
222 | /// ostream operator
223 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
224 | {
225 | os << "[" << itr.val << "," << itr.offset << "]";
226 | return os;
227 | }
228 |
229 | };
230 |
231 |
232 | /** @} */ // end group UtilIterator
233 |
234 | } // CUB namespace
235 | CUB_NS_POSTFIX // Optional outer namespace(s)
236 |
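Usage sketch (editor's addition, not part of cub-1.3.2): a constant iterator is most useful as the input of a device-wide primitive, since the constant sequence never has to be materialized in memory. The sketch below feeds it to cub::DeviceReduce::Sum using CUB's usual two-phase temporary-storage pattern; names such as d_out and num_items are illustrative.

#include <cstdio>
#include <cuda_runtime.h>
#include <cub/cub.cuh>

int main()
{
    // Sum 1000 copies of 5.0 without ever storing them.
    const int num_items = 1000;
    cub::ConstantInputIterator<double> itr(5.0);

    double *d_out;
    cudaMalloc(&d_out, sizeof(double));

    // Two-phase pattern: the first call only computes temp_bytes.
    void   *d_temp = NULL;
    size_t  temp_bytes = 0;
    cub::DeviceReduce::Sum(d_temp, temp_bytes, itr, d_out, num_items);
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceReduce::Sum(d_temp, temp_bytes, itr, d_out, num_items);

    double h_out;
    cudaMemcpy(&h_out, d_out, sizeof(double), cudaMemcpyDeviceToHost);
    printf("%f\n", h_out);   // 5000.0
    cudaFree(d_temp); cudaFree(d_out);
    return 0;
}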
--------------------------------------------------------------------------------
/external/cub-1.3.2/cub/iterator/counting_input_iterator.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Random-access iterator types
32 | */
33 |
34 | #pragma once
35 |
36 | #include <iterator>
37 | #include <iostream>
38 |
39 | #include "../thread/thread_load.cuh"
40 | #include "../thread/thread_store.cuh"
41 | #include "../util_device.cuh"
42 | #include "../util_namespace.cuh"
43 |
44 | #if (THRUST_VERSION >= 100700)
45 | // This iterator is compatible with Thrust API 1.7 and newer
46 |     #include <thrust/iterator/iterator_facade.h>
47 |     #include <thrust/iterator/iterator_traits.h>
48 | #endif // THRUST_VERSION
49 |
50 |
51 | /// Optional outer namespace(s)
52 | CUB_NS_PREFIX
53 |
54 | /// CUB namespace
55 | namespace cub {
56 |
57 | /**
58 | * \addtogroup UtilIterator
59 | * @{
60 | */
61 |
62 | /**
63 | * \brief A random-access input generator for dereferencing a sequence of incrementing integer values.
64 | *
65 | * \par Overview
66 | * - After initializing a CountingInputIterator to a certain integer \p base, read references
67 | * at \p offset will return the value \p base + \p offset.
68 | * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device
69 | * functions.
70 | * - Compatible with Thrust API v1.7 or newer.
71 | *
72 | * \par Snippet
73 | * The code snippet below illustrates the use of \p CountingInputIterator to
74 | * dereference a sequence of incrementing integers.
75 | * \par
76 | * \code
77 | * #include <cub/cub.cuh>   // or equivalently <cub/iterator/counting_input_iterator.cuh>
78 | *
79 | * cub::CountingInputIterator<int> itr(5);
80 | *
81 | * printf("%d\n", itr[0]); // 5
82 | * printf("%d\n", itr[1]); // 6
83 | * printf("%d\n", itr[2]); // 7
84 | * printf("%d\n", itr[50]); // 55
85 | *
86 | * \endcode
87 | *
88 | * \tparam ValueType The value type of this iterator
89 | * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t)
90 | */
91 | template <
92 | typename ValueType,
93 | typename Offset = ptrdiff_t>
94 | class CountingInputIterator
95 | {
96 | public:
97 |
98 | // Required iterator traits
99 | typedef CountingInputIterator self_type; ///< My own type
100 | typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another
101 | typedef ValueType value_type; ///< The type of the element the iterator can point to
102 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to
103 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to
104 |
105 | #if (THRUST_VERSION >= 100700)
106 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
107 | typedef typename thrust::detail::iterator_facade_category<
108 | thrust::any_system_tag,
109 | thrust::random_access_traversal_tag,
110 | value_type,
111 | reference
112 | >::type iterator_category; ///< The iterator category
113 | #else
114 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
115 | #endif // THRUST_VERSION
116 |
117 | private:
118 |
119 | ValueType val;
120 |
121 | public:
122 |
123 | /// Constructor
124 | __host__ __device__ __forceinline__ CountingInputIterator(
125 | const ValueType &val) ///< Starting value for the iterator instance to report
126 | :
127 | val(val)
128 | {}
129 |
130 | /// Postfix increment
131 | __host__ __device__ __forceinline__ self_type operator++(int)
132 | {
133 | self_type retval = *this;
134 | val++;
135 | return retval;
136 | }
137 |
138 | /// Prefix increment
139 | __host__ __device__ __forceinline__ self_type operator++()
140 | {
141 | val++;
142 | return *this;
143 | }
144 |
145 | /// Indirection
146 | __host__ __device__ __forceinline__ reference operator*() const
147 | {
148 | return val;
149 | }
150 |
151 | /// Addition
152 | template <typename Distance>
153 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const
154 | {
155 | self_type retval(val + n);
156 | return retval;
157 | }
158 |
159 | /// Addition assignment
160 | template <typename Distance>
161 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
162 | {
163 | val += n;
164 | return *this;
165 | }
166 |
167 | /// Subtraction
168 | template <typename Distance>
169 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const
170 | {
171 | self_type retval(val - n);
172 | return retval;
173 | }
174 |
175 | /// Subtraction assignment
176 | template <typename Distance>
177 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
178 | {
179 | val -= n;
180 | return *this;
181 | }
182 |
183 | /// Distance
184 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
185 | {
186 | return val - other.val;
187 | }
188 |
189 | /// Array subscript
190 | template <typename Distance>
191 | __host__ __device__ __forceinline__ reference operator[](Distance n) const
192 | {
193 | return val + n;
194 | }
195 |
196 | /// Structure dereference
197 | __host__ __device__ __forceinline__ pointer operator->()
198 | {
199 | return &val;
200 | }
201 |
202 | /// Equal to
203 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
204 | {
205 | return (val == rhs.val);
206 | }
207 |
208 | /// Not equal to
209 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
210 | {
211 | return (val != rhs.val);
212 | }
213 |
214 | /// ostream operator
215 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
216 | {
217 | os << "[" << itr.val << "]";
218 | return os;
219 | }
220 |
221 | };
222 |
223 |
224 |
225 | /** @} */ // end group UtilIterator
226 |
227 | } // CUB namespace
228 | CUB_NS_POSTFIX // Optional outer namespace(s)
229 |
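Usage sketch (editor's addition): because this iterator can be dereferenced on both host and device, it can also drive ordinary host-side loops. The example below just walks a short implicit range with standard iterator arithmetic; it assumes compilation with nvcc so the __host__ __device__ qualifiers resolve.

#include <cstdio>
#include <cub/iterator/counting_input_iterator.cuh>

int main()
{
    // Implicit sequence 5, 6, 7, 8 -- no storage behind it.
    cub::CountingInputIterator<int> begin(5);
    cub::CountingInputIterator<int> end = begin + 4;

    int total = 0;
    for (cub::CountingInputIterator<int> it = begin; it != end; ++it)
        total += *it;

    printf("%d\n", total);                  // 26
    printf("%ld\n", (long) (end - begin));  // 4
    return 0;
}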
--------------------------------------------------------------------------------
/external/cub-1.3.2/cub/thread/thread_operators.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Simple binary operator functor types
32 | */
33 |
34 | /******************************************************************************
35 | * Simple functor operators
36 | ******************************************************************************/
37 |
38 | #pragma once
39 |
40 | #include "../util_macro.cuh"
41 | #include "../util_type.cuh"
42 | #include "../util_namespace.cuh"
43 |
44 | /// Optional outer namespace(s)
45 | CUB_NS_PREFIX
46 |
47 | /// CUB namespace
48 | namespace cub {
49 |
50 |
51 | /**
52 | * \addtogroup UtilModule
53 | * @{
54 | */
55 |
56 | /**
57 | * \brief Default equality functor
58 | */
59 | struct Equality
60 | {
61 | /// Boolean equality operator, returns (a == b)
62 | template <typename T>
63 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
64 | {
65 | return a == b;
66 | }
67 | };
68 |
69 |
70 | /**
71 | * \brief Default inequality functor
72 | */
73 | struct Inequality
74 | {
75 | /// Boolean inequality operator, returns (a != b)
76 | template <typename T>
77 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
78 | {
79 | return a != b;
80 | }
81 | };
82 |
83 |
84 | /**
85 | * \brief Inequality functor (wraps equality functor)
86 | */
87 | template <typename EqualityOp>
88 | struct InequalityWrapper
89 | {
90 | /// Wrapped equality operator
91 | EqualityOp op;
92 |
93 | /// Constructor
94 | __host__ __device__ __forceinline__
95 | InequalityWrapper(EqualityOp op) : op(op) {}
96 |
97 | /// Boolean inequality operator, returns (a != b)
98 | template <typename T>
99 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
100 | {
101 | return !op(a, b);
102 | }
103 | };
104 |
105 |
106 | /**
107 | * \brief Default sum functor
108 | */
109 | struct Sum
110 | {
111 | /// Boolean sum operator, returns a + b
112 | template <typename T>
113 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
114 | {
115 | return a + b;
116 | }
117 | };
118 |
119 |
120 | /**
121 | * \brief Default max functor
122 | */
123 | struct Max
124 | {
125 | /// Boolean max operator, returns (a > b) ? a : b
126 | template <typename T>
127 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
128 | {
129 | return CUB_MAX(a, b);
130 | }
131 | };
132 |
133 |
134 | /**
135 | * \brief Arg max functor (keeps the value and offset of the first occurrence of the largest item)
136 | */
137 | struct ArgMax
138 | {
139 | /// Boolean max operator, preferring the item having the smaller offset in case of ties
140 | template <typename T, typename Offset>
141 | __host__ __device__ __forceinline__ ItemOffsetPair<T, Offset> operator()(
142 | const ItemOffsetPair<T, Offset> &a,
143 | const ItemOffsetPair<T, Offset> &b) const
144 | {
145 | if (a.value == b.value)
146 | return (b.offset < a.offset) ? b : a;
147 |
148 | return (b.value > a.value) ? b : a;
149 | }
150 | };
151 |
152 |
153 | /**
154 | * \brief Default min functor
155 | */
156 | struct Min
157 | {
158 | /// Boolean min operator, returns (a < b) ? a : b
159 | template <typename T>
160 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
161 | {
162 | return CUB_MIN(a, b);
163 | }
164 | };
165 |
166 |
167 | /**
168 | * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
169 | */
170 | struct ArgMin
171 | {
172 | /// Boolean min operator, preferring the item having the smaller offset in case of ties
173 | template <typename T, typename Offset>
174 | __host__ __device__ __forceinline__ ItemOffsetPair<T, Offset> operator()(
175 | const ItemOffsetPair<T, Offset> &a,
176 | const ItemOffsetPair<T, Offset> &b) const
177 | {
178 | if (a.value == b.value)
179 | return (b.offset < a.offset) ? b : a;
180 |
181 | return (b.value < a.value) ? b : a;
182 | }
183 | };
184 |
185 |
186 | /**
187 | * \brief Default cast functor
188 | */
189 | template <typename B>
190 | struct Cast
191 | {
192 | /// Cast operator, returns (B) a
193 | template <typename A>
194 | __host__ __device__ __forceinline__ B operator()(const A &a) const
195 | {
196 | return (B) a;
197 | }
198 | };
199 |
200 |
201 |
202 | /** @} */ // end group UtilModule
203 |
204 |
205 | } // CUB namespace
206 | CUB_NS_POSTFIX // Optional outer namespace(s)
207 |
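Usage sketch (editor's addition): these functors are ordinary __host__ __device__ callables, so they can be exercised directly on the host as well as passed to CUB's block- and device-wide primitives. The snippet is illustrative only and assumes an nvcc build.

#include <cstdio>
#include <cub/thread/thread_operators.cuh>

int main()
{
    cub::Sum sum_op;
    cub::Max max_op;
    cub::Min min_op;
    printf("%d %d %d\n", sum_op(3, 9), max_op(3, 9), min_op(3, 9));   // 12 9 3

    // InequalityWrapper negates a supplied equality functor.
    cub::Equality eq;
    cub::InequalityWrapper<cub::Equality> neq(eq);
    printf("%d %d\n", (int) neq(1, 1), (int) neq(1, 2));              // 0 1

    // Cast simply converts between types.
    cub::Cast<int> to_int;
    printf("%d\n", to_int(3.7));                                      // 3
    return 0;
}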
--------------------------------------------------------------------------------
/external/cub-1.3.2/cub/thread/thread_reduce.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Thread utilities for sequential reduction over statically-sized array types
32 | */
33 |
34 | #pragma once
35 |
36 | #include "../thread/thread_operators.cuh"
37 | #include "../util_namespace.cuh"
38 |
39 | /// Optional outer namespace(s)
40 | CUB_NS_PREFIX
41 |
42 | /// CUB namespace
43 | namespace cub {
44 |
45 | /**
46 | * \addtogroup UtilModule
47 | * @{
48 | */
49 |
50 | /**
51 | * \name Sequential reduction over statically-sized array types
52 | * @{
53 | */
54 |
55 |
56 | template <
57 | int LENGTH,
58 | typename T,
59 | typename ReductionOp>
60 | __device__ __forceinline__ T ThreadReduce(
61 | T* input, ///< [in] Input array
62 | ReductionOp reduction_op, ///< [in] Binary reduction operator
63 | T prefix, ///< [in] Prefix to seed reduction with
64 | Int2Type<LENGTH> length)
65 | {
66 | T addend = *input;
67 | prefix = reduction_op(prefix, addend);
68 |
69 | return ThreadReduce(input + 1, reduction_op, prefix, Int2Type<LENGTH - 1>());
70 | }
71 |
72 | template <
73 | typename T,
74 | typename ReductionOp>
75 | __device__ __forceinline__ T ThreadReduce(
76 | T* input, ///< [in] Input array
77 | ReductionOp reduction_op, ///< [in] Binary reduction operator
78 | T prefix, ///< [in] Prefix to seed reduction with
79 | Int2Type<0> length)
80 | {
81 | return prefix;
82 | }
83 |
84 |
85 | /**
86 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned.
87 | *
88 | * \tparam LENGTH Length of input array
89 | * \tparam T [inferred] The data type to be reduced.
90 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
91 | */
92 | template <
93 | int LENGTH,
94 | typename T,
95 | typename ReductionOp>
96 | __device__ __forceinline__ T ThreadReduce(
97 | T* input, ///< [in] Input array
98 | ReductionOp reduction_op, ///< [in] Binary reduction operator
99 | T prefix) ///< [in] Prefix to seed reduction with
100 | {
101 | return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
102 | }
103 |
104 |
105 | /**
106 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned.
107 | *
108 | * \tparam LENGTH Length of input array
109 | * \tparam T [inferred] The data type to be reduced.
110 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
111 | */
112 | template <
113 | int LENGTH,
114 | typename T,
115 | typename ReductionOp>
116 | __device__ __forceinline__ T ThreadReduce(
117 | T* input, ///< [in] Input array
118 | ReductionOp reduction_op) ///< [in] Binary reduction operator
119 | {
120 | T prefix = input[0];
121 | return ThreadReduce<LENGTH - 1>(input + 1, reduction_op, prefix);
122 | }
123 |
124 |
125 | /**
126 | * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned.
127 | *
128 | * \tparam LENGTH [inferred] Length of \p input array
129 | * \tparam T [inferred] The data type to be reduced.
130 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
131 | */
132 | template <
133 | int LENGTH,
134 | typename T,
135 | typename ReductionOp>
136 | __device__ __forceinline__ T ThreadReduce(
137 | T (&input)[LENGTH], ///< [in] Input array
138 | ReductionOp reduction_op, ///< [in] Binary reduction operator
139 | T prefix) ///< [in] Prefix to seed reduction with
140 | {
141 | return ThreadReduce<LENGTH>(input, reduction_op, prefix);
142 | }
143 |
144 |
145 | /**
146 | * \brief Serial reduction with the specified operator
147 | *
148 | * \tparam LENGTH [inferred] Length of \p input array
149 | * \tparam T [inferred] The data type to be reduced.
150 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
151 | */
152 | template <
153 | int LENGTH,
154 | typename T,
155 | typename ReductionOp>
156 | __device__ __forceinline__ T ThreadReduce(
157 | T (&input)[LENGTH], ///< [in] Input array
158 | ReductionOp reduction_op) ///< [in] Binary reduction operator
159 | {
160 | return ThreadReduce<LENGTH>((T*) input, reduction_op);
161 | }
162 |
163 |
164 | //@} end member group
165 |
166 | /** @} */ // end group UtilModule
167 |
168 | } // CUB namespace
169 | CUB_NS_POSTFIX // Optional outer namespace(s)
170 |
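Usage sketch (editor's addition): ThreadReduce targets the per-thread register arrays that CUB's block-level algorithms hand each thread. The hedged example below gives every thread four items and reduces them with cub::Max; the kernel name PerThreadMax and the sizes are illustrative.

#include <cstdio>
#include <cuda_runtime.h>
#include <cub/thread/thread_reduce.cuh>

__global__ void PerThreadMax(const int *in, int *out)
{
    // Each thread loads 4 consecutive items into registers and reduces them.
    int items[4];
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = 0; i < 4; ++i)
        items[i] = in[tid * 4 + i];
    out[tid] = cub::ThreadReduce(items, cub::Max());   // LENGTH inferred as 4
}

int main()
{
    const int threads = 2, n = threads * 4;
    const int h_in[n] = {8, 6, 7, 5, 3, 0, 9, 1};
    int *d_in, *d_out;
    cudaMalloc(&d_in, n * sizeof(int));
    cudaMalloc(&d_out, threads * sizeof(int));
    cudaMemcpy(d_in, h_in, n * sizeof(int), cudaMemcpyHostToDevice);

    PerThreadMax<<<1, threads>>>(d_in, d_out);

    int h_out[threads];
    cudaMemcpy(h_out, d_out, threads * sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d %d\n", h_out[0], h_out[1]);   // 8 9
    cudaFree(d_in); cudaFree(d_out);
    return 0;
}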
--------------------------------------------------------------------------------
/external/cub-1.3.2/cub/util_arch.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Static architectural properties by SM version.
32 | */
33 |
34 | #pragma once
35 |
36 | #include "util_namespace.cuh"
37 |
38 | /// Optional outer namespace(s)
39 | CUB_NS_PREFIX
40 |
41 | /// CUB namespace
42 | namespace cub {
43 |
44 |
45 | /**
46 | * \addtogroup UtilMgmt
47 | * @{
48 | */
49 |
50 |
51 | /// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
52 | #ifndef __CUDA_ARCH__
53 | #define CUB_PTX_ARCH 0
54 | #else
55 | #define CUB_PTX_ARCH __CUDA_ARCH__
56 | #endif
57 |
58 |
59 | /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API.
60 | #if (CUB_PTX_ARCH == 0) || defined(CUB_CDP)
61 | #define CUB_RUNTIME_ENABLED
62 | #define CUB_RUNTIME_FUNCTION __host__ __device__
63 | #else
64 | #define CUB_RUNTIME_FUNCTION __host__
65 | #endif
66 |
67 |
68 |
69 | /// Number of threads per warp (log)
70 | #define CUB_LOG_WARP_THREADS(arch) \
71 | (5)
72 |
73 | /// Number of threads per warp
74 | #define CUB_WARP_THREADS(arch) \
75 | (1 << CUB_LOG_WARP_THREADS(arch))
76 |
77 | /// Number of smem banks (log)
78 | #define CUB_LOG_SMEM_BANKS(arch) \
79 | ((arch >= 200) ? \
80 | (5) : \
81 | (4))
82 |
83 | /// Number of smem banks
84 | #define CUB_SMEM_BANKS(arch) \
85 | (1 << CUB_LOG_SMEM_BANKS(arch))
86 |
87 | /// Number of bytes per smem bank
88 | #define CUB_SMEM_BANK_BYTES(arch) \
89 | (4)
90 |
91 | /// Number of smem bytes provisioned per SM
92 | #define CUB_SMEM_BYTES(arch) \
93 | ((arch >= 200) ? \
94 | (48 * 1024) : \
95 | (16 * 1024))
96 |
97 | /// Smem allocation size in bytes
98 | #define CUB_SMEM_ALLOC_UNIT(arch) \
99 | ((arch >= 300) ? \
100 | (256) : \
101 | ((arch >= 200) ? \
102 | (128) : \
103 | (512)))
104 |
105 | /// Whether or not the architecture allocates registers by block (or by warp)
106 | #define CUB_REGS_BY_BLOCK(arch) \
107 | ((arch >= 200) ? \
108 | (false) : \
109 | (true))
110 |
111 | /// Number of registers allocated at a time per block (or by warp)
112 | #define CUB_REG_ALLOC_UNIT(arch) \
113 | ((arch >= 300) ? \
114 | (256) : \
115 | ((arch >= 200) ? \
116 | (64) : \
117 | ((arch >= 120) ? \
118 | (512) : \
119 | (256))))
120 |
121 | /// Granularity of warps for which registers are allocated
122 | #define CUB_WARP_ALLOC_UNIT(arch) \
123 | ((arch >= 300) ? \
124 | (4) : \
125 | (2))
126 |
127 | /// Maximum number of threads per SM
128 | #define CUB_MAX_SM_THREADS(arch) \
129 | ((arch >= 300) ? \
130 | (2048) : \
131 | ((arch >= 200) ? \
132 | (1536) : \
133 | ((arch >= 120) ? \
134 | (1024) : \
135 | (768))))
136 |
137 | /// Maximum number of thread blocks per SM
138 | #define CUB_MAX_SM_BLOCKS(arch) \
139 | ((arch >= 300) ? \
140 | (16) : \
141 | (8))
142 |
143 | /// Maximum number of threads per thread block
144 | #define CUB_MAX_BLOCK_THREADS(arch) \
145 | ((arch >= 200) ? \
146 | (1024) : \
147 | (512))
148 |
149 | /// Maximum number of registers per SM
150 | #define CUB_MAX_SM_REGISTERS(arch) \
151 | ((arch >= 300) ? \
152 | (64 * 1024) : \
153 | ((arch >= 200) ? \
154 | (32 * 1024) : \
155 | ((arch >= 120) ? \
156 | (16 * 1024) : \
157 | (8 * 1024))))
158 |
159 | /// Oversubscription factor
160 | #define CUB_SUBSCRIPTION_FACTOR(arch) \
161 | ((arch >= 300) ? \
162 | (5) : \
163 | ((arch >= 200) ? \
164 | (3) : \
165 | (10)))
166 |
167 | /// Prefer padding overhead vs X-way conflicts greater than this threshold
168 | #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \
169 | ((arch >= 300) ? \
170 | (1) : \
171 | (4))
172 |
173 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
174 |
175 | #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH)
176 | #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH)
177 | #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH)
178 | #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH)
179 | #define CUB_PTX_SMEM_BANK_BYTES CUB_SMEM_BANK_BYTES(CUB_PTX_ARCH)
180 | #define CUB_PTX_SMEM_BYTES CUB_SMEM_BYTES(CUB_PTX_ARCH)
181 | #define CUB_PTX_SMEM_ALLOC_UNIT CUB_SMEM_ALLOC_UNIT(CUB_PTX_ARCH)
182 | #define CUB_PTX_REGS_BY_BLOCK CUB_REGS_BY_BLOCK(CUB_PTX_ARCH)
183 | #define CUB_PTX_REG_ALLOC_UNIT CUB_REG_ALLOC_UNIT(CUB_PTX_ARCH)
184 | #define CUB_PTX_WARP_ALLOC_UNIT CUB_WARP_ALLOC_UNIT(CUB_PTX_ARCH)
185 | #define CUB_PTX_MAX_SM_THREADS CUB_MAX_SM_THREADS(CUB_PTX_ARCH)
186 | #define CUB_PTX_MAX_SM_BLOCKS CUB_MAX_SM_BLOCKS(CUB_PTX_ARCH)
187 | #define CUB_PTX_MAX_BLOCK_THREADS CUB_MAX_BLOCK_THREADS(CUB_PTX_ARCH)
188 | #define CUB_PTX_MAX_SM_REGISTERS CUB_MAX_SM_REGISTERS(CUB_PTX_ARCH)
189 | #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH)
190 |
191 | #endif // Do not document
192 |
193 |
194 | /** @} */ // end group UtilMgmt
195 |
196 | } // CUB namespace
197 | CUB_NS_POSTFIX // Optional outer namespace(s)
198 |
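Usage sketch (editor's addition): the macros above are pure compile-time arithmetic keyed on an SM version, so they can be evaluated anywhere; the host-side printout below shows a few of them. The printed CUB_PTX_ARCH is 0 because host code is compiled in the host pass. An nvcc build is assumed.

#include <cstdio>
#include <cub/util_arch.cuh>

int main()
{
    printf("warp threads:          %d\n", CUB_WARP_THREADS(350));        // 32
    printf("smem banks (sm_35):    %d\n", CUB_SMEM_BANKS(350));          // 32
    printf("smem banks (sm_13):    %d\n", CUB_SMEM_BANKS(130));          // 16
    printf("max block threads:     %d\n", CUB_MAX_BLOCK_THREADS(350));   // 1024
    printf("subscription (sm_35):  %d\n", CUB_SUBSCRIPTION_FACTOR(350)); // 5
    printf("CUB_PTX_ARCH (host):   %d\n", CUB_PTX_ARCH);                 // 0
    return 0;
}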
--------------------------------------------------------------------------------
/external/cub-1.3.2/cub/util_debug.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Error and event logging routines.
32 | *
33 | * The following macro definitions are supported:
34 | * - \p CUB_LOG. Simple event messages are printed to \p stdout.
35 | */
36 |
37 | #pragma once
38 |
39 | #include <stdio.h>
40 | #include "util_namespace.cuh"
41 | #include "util_arch.cuh"
42 |
43 | /// Optional outer namespace(s)
44 | CUB_NS_PREFIX
45 |
46 | /// CUB namespace
47 | namespace cub {
48 |
49 |
50 | /**
51 | * \addtogroup UtilMgmt
52 | * @{
53 | */
54 |
55 |
56 | /// CUB error reporting macro (prints error messages to stderr)
57 | #if (defined(DEBUG) || defined(_DEBUG))
58 | #define CUB_STDERR
59 | #endif
60 |
61 |
62 |
63 | /**
64 | * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context.
65 | *
66 | * \return The CUDA error.
67 | */
68 | __host__ __device__ __forceinline__ cudaError_t Debug(
69 | cudaError_t error,
70 | const char* filename,
71 | int line)
72 | {
73 | #ifdef CUB_STDERR
74 | if (error)
75 | {
76 | #if (CUB_PTX_ARCH == 0)
77 | fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
78 | fflush(stderr);
79 | #elif (CUB_PTX_ARCH >= 200)
80 | printf("CUDA error %d [block %d, thread %d, %s, %d]\n", error, blockIdx.x, threadIdx.x, filename, line);
81 | #endif
82 | }
83 | #endif
84 | return error;
85 | }
86 |
87 |
88 | /**
89 | * \brief Debug macro
90 | */
91 | #define CubDebug(e) cub::Debug((e), __FILE__, __LINE__)
92 |
93 |
94 | /**
95 | * \brief Debug macro with exit
96 | */
97 | #define CubDebugExit(e) if (cub::Debug((e), __FILE__, __LINE__)) { exit(1); }
98 |
99 |
100 | /**
101 | * \brief Log macro for printf statements.
102 | */
103 | #if (CUB_PTX_ARCH == 0)
104 | #define CubLog(format, ...) printf(format,__VA_ARGS__);
105 | #elif (CUB_PTX_ARCH >= 200)
106 | #define CubLog(format, ...) printf("[block %d, thread %d]: " format, blockIdx.x, threadIdx.x, __VA_ARGS__);
107 | #endif
108 |
109 |
110 |
111 |
112 | /** @} */ // end group UtilMgmt
113 |
114 | } // CUB namespace
115 | CUB_NS_POSTFIX // Optional outer namespace(s)
116 |
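Usage sketch (editor's addition): CubDebug passes the CUDA error through so the caller can still branch on it, while CubDebugExit terminates the process on failure. Error printing only happens when CUB_STDERR is defined, which the header does automatically for DEBUG/_DEBUG builds. The names below are illustrative.

#include <cstdio>
#include <cuda_runtime.h>
#include <cub/util_debug.cuh>

int main()
{
    int *d_buf = NULL;

    // CubDebug reports (if CUB_STDERR is defined) and returns the error.
    cudaError_t err = CubDebug(cudaMalloc(&d_buf, 1024 * sizeof(int)));
    if (err != cudaSuccess)
        return 1;

    // CubDebugExit reports and exits on failure.
    CubDebugExit(cudaMemset(d_buf, 0, 1024 * sizeof(int)));
    CubDebugExit(cudaFree(d_buf));
    printf("ok\n");
    return 0;
}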
--------------------------------------------------------------------------------
/external/cub-1.3.2/cub/util_macro.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /******************************************************************************
30 | * Common C/C++ macro utilities
31 | ******************************************************************************/
32 |
33 | #pragma once
34 |
35 | #include "util_namespace.cuh"
36 |
37 | /// Optional outer namespace(s)
38 | CUB_NS_PREFIX
39 |
40 | /// CUB namespace
41 | namespace cub {
42 |
43 |
44 | /**
45 | * \addtogroup UtilModule
46 | * @{
47 | */
48 |
49 | /**
50 | * Align struct
51 | */
52 | #if defined(_WIN32) || defined(_WIN64)
53 | #define CUB_ALIGN(bytes) __declspec(align(32))
54 | #else
55 | #define CUB_ALIGN(bytes) __attribute__((aligned(bytes)))
56 | #endif
57 |
58 | /**
59 | * Select maximum(a, b)
60 | */
61 | #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a))
62 |
63 | /**
64 | * Select minimum(a, b)
65 | */
66 | #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a))
67 |
68 | /**
69 | * Quotient of x/y rounded down to nearest integer
70 | */
71 | #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y))
72 |
73 | /**
74 | * Quotient of x/y rounded up to nearest integer
75 | */
76 | #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y))
77 |
78 | /**
79 | * x rounded up to the nearest multiple of y
80 | */
81 | #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y)
82 |
83 | /**
84 | * x rounded down to the nearest multiple of y
85 | */
86 | #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y)
87 |
88 | /**
89 | * Return character string for given type
90 | */
91 | #define CUB_TYPE_STRING(type) ""#type
92 |
93 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
94 | #define CUB_CAT_(a, b) a ## b
95 | #define CUB_CAT(a, b) CUB_CAT_(a, b)
96 | #endif // DOXYGEN_SHOULD_SKIP_THIS
97 |
98 | /**
99 | * Static assert
100 | */
101 | #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1]
102 |
103 |
104 | /** @} */ // end group UtilModule
105 |
106 | } // CUB namespace
107 | CUB_NS_POSTFIX // Optional outer namespace(s)
108 |
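Usage sketch (editor's addition): the quotient and rounding macros are convenient for grid sizing and padded allocations. The values below are illustrative.

#include <cstdio>
#include <cub/util_macro.cuh>

int main()
{
    const int num_items = 1000, block_threads = 128;
    printf("%d\n", CUB_QUOTIENT_CEILING(num_items, block_threads));   // 8 blocks
    printf("%d\n", CUB_QUOTIENT_FLOOR(num_items, block_threads));     // 7
    printf("%d\n", CUB_ROUND_UP_NEAREST(num_items, block_threads));   // 1024
    printf("%d\n", CUB_ROUND_DOWN_NEAREST(num_items, block_threads)); // 896
    printf("%d\n", CUB_MIN(num_items, block_threads));                // 128
    return 0;
}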
--------------------------------------------------------------------------------
/external/cub-1.3.2/cub/util_namespace.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Place-holder for prefixing the cub namespace
32 | */
33 |
34 | #pragma once
35 |
36 | // For example:
37 | //#define CUB_NS_PREFIX namespace thrust{ namespace detail {
38 | //#define CUB_NS_POSTFIX } }
39 |
40 | #define CUB_NS_PREFIX
41 | #define CUB_NS_POSTFIX
42 |
--------------------------------------------------------------------------------
/python/SmoothParticleNets/ImageProjection.py:
--------------------------------------------------------------------------------
1 |
2 | import numbers
3 | import numpy as np
4 |
5 | import torch
6 | import torch.autograd
7 |
8 | import _ext
9 | import _extc
10 | import error_checking as ec
11 | from kernels import KERNELS, KERNEL_NAMES
12 |
13 | MAX_FLOAT = float(np.finfo(np.float32).max)
14 |
15 |
16 | class ImageProjection(torch.nn.Module):
17 | """
18 | """
19 |
20 | def __init__(self, camera_fl):
21 | """ Initialize a ParticleProjection layer.
22 | TODO
23 |
24 | Arguments:
25 | -camera_fl: The camera focal length in pixels (all pixels are
26 | assumed to be square. This layer does not simulate
27 | any image warping e.g. radial distortion).
28 | """
29 | super(ImageProjection, self).__init__()
30 |
31 | self.camera_fl = ec.check_conditions(camera_fl, "camera_fl",
32 | "%s > 0", "isinstance(%s, numbers.Real)")
33 |
34 | self.register_buffer("empty_depth_mask",
35 | torch.ones(1, 1, 1)*MAX_FLOAT)
36 |
37 | def _rotationMatrixFromQuaternion(self, quat):
38 | """
39 | 1 - 2*qy2 - 2*qz2 2*qx*qy - 2*qz*qw 2*qx*qz + 2*qy*qw
40 | 2*qx*qy + 2*qz*qw 1 - 2*qx2 - 2*qz2 2*qy*qz - 2*qx*qw
41 | 2*qx*qz - 2*qy*qw 2*qy*qz + 2*qx*qw 1 - 2*qx2 - 2*qy2
42 | """
43 | quat = quat.data
44 | qx = quat[:, 0]
45 | qy = quat[:, 1]
46 | qz = quat[:, 2]
47 | qw = quat[:, 3]
48 | qx2 = qx*qx
49 | qxqy = qx*qy
50 | qxqz = qx*qz
51 | qxqw = qx*qw
52 | qy2 = qy*qy
53 | qyqz = qy*qz
54 | qyqw = qy*qw
55 | qz2 = qz*qz
56 | qzqw = qz*qw
57 | ret = quat.new(quat.size()[0], 3, 3)
58 | ret[:, 0, 0] = 1 - 2*qy2 - 2*qz2
59 | ret[:, 1, 0] = 2*qxqy - 2*qzqw
60 | ret[:, 2, 0] = 2*qxqz + 2*qyqw
61 | ret[:, 0, 1] = 2*qxqy + 2*qzqw
62 | ret[:, 1, 1] = 1 - 2*qx2 - 2*qz2
63 | ret[:, 2, 1] = 2*qyqz - 2*qxqw
64 | ret[:, 0, 2] = 2*qxqz - 2*qyqw
65 | ret[:, 1, 2] = 2*qyqz + 2*qxqw
66 | ret[:, 2, 2] = 1 - 2*qx2 - 2*qy2
67 | return torch.autograd.Variable(ret, requires_grad=False)
68 |
69 | def forward(self, locs, image, camera_pose, camera_rot, depth_mask=None):
70 | """ Forwad pass for the particle projection. Takes in the set of
71 | particles and outputs an image.
72 | TODO
73 |
74 | Arguments:
75 | -locs: A BxNx3 tensor where B is the batch size, N is the number of
76 | particles, and 3 is the dimensionality of the particles' coordinate
77 | space (this layer currently only supports 3D projections).
78 | -image: A BxCxHxW tensor containing the image to project onto the
79 | particles, where C is the number of image channels.
80 | -camera_pose: A Bx3 tensor containing the camera translation.
81 | -camera_rot: A Bx4 tensor containing the camera rotation as an xyzw quaternion.
82 | -depth_mask: An optional BxHxW tensor where W and H are the
83 | camera image width and height respectively. If not
84 | None, then this is used to compute occlusions. The
85 | value in each pixel in the depth_mask should be
86 | the distance to the first object. Any particles
87 | further away than that value will not be projected
88 | onto the output image.
89 |
90 | Returns: A BxNxC tensor of the image values sampled at each particle's projected location.
91 | """
92 |
93 | # Error checking.
94 | batch_size = locs.size()[0]
95 | N = locs.size()[1]
96 | width = image.size()[3]
97 | height = image.size()[2]
98 | channels = image.size()[1]
99 | ec.check_tensor_dims(locs, "locs", (batch_size, N, 3))
100 | ec.check_tensor_dims(
101 | image, "image", (batch_size, channels, height, width))
102 | ec.check_tensor_dims(camera_pose, "camera_pose", (batch_size, 3))
103 | ec.check_tensor_dims(camera_rot, "camera_rot", (batch_size, 4))
104 |
105 | ec.check_nans(locs, "locs")
106 | ec.check_nans(image, "image")
107 | ec.check_nans(camera_pose, "camera_pose")
108 | ec.check_nans(camera_rot, "camera_rot")
109 |
110 | if depth_mask is not None:
111 | ec.check_tensor_dims(depth_mask, "depth_mask", (batch_size,
112 | height, width))
113 | ec.check_nans(depth_mask, "depth_mask")
114 | depth_mask = depth_mask.contiguous()
115 | else:
116 | if (self.empty_depth_mask.size()[0] != batch_size or
117 | self.empty_depth_mask.size()[1] != height or
118 | self.empty_depth_mask.size()[2] != width):
119 | self.empty_depth_mask.resize_(batch_size, height, width)
120 | self.empty_depth_mask.fill_(MAX_FLOAT)
121 | depth_mask = torch.autograd.Variable(
122 | self.empty_depth_mask, requires_grad=False)
123 | if locs.is_cuda:
124 | depth_mask = depth_mask.cuda()
125 |
126 | # Let's transform the particles to camera space here.
127 | locs = locs - camera_pose.unsqueeze(1)
128 | # Ensure the rotation quaternion is normalized.
129 | camera_rot = camera_rot / \
130 | torch.sqrt(torch.sum(camera_rot**2, 1, keepdim=True))
131 | # Invert the rotation.
132 | inv = camera_rot.data.new(1, 4)
133 | inv[0, 0] = -1
134 | inv[0, 1] = -1
135 | inv[0, 2] = -1
136 | inv[0, 3] = 1
137 | inv = torch.autograd.Variable(inv, requires_grad=False)
138 | camera_rot = camera_rot*inv
139 | rot = self._rotationMatrixFromQuaternion(camera_rot)
140 | if (rot != rot).data.any():
141 | raise ValueError("No NaNs found in camera_rot argument, but NaNs created when"
142 | " constructing a rotation matrix from it.")
143 | # Rotate the locs into camera space.
144 | try:
145 | # There's a bug that causes this to fail on the first call when using cuda.
146 | # To fix that, just call it again.
147 | locs = torch.bmm(locs, rot)
148 | except RuntimeError:
149 | locs = torch.bmm(locs, rot)
150 | if (locs != locs).data.any():
151 | raise ValueError(
152 | "Rotating locs by rotation matrix resulted in NaNs.")
153 |
154 | locs = locs.contiguous()
155 | image = image.contiguous()
156 | proj = _ImageProjectionFunction(self.camera_fl)
157 | ret = proj(locs, image, depth_mask)
158 | return ret
159 |
160 |
161 | """
162 |
163 | INTERNAL FUNCTIONS
164 |
165 | """
166 |
167 |
168 | class _ImageProjectionFunction(torch.autograd.Function):
169 |
170 | def __init__(self, camera_fl):
171 | super(_ImageProjectionFunction, self).__init__()
172 | self.camera_fl = camera_fl
173 |
174 | def forward(self, locs, image, depth_mask):
175 | self.save_for_backward(locs, image, depth_mask)
176 | batch_size = locs.size()[0]
177 | N = locs.size()[1]
178 | channels = image.size()[1]
179 | ret = locs.new(batch_size, N, channels)
180 | ret.fill_(0)
181 | if locs.is_cuda:
182 | if not _extc.spnc_imageprojection_forward(locs, image,
183 | self.camera_fl, depth_mask, ret):
184 | raise Exception("Cuda error")
185 | else:
186 | _ext.spn_imageprojection_forward(locs, image,
187 | self.camera_fl, depth_mask, ret)
188 |
189 | return ret
190 |
191 | def backward(self, grad_output):
192 | locs, image, depth_mask = self.saved_tensors
193 | ret_locs = grad_output.new(locs.size())
194 | ret_locs.fill_(0)
195 | ret_image = grad_output.new(image.size())
196 | ret_image.fill_(0)
197 | ret_depth_mask = grad_output.new(depth_mask.size())
198 | ret_depth_mask.fill_(0)
199 | if grad_output.is_cuda:
200 | if not _extc.spnc_imageprojection_backward(locs, image,
201 | self.camera_fl, depth_mask, grad_output, ret_locs, ret_image):
202 | raise Exception("Cuda error")
203 | else:
204 | _ext.spn_imageprojection_backward(locs, image,
205 | self.camera_fl, depth_mask, grad_output, ret_locs, ret_image)
206 |
207 | return (ret_locs,
208 | ret_image,
209 | ret_depth_mask,)
210 |
--------------------------------------------------------------------------------
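
Shown below is a minimal usage sketch for the ImageProjection layer above. It is an illustration only (not part of the package) and assumes the extensions have been built via setup.py and a PyTorch version contemporary with this code (it uses the legacy autograd.Function style). The batch/particle/image sizes and the focal length are arbitrary example values.

import torch
import torch.autograd
import SmoothParticleNets as spn

B, N, C, H, W = 2, 5, 3, 32, 32        # illustrative sizes only
camera_fl = 35.0                       # hypothetical focal length in pixels

proj = spn.ImageProjection(camera_fl)
locs = torch.autograd.Variable(torch.rand(B, N, 3), requires_grad=True)    # particle positions
image = torch.autograd.Variable(torch.rand(B, C, H, W), requires_grad=True)  # image to sample from
camera_pose = torch.autograd.Variable(torch.zeros(B, 3))                   # camera translation
camera_rot = torch.autograd.Variable(
    torch.FloatTensor([[0, 0, 0, 1]] * B))                                 # identity quaternion (xyzw)

out = proj(locs, image, camera_pose, camera_rot)   # shape: B x N x C
print(out.size())
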
/python/SmoothParticleNets/ParticleProjection.py:
--------------------------------------------------------------------------------
1 |
2 | import numbers
3 | import numpy as np
4 |
5 | import torch
6 | import torch.autograd
7 |
8 | import _ext
9 | import _extc
10 | import error_checking as ec
11 | from kernels import KERNELS, KERNEL_NAMES
12 |
13 | MAX_FLOAT = float(np.finfo(np.float32).max)
14 |
15 |
16 | class ParticleProjection(torch.nn.Module):
17 | """ The particle projection layer. Projects the given set of particles onto
18 | a camera image plane. For each particle, this layer finds its location on
19 | the image plane, then adds a small circular Gaussian centered at that location
20 | to the image. The contributions from all particles are added together into
21 | a final image. Note that unlike the other layers in this package, this layer
22 | only works with 3D particles.
23 | """
24 |
25 | def __init__(self, camera_fl, camera_size, filter_std, filter_scale):
26 | """ Initialize a ParticleProjection layer.
27 |
28 | Arguments:
29 | -camera_fl: The camera focal length in pixels (all pixels are
30 | assumed to be square. This layer does not simulate
31 | any image warping e.g. radial distortion).
32 | -camera_size: 2-tuple with the image width and height in pixels.
33 | -filter_std: The standard deviation of the Gaussian that is
34 | added at each particle's projected pixel location.
35 | -filter_scale: Before adding the Gaussian for an individual
36 | particle, it is scaled by this value.
37 | """
38 | super(ParticleProjection, self).__init__()
39 |
40 | self.camera_size = ec.make_list(camera_size, 2, "camera_size",
41 | "%s > 0", "isinstance(%s, numbers.Integral)")
42 |
43 | self.camera_fl = ec.check_conditions(camera_fl, "camera_fl",
44 | "%s > 0", "isinstance(%s, numbers.Real)")
45 | self.filter_std = ec.check_conditions(filter_std, "filter_std",
46 | "%s > 0", "isinstance(%s, numbers.Real)")
47 | self.filter_scale = ec.check_conditions(filter_scale, "filter_scale",
48 | "%s > 0", "isinstance(%s, numbers.Real)")
49 |
50 | self.register_buffer("empty_depth_mask",
51 | torch.ones(1, self.camera_size[1], self.camera_size[0])*MAX_FLOAT)
52 |
53 | def _rotationMatrixFromQuaternion(self, quat):
54 | """
55 | 1 - 2*qy2 - 2*qz2 2*qx*qy - 2*qz*qw 2*qx*qz + 2*qy*qw
56 | 2*qx*qy + 2*qz*qw 1 - 2*qx2 - 2*qz2 2*qy*qz - 2*qx*qw
57 | 2*qx*qz - 2*qy*qw 2*qy*qz + 2*qx*qw 1 - 2*qx2 - 2*qy2
58 | """
59 | quat = quat.data
60 | qx = quat[:, 0]
61 | qy = quat[:, 1]
62 | qz = quat[:, 2]
63 | qw = quat[:, 3]
64 | qx2 = qx*qx
65 | qxqy = qx*qy
66 | qxqz = qx*qz
67 | qxqw = qx*qw
68 | qy2 = qy*qy
69 | qyqz = qy*qz
70 | qyqw = qy*qw
71 | qz2 = qz*qz
72 | qzqw = qz*qw
73 | ret = quat.new(quat.size()[0], 3, 3)
74 | ret[:, 0, 0] = 1 - 2*qy2 - 2*qz2
75 | ret[:, 1, 0] = 2*qxqy - 2*qzqw
76 | ret[:, 2, 0] = 2*qxqz + 2*qyqw
77 | ret[:, 0, 1] = 2*qxqy + 2*qzqw
78 | ret[:, 1, 1] = 1 - 2*qx2 - 2*qz2
79 | ret[:, 2, 1] = 2*qyqz - 2*qxqw
80 | ret[:, 0, 2] = 2*qxqz - 2*qyqw
81 | ret[:, 1, 2] = 2*qyqz + 2*qxqw
82 | ret[:, 2, 2] = 1 - 2*qx2 - 2*qy2
83 | return torch.autograd.Variable(ret, requires_grad=False)
84 |
85 | def forward(self, locs, camera_pose, camera_rot, depth_mask=None):
86 | """ Forwad pass for the particle projection. Takes in the set of
87 | particles and outputs an image.
88 |
89 | Arguments:
90 | -locs: A BxNx3 tensor where B is the batch size, N is the number
91 | of particles, and 3 is the dimensionality of the
92 | particles' coordinate space (this layer currently only
93 | supports 3D projections).
94 | -camera_pose: A Bx3 tensor containing the camera translation.
95 | -camera_rot: A Bx4 tensor containing the camera rotation as a
96 | quaternion in xyzw format.
97 | -depth_mask: An optional BxHxW tensor where W and H are the
98 | camera image width and height respectively. If not
99 | None, then this is used to compute occlusions. The
100 | value in each pixel in the depth_mask should be
101 | the distance to the first object. Any particles
102 | further away than that value will not be projected
103 | onto the output image.
104 |
105 | Returns: A BxHxW tensor of the projected particles.
106 | """
107 |
108 | # Error checking.
109 | batch_size = locs.size()[0]
110 | N = locs.size()[1]
111 | ec.check_tensor_dims(locs, "locs", (batch_size, N, 3))
112 | ec.check_tensor_dims(camera_pose, "camera_pose", (batch_size, 3))
113 | ec.check_tensor_dims(camera_rot, "camera_rot", (batch_size, 4))
114 |
115 | if depth_mask is not None:
116 | ec.check_tensor_dims(depth_mask, "depth_mask", (batch_size,
117 | self.camera_size[1], self.camera_size[0]))
118 | depth_mask = depth_mask.contiguous()
119 | else:
120 | if self.empty_depth_mask.size()[0] != batch_size:
121 | self.empty_depth_mask.resize_(
122 | batch_size, self.camera_size[1], self.camera_size[0])
123 | self.empty_depth_mask.fill_(MAX_FLOAT)
124 | depth_mask = torch.autograd.Variable(
125 | self.empty_depth_mask, requires_grad=False)
126 | if locs.is_cuda:
127 | depth_mask = depth_mask.cuda()
128 |
129 | # Let's transform the particles to camera space here.
130 | locs = locs - camera_pose.unsqueeze(1)
131 | # Ensure the rotation quaternion is normalized.
132 | camera_rot = camera_rot / \
133 | torch.sqrt(torch.sum(camera_rot**2, 1, keepdim=True))
134 | # Invert the rotation.
135 | inv = camera_rot.data.new(1, 4)
136 | inv[0, 0] = -1
137 | inv[0, 1] = -1
138 | inv[0, 2] = -1
139 | inv[0, 3] = 1
140 | inv = torch.autograd.Variable(inv, requires_grad=False)
141 | camera_rot = camera_rot*inv
142 | rot = self._rotationMatrixFromQuaternion(camera_rot)
143 | # Rotate the locs into camera space.
144 | try:
145 | # There's a bug that causes this to fail on the first call when using cuda.
146 | # To fix that, just call it again.
147 | locs = torch.bmm(locs, rot)
148 | except RuntimeError:
149 | locs = torch.bmm(locs, rot)
150 |
151 | locs = locs.contiguous()
152 | proj = _ParticleProjectionFunction(self.camera_fl, self.camera_size, self.filter_std,
153 | self.filter_scale)
154 | ret = proj(locs, depth_mask)
155 | return ret
156 |
157 |
158 | """
159 |
160 | INTERNAL FUNCTIONS
161 |
162 | """
163 |
164 |
165 | class _ParticleProjectionFunction(torch.autograd.Function):
166 |
167 | def __init__(self, camera_fl, camera_size, filter_std, filter_scale):
168 | super(_ParticleProjectionFunction, self).__init__()
169 | self.camera_fl = camera_fl
170 | self.camera_size = camera_size
171 | self.filter_std = filter_std
172 | self.filter_scale = filter_scale
173 |
174 | def forward(self, locs, depth_mask):
175 | self.save_for_backward(locs, depth_mask)
176 | batch_size = locs.size()[0]
177 | ret = locs.new(batch_size, self.camera_size[1], self.camera_size[0])
178 | ret.fill_(0)
179 | if locs.is_cuda:
180 | if not _extc.spnc_particleprojection_forward(locs, self.camera_fl,
181 | self.filter_std, self.filter_scale, depth_mask, ret):
182 | raise Exception("Cuda error")
183 | else:
184 | _ext.spn_particleprojection_forward(locs, self.camera_fl,
185 | self.filter_std, self.filter_scale, depth_mask, ret)
186 |
187 | return ret
188 |
189 | def backward(self, grad_output):
190 | locs, depth_mask = self.saved_tensors
191 | ret_locs = grad_output.new(locs.size())
192 | ret_locs.fill_(0)
193 | ret_depth_mask = grad_output.new(depth_mask.size())
194 | ret_depth_mask.fill_(0)
195 | if grad_output.is_cuda:
196 | if not _extc.spnc_particleprojection_backward(locs,
197 | self.camera_fl, self.filter_std, self.filter_scale, depth_mask, grad_output, ret_locs):
198 | raise Exception("Cuda error")
199 | else:
200 | _ext.spn_particleprojection_backward(locs,
201 | self.camera_fl, self.filter_std, self.filter_scale, depth_mask, grad_output, ret_locs)
202 |
203 | return (ret_locs,
204 | ret_depth_mask,)
205 |
--------------------------------------------------------------------------------
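
As a rough reference for what ParticleProjection computes per particle, here is a NumPy-only sketch: each camera-frame particle is pinhole-projected with the u = x*fl/z + W/2, v = y*fl/z + H/2 convention (the same one used in tests/test_imageprojection.py), and a circular Gaussian of standard deviation filter_std, scaled by filter_scale, is added at that pixel. This is illustrative only; the exact cutoff and normalization used by the compiled kernels are not reproduced here.

import numpy as np

def splat_particles(locs_cam, camera_fl, camera_size, filter_std, filter_scale):
    """Reference-only sketch: pinhole-project camera-frame particles and add a
    Gaussian blob per particle onto an HxW image."""
    width, height = camera_size
    out = np.zeros((height, width), dtype=np.float32)
    # Pixel-center coordinates; ii varies along columns (u), jj along rows (v).
    ii, jj = np.meshgrid(np.arange(width) + 0.5, np.arange(height) + 0.5)
    for x, y, z in locs_cam:
        if z <= 0:
            continue  # behind the camera
        u = x * camera_fl / z + width / 2.0   # projected pixel column
        v = y * camera_fl / z + height / 2.0  # projected pixel row
        out += filter_scale * np.exp(-((ii - u)**2 + (jj - v)**2) /
                                     (2.0 * filter_std**2))
    return out
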
/python/SmoothParticleNets/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from os.path import dirname, basename, isfile
3 | import glob
4 | import sys
5 | sys.path.append(dirname(__file__))
6 | modules = glob.glob(dirname(__file__)+"/*.py")
7 | __all__ = [basename(f)[:-3] for f in modules if isfile(f)]
8 | for f in modules:
9 | if isfile(f) and "__init__" not in f and "install" not in f:
10 | exec('from %s import *' % basename(f)[:-3])
11 |
--------------------------------------------------------------------------------
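
The loop above re-exports every module's public names at the package level, so the layers and kernel tables are used directly off SmoothParticleNets. A small sketch (assuming the extensions have been compiled; the constructor values are arbitrary):

import SmoothParticleNets as spn

print(spn.KERNEL_NAMES)                  # from kernels.py, re-exported here
proj = spn.ParticleProjection(camera_fl=30.0, camera_size=(32, 24),
                              filter_std=1.0, filter_scale=1.0)
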
/python/SmoothParticleNets/error_checking.py:
--------------------------------------------------------------------------------
1 |
2 | import numbers
3 | import numpy as np
4 |
5 | import torch
6 |
7 | def check_nans(v, name):
8 | if (v != v).data.any():
9 | raise ValueError("Found NaNs in %s" % name)
10 |
11 | def throws_exception(exception_type, func, *args, **kwargs):
12 | try:
13 | func(*args, **kwargs)
14 | return False
15 | except exception_type:
16 | return True
17 |
18 | def check_conditions(v, name, *conditions):
19 | for condition in conditions:
20 | if not eval(condition % "v"):
21 | raise ValueError(("%s must meet the following condition: " + condition)
22 | % (name, name))
23 | return v
24 |
25 | def make_list(l, length, name, *conditions):
26 | if throws_exception(TypeError, list, l):
27 | l = [l]*length
28 | else:
29 | l = list(l)
30 | if len(l) != length:
31 | raise ValueError("%s must be a list of length %d." % (name, length))
32 | for i, ll in enumerate(l):
33 | l[i] = check_conditions(ll, name, *conditions)
34 | return l
35 |
36 | def check_tensor_dims(t, name, dims):
37 | s = t.size()
38 | if len(s) != len(dims):
39 | raise ValueError("%s must be a %d-dimensional tensor." % (name, len(dims)))
40 | for i in range(len(dims)):
41 | if dims[i] >= 0 and s[i] != dims[i]:
42 | raise ValueError("The %dth dimension of %s must have size %d, not %d."
43 | % (i, name, dims[i], s[i]))
44 |
45 | def list2tensor(l):
46 | return torch.from_numpy(np.array(l, dtype=np.float32))
--------------------------------------------------------------------------------
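
A small usage sketch of the condition-string helpers above, mirroring how the layer constructors call them (assuming the package imports cleanly once built):

import SmoothParticleNets as spn

# check_conditions substitutes the value into each format-string condition and
# eval()s it inside error_checking's scope, raising ValueError on the first failure:
fl = spn.check_conditions(35.0, "camera_fl",
                          "%s > 0", "isinstance(%s, numbers.Real)")

# make_list broadcasts a scalar to a fixed-length list (or validates a given
# sequence) and applies the same conditions element-wise:
ksize = spn.make_list(3, 2, "kernel_size",
                      "%s > 0", "isinstance(%s, numbers.Integral)")
print(fl, ksize)   # 35.0 [3, 3]
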
/python/SmoothParticleNets/kernels.py:
--------------------------------------------------------------------------------
1 |
2 | KERNELS = {}
3 | DKERNELS = {}
4 |
5 |
6 | """ DEFAULT:
7 | \eta * \sigma * max(0, H^2 - d^2)^3
8 | H = radius
9 | d = distance
10 | \sigma = 1/pi (dim norm)
11 | \eta = 315/(64*H^9) (norm)
12 | """
13 | KERNELS["default"] = (
14 | "(315.0f/(64.0f*M_PI*H*H*H*H*H*H*H*H*H))*(H*H-d*d)*(H*H-d*d)*(H*H-d*d)")
15 |
16 | """ DDEFAULT:
17 | \eta * \sigma * d * max(0, H^2 - d^2)^2
18 | H = radius
19 | d = distance
20 | \sigma = 1/pi (dim norm)
21 | \eta = -945/(32*H^9) (norm)
22 | """
23 | KERNELS["ddefault"] = "(-945.0f/(32.0f*M_PI*H*H*H*H*H*H*H*H*H))*(H*H-d*d)*(H*H-d*d)*d"
24 | DKERNELS["default"] = KERNELS["ddefault"]
25 |
26 | """ DDEFAULT2:
27 | \eta * \sigma * (H^4 - 6*H^2*d^2 + 5*d^4)
28 | H = radius
29 | d = distance
30 | \sigma = 1/pi (dim norm)
31 | \eta = -945/(32*H^9) (norm)
32 | """
33 | KERNELS["ddefault2"] = "(-945.0f/(32.0f*M_PI*H*H*H*H*H*H*H*H*H))*(H*H*H*H - 6*H*H*d*d + 5*d*d*d*d)"
34 | DKERNELS["ddefault"] = KERNELS["ddefault2"]
35 | DKERNELS["ddefault2"] = "(-945.0f/(32.0f*M_PI*H*H*H*H*H*H*H*H*H))*(20*d*d*d - 12*H*H*d)"
36 |
37 | """ PRESSURE:
38 | \eta * \sigma * max(0, H - d)^3
39 | H = radius
40 | d = distance
41 | \sigma = 1/pi (dim norm)
42 | \eta = 15/(H^6) (norm)
43 | """
44 | KERNELS["pressure"] = "(15.0f/(M_PI*H*H*H*H*H*H))*(H-d)*(H-d)*(H-d)"
45 |
46 | """ DPRESSURE:
47 | \eta * \sigma * max(0, H - d)^2
48 | H = radius
49 | d = distance
50 | \sigma = 1/pi (dim norm)
51 | \eta = -45/(H^6) (norm)
52 | """
53 | KERNELS["dpressure"] = "(-45.0f/(M_PI*H*H*H*H*H*H))*(H-d)*(H-d)"
54 | DKERNELS["pressure"] = KERNELS["dpressure"]
55 |
56 | """ DPRESSURE2:
57 | \eta * \sigma * max(0, H - d)
58 | H = radius
59 | d = distance
60 | \sigma = 1/pi (dim norm)
61 | \eta = 90/(H^6) (norm)
62 | """
63 | KERNELS["dpressure2"] = "(90.0f/(M_PI*H*H*H*H*H*H))*(H-d)"
64 | DKERNELS["dpressure"] = KERNELS["dpressure2"]
65 | DKERNELS["dpressure2"] = "(-90.0f/(M_PI*H*H*H*H*H*H))"
66 |
67 | """ INDIRECT:
68 | H - d
69 | H = radius
70 | d = distance
71 | """
72 | KERNELS["indirect"] = "H - d"
73 | DKERNELS["indirect"] = "-1.0f"
74 |
75 | """ CONSTANT:
76 | 1
77 | """
78 | KERNELS["constant"] = "1.0f"
79 | DKERNELS["constant"] = "0.0f"
80 |
81 | """ SPIKY:
82 | \eta * \sigma * (1 - d/H)^2
83 | H = radius
84 | d = distance
85 | \sigma = 1/pi (dim norm)
86 | \eta = 15/(H^3) (norm)
87 | """
88 | KERNELS["spiky"] = "15.0f/(M_PI*H*H*H)*(1.0f-d/H)*(1.0f-d/H)"
89 |
90 | """ DSPIKY:
91 | \eta * \sigma * 2 * (1 - d/H)/H
92 | H = radius
93 | d = distance
94 | \sigma = 1/pi (dim norm)
95 | \eta = -15/(H^3) (norm)
96 | """
97 | KERNELS["dspiky"] = "-15.0f/(M_PI*H*H*H)*2.0f*(1.0f - d/H)/H"
98 | DKERNELS["spiky"] = KERNELS["dspiky"]
99 | DKERNELS["dspiky"] = "-15.0f/(M_PI*H*H*H)*2.0f*(-1.0f/H)/H"
100 |
101 | """ COHESION:
102 | -(1.0f + \eta)/\eta^2*(d/H)^3 + (\eta^2 + \eta + 1)/\eta^2*(d/H)^2 - 1
103 |
104 | H = radius
105 | d = distance
106 | \eta = 0.5 (rest)
107 | """
108 | KERNELS["cohesion"] = "-6.0f*(d/H)*(d/H)*(d/H) + 7*(d/H)*(d/H) - 1"
109 | DKERNELS["cohesion"] = "2.0f*d*(7.0f*H - 9.0f*d)/(H*H*H)"
110 |
111 | """ SIGMOID:
112 | 1/(1 + exp((d - C*H)*S/H))
113 | H = radius
114 | d = distance
115 | S = 20 (sharpness)
116 | C = 0.2 (center ratio)
117 | """
118 | KERNELS["sigmoid"] = "1.0f/(1.0f + expf((d - 0.2f*H)*20.0f/H))"
119 | # -S*expf((d - C*H)*S/H)/(H*(expf((d - C*H)*S/H) + 1.0f)*(expf((d - C*H)*S/H) + 1.0f))
120 | DKERNELS["sigmoid"] = ("-20.0f*expf((d - 0.2f*H)*20.0f/H)/" +
121 | "(H*(expf((d - 0.2f*H)*20.0f/H) + 1.0f)*(expf((d - 0.2f*H)*20.0f/H) + 1.0f))")
122 |
123 | KERNEL_NAMES = sorted(KERNELS.keys())
124 |
125 | import math
126 | KERNEL_FN = {k : eval("lambda d, H: " + v
127 | .replace("M_PI", "math.pi")
128 | .replace("fmaxf", "max")
129 | .replace("expf", "math.exp")
130 | .replace("f", ""))
131 | for k,v in KERNELS.items()}
--------------------------------------------------------------------------------
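
Because KERNEL_FN translates the C kernel expressions above into Python lambdas, the DKERNELS entries can be spot-checked numerically against a finite difference of their KERNELS counterparts. A small sketch (assuming the package is importable; DKERNELS is re-exported by __init__.py just like KERNELS):

import math
import SmoothParticleNets as spn

H, d = 1.0, 0.3
print(spn.KERNEL_FN["spiky"](d, H))   # evaluates 15/(pi*H^3)*(1 - d/H)^2

# Build a lambda from the DKERNELS string the same way KERNEL_FN does from KERNELS:
dkernel = eval("lambda d, H: " + spn.DKERNELS["spiky"]
               .replace("M_PI", "math.pi")
               .replace("fmaxf", "max")
               .replace("expf", "math.exp")
               .replace("f", ""))

# Central finite difference of the kernel; the two values should agree closely.
eps = 1e-6
fd = (spn.KERNEL_FN["spiky"](d + eps, H) - spn.KERNEL_FN["spiky"](d - eps, H)) / (2 * eps)
print(dkernel(d, H), fd)
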
/setup.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import sys
4 |
5 | from setuptools import setup
6 | import torch
7 | from torch.utils.cpp_extension import CppExtension, BuildExtension, CUDAExtension
8 |
9 | # Parse command line args.
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--with_cuda', action="store_true", default=None)
12 | parser.add_argument('--without_cuda', action="store_true", default=None)
13 | args, unknown = parser.parse_known_args()
14 | sys.argv = sys.argv[:2] + unknown
15 |
16 | if args.with_cuda is None:
17 | if args.without_cuda is not None:
18 | args.with_cuda = not args.without_cuda
19 | else:
20 | print("--with_cuda or --without_cuda not specified, using PyTorch to decide...")
21 | args.with_cuda = torch.cuda.is_available()
22 | if args.with_cuda:
23 | print("torch.cuda.is_available says True, proceeding to build with cuda.")
24 | else:
25 | print("torch.cuda.is_available says False, proceeding to build without cuda.")
26 |
27 |
28 | # Setup global variables.
29 | root_dir = os.path.dirname(os.path.abspath(__file__))
30 | test_dir = os.path.join(root_dir, "tests")
31 | py_dir = os.path.join(root_dir, "python", "SmoothParticleNets")
32 | src_dir = os.path.join(root_dir, "src")
33 |
34 | # Create pytest args.
35 | pytest_args = {
36 | 'with_cuda': args.with_cuda,
37 | }
38 | fp = open(os.path.join(test_dir, "pytest_args.py"), "w")
39 | for k, v in pytest_args.items():
40 | if isinstance(v, str):
41 | v = "'" + v + "'"
42 | fp.write("%s = %s\n" % (k, str(v)))
43 | fp.close()
44 |
45 | # Build kernel_constants.h
46 | # Add path to python source to path.
47 | sys.path.append(py_dir)
48 | from kernels import KERNELS, KERNEL_NAMES, DKERNELS
49 | fp = open(os.path.join(src_dir, "kernel_constants.h"), "w")
50 | fp.write("// THIS FILE IS AUTOGENERATED. DO NOT ALTER.\n")
51 | fp.write("#ifndef __kernel_constants_h__\n")
52 | fp.write("#define __kernel_constants_h__\n")
53 | fp.write("#ifdef __cplusplus\n")
54 | fp.write("extern \"C\" {\n")
55 | fp.write("#endif\n")
56 | fp.write("\n")
57 | fp.write("#include \n")
58 | fp.write("#include \n")
59 | fp.write("\n")
60 | fp.write("#ifdef CUDA\n")
61 | fp.write("__host__ __device__\n")
62 | fp.write("#endif\n")
63 | fp.write("inline\n")
64 | fp.write("float KERNEL_W(float d, float H, int fn) {\n")
65 | fp.write(" float ret = 0.0f;\n")
66 | for i, k in enumerate(KERNEL_NAMES):
67 | fp.write(" if(fn == %d) { ret = (%s); }\n" % (i, KERNELS[k]))
68 | fp.write(" return ret;\n")
69 | fp.write("}\n\n")
70 | fp.write("#ifdef CUDA\n")
71 | fp.write("__host__ __device__\n")
72 | fp.write("#endif\n")
73 | fp.write("inline\n")
74 | fp.write("float KERNEL_DW(float d, float H, int fn) {\n")
75 | fp.write(" float ret = 0.0f;\n")
76 | for i, k in enumerate(KERNEL_NAMES):
77 | fp.write(" if(fn == %d) { ret = (%s); }\n" % (i, DKERNELS[k]))
78 | fp.write(" return ret;\n")
79 | fp.write("}\n\n")
80 | fp.write("#define VALIDATE_KERNEL_ID(fn) (fn >= 0 && fn < %d)" % len(KERNELS))
81 | fp.write("\n")
82 | fp.write("#ifdef __cplusplus\n")
83 | fp.write("}\n")
84 | fp.write("#endif\n")
85 | fp.write("#endif\n")
86 | fp.flush()
87 | fp.close()
88 |
89 | # Define extensions.
90 | ext_modules = [
91 | CppExtension('SmoothParticleNets._ext', [
92 | os.path.join(src_dir, 'cpu_layer_funcs.cpp'),
93 | ]),
94 | ]
95 | if args.with_cuda:
96 | ext_modules.append(CUDAExtension('SmoothParticleNets._extc', [
97 | os.path.join(src_dir, 'cuda_layer_funcs.cpp'),
98 | os.path.join(src_dir, 'gpu_kernels.cu'),
99 | ]))
100 |
101 | # The main setup call.
102 | setup(
103 | name='SmoothParticleNets',
104 | package_dir={'': 'python'},
105 | packages=['SmoothParticleNets'],
106 | ext_modules=ext_modules,
107 | cmdclass={
108 | 'build_ext': BuildExtension
109 | })
110 |
--------------------------------------------------------------------------------
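
For reference, the per-kernel dispatch line that the loop above writes into KERNEL_W in src/kernel_constants.h can be reproduced outside the build like this (a sketch, assuming it is run from the repository root):

import os
import sys

sys.path.append(os.path.join("python", "SmoothParticleNets"))
from kernels import KERNELS, KERNEL_NAMES

# Kernel ids follow the sorted order of KERNEL_NAMES.
i = KERNEL_NAMES.index("constant")
print("if(fn == %d) { ret = (%s); }" % (i, KERNELS["constant"]))
# prints: if(fn == 1) { ret = (1.0f); }
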
/src/constants.h:
--------------------------------------------------------------------------------
1 | #ifndef __constants_h__
2 | #define __constants_h__
3 | #ifdef __cplusplus
4 | extern "C" {
5 | #endif
6 |
7 | #define MAX_CARTESIAN_DIM 20
8 |
9 | #ifdef __cplusplus
10 | }
11 | #endif
12 |
13 | #endif
--------------------------------------------------------------------------------
/src/gpu_kernels.h:
--------------------------------------------------------------------------------
1 | #ifndef __gpu_kernels_h__
2 | #define __gpu_kernels_h__
3 | #ifdef __cplusplus
4 | extern "C" {
5 | #endif
6 |
7 | int cuda_convsp(
8 | const float* qlocs,
9 | const float* locs,
10 | const float* data,
11 | const float* neighbors,
12 | const float* weight,
13 | const float* bias,
14 | const int batch_size,
15 | const int M,
16 | const int N,
17 | const int nchannels,
18 | const int ndims,
19 | const int max_neighbors,
20 | const int nkernels,
21 | const int ncells,
22 | const float radius,
23 | const float* kernel_size,
24 | const float* dilation,
25 | const int dis_norm,
26 | const int kernel_fn,
27 | float* out,
28 | float* dqlocs,
29 | float* dlocs,
30 | float* ddata,
31 | float* dweight,
32 | cudaStream_t stream,
33 | const size_t nshared_device_mem);
34 |
35 | int cuda_convsdf(
36 | const float* locs,
37 | const int batch_size,
38 | const int N,
39 | const int ndims,
40 | const float* idxs,
41 | const float* poses,
42 | const float* scales,
43 | const int M,
44 | const int pose_len,
45 | const float* sdfs,
46 | const float* sdf_offsets,
47 | const float* sdf_shapes,
48 | const float* weight,
49 | const float* bias,
50 | const int nkernels,
51 | const int ncells,
52 | const float* kernel_size,
53 | const float* dilation,
54 | const float max_distance,
55 | float* out,
56 | float* dlocs,
57 | float* dweight,
58 | float* dposes,
59 | cudaStream_t stream);
60 |
61 | size_t GetSharedMemPerBlock(int device);
62 |
63 | int cuda_hashgrid_order(
64 | float* locs,
65 | const float* low,
66 | const float* grid_dims,
67 | float* cellIDs,
68 | float* idxs,
69 | float* buffer,
70 | const int batch_size,
71 | const int N,
72 | const int ndims,
73 | const float cellEdge,
74 | cudaStream_t stream);
75 |
76 | int cuda_compute_collisions(
77 | const float* qlocs,
78 | const float* locs,
79 | const float* low,
80 | const float* grid_dims,
81 | const float* cellIDs,
82 | float* cellStarts,
83 | float* cellEnds,
84 | float* collisions,
85 | const int batch_size,
86 | const int M,
87 | const int N,
88 | const int ndims,
89 | const int max_collisions,
90 | const int ncells,
91 | const float cellEdge,
92 | const float radius,
93 | const int include_self,
94 | cudaStream_t stream);
95 |
96 | int cuda_reorder_data(
97 | float* locs,
98 | float* data,
99 | float* idxs,
100 | float* nlocs,
101 | float* ndata,
102 | const int batch_size,
103 | const int N,
104 | const int ndims,
105 | const int nchannels,
106 | const int reverse,
107 | cudaStream_t stream);
108 |
109 | size_t get_radixsort_buffer_size(cudaStream_t stream);
110 |
111 | int cuda_particleprojection(
112 | const float* locs,
113 | const float camera_fl,
114 | const float filter_std,
115 | const float filter_scale,
116 | const float* depth_mask,
117 | const int batch_size,
118 | const int N,
119 | const int width,
120 | const int height,
121 | float* out,
122 | float* dlocs,
123 | cudaStream_t stream);
124 |
125 | int cuda_imageprojection(
126 | const float* locs,
127 | const float* image,
128 | const float camera_fl,
129 | const float* depth_mask,
130 | const int batch_size,
131 | const int N,
132 | const int width,
133 | const int height,
134 | const int channels,
135 | float* out,
136 | float* dlocs,
137 | float* dimage,
138 | cudaStream_t stream);
139 |
140 | #ifdef __cplusplus
141 | }
142 | #endif
143 |
144 | #endif
--------------------------------------------------------------------------------
/tests/test_convsp.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | # Add path to python source to path.
4 | sys.path.append(os.path.join(os.path.dirname(
5 | os.path.dirname(os.path.abspath(__file__))), "python"))
6 | import SmoothParticleNets as spn
7 |
8 | import itertools
9 | import numpy as np
10 | import torch
11 | import torch.autograd
12 |
13 | from gradcheck import gradcheck
14 | try:
15 | import pytest_args
16 | except ImportError:
17 | print("Make sure to compile SmoothParticleNets before running tests.")
18 | raise
19 |
20 |
21 | def pyconvsp(qlocs, locs, data, weights, biases, kernel_fn, KERNEL_SIZE, RADIUS, DILATION, NKERNELS):
22 | w = spn.KERNEL_FN[kernel_fn]
23 |
24 | BATCH_SIZE = locs.shape[0]
25 | M = qlocs.shape[1]
26 | N = locs.shape[1]
27 | NDIM = locs.shape[-1]
28 |
29 | kernel_centers = (np.array(KERNEL_SIZE) - 1)/2
30 | ground_truth = np.zeros((BATCH_SIZE, M, NKERNELS), dtype=data.dtype)
31 | for b in range(BATCH_SIZE):
32 | for i in range(M):
33 | for j in range(N):
34 | dd = np.square(qlocs[b, i, :] - locs[b, j, :]).sum()
35 | nr = DILATION*max(KERNEL_SIZE)/2 + RADIUS
36 | if dd > nr*nr:
37 | continue
38 | for k, idxs in enumerate(itertools.product(*[range(x) for x in KERNEL_SIZE[::-1]])):
39 | dd = np.square(qlocs[b, i, :] + (idxs[::-1] - kernel_centers)*DILATION
40 | - locs[b, j, :]).sum()
41 | if dd > RADIUS*RADIUS:
42 | continue
43 | ground_truth[b, i, :] += weights[:, :, k].dot(
44 | w(np.sqrt(dd), RADIUS)*data[b, j, :])
45 | ground_truth += biases[np.newaxis, np.newaxis, :]
46 | return ground_truth
47 |
48 |
49 | def test_convsp(cpu=True, cuda=True):
50 | if cpu:
51 | print("Testing CPU implementation of ConvSP...")
52 | eval_convsp(cuda=False)
53 | print("CPU implementation passed!")
54 | print("")
55 |
56 | if cuda:
57 | if pytest_args.with_cuda:
58 | print("Testing CUDA implementation of ConvSP...")
59 | eval_convsp(cuda=True)
60 | print("CUDA implementation passed!")
61 | else:
62 | print("Not compiled with CUDA, skipping CUDA test.")
63 |
64 |
65 | def eval_convsp(cuda=False):
66 | BATCH_SIZE = 2
67 | N = 5
68 | M = 3
69 | NDIM = 2
70 | KERNEL_SIZE = (3, 1)
71 | RADIUS = 1.0
72 | DILATION = 0.05
73 | NCHANNELS = 2
74 | NKERNELS = 3
75 |
76 | np.random.seed(0)
77 |
78 | locs = np.random.rand(BATCH_SIZE, N, NDIM).astype(np.float32)
79 | qlocs = np.random.rand(BATCH_SIZE, M, NDIM).astype(np.float32)
80 | data = np.random.rand(BATCH_SIZE, N, NCHANNELS).astype(np.float32)
81 | weights = np.random.rand(NKERNELS, NCHANNELS, np.prod(
82 | KERNEL_SIZE)).astype(np.float32)
83 | biases = np.random.rand(NKERNELS).astype(np.float32)
84 |
85 | def use_cuda(x):
86 | if cuda:
87 | return x.cuda()
88 | else:
89 | return x
90 |
91 | def undo_cuda(x):
92 | if cuda:
93 | return x.cpu()
94 | else:
95 | return x
96 |
97 | for use_qlocs in (True, False):
98 |
99 | locs_t = torch.autograd.Variable(
100 | use_cuda(torch.FloatTensor(locs)), requires_grad=True)
101 | if use_qlocs:
102 | qlocs_t = torch.autograd.Variable(
103 | use_cuda(torch.FloatTensor(qlocs)), requires_grad=True)
104 | else:
105 | qlocs_t = None
106 | data_t = torch.autograd.Variable(
107 | use_cuda(torch.FloatTensor(data)), requires_grad=True)
108 | weights_t = torch.nn.Parameter(
109 | torch.FloatTensor(weights), requires_grad=True)
110 | biases_t = torch.nn.Parameter(
111 | torch.FloatTensor(biases), requires_grad=True)
112 |
113 | coll = use_cuda(spn.ParticleCollision(NDIM,
114 | RADIUS + DILATION*max((k - 1)/2 for k in KERNEL_SIZE)))
115 | locs_t, data_t, idxs_t, neighbors_t = coll(
116 | locs_t, data_t, (qlocs_t if use_qlocs else None))
117 |
118 | for kernel_fn in spn.KERNEL_NAMES:
119 | print("\tTesting kernel %s (%s query locations)..." %
120 | (kernel_fn, "with" if use_qlocs else "without"))
121 | ground_truth = pyconvsp((qlocs if use_qlocs else locs), locs, data, weights, biases,
122 | kernel_fn, KERNEL_SIZE, RADIUS, DILATION, NKERNELS)
123 |
124 | convsp = spn.ConvSP(NCHANNELS, NKERNELS, NDIM, KERNEL_SIZE, DILATION, RADIUS,
125 | kernel_fn=kernel_fn)
126 | convsp.weight = weights_t
127 | convsp.bias = biases_t
128 | convsp = use_cuda(convsp)
129 |
130 | pred_t = undo_cuda(convsp(locs_t, data_t, neighbors_t, qlocs_t))
131 | np.testing.assert_array_almost_equal(
132 | pred_t.data.numpy(), ground_truth, decimal=3)
133 |
134 | dt = torch.autograd.Variable(data_t.data, requires_grad=True)
135 | lt = torch.autograd.Variable(locs_t.data, requires_grad=True)
136 | if use_qlocs:
137 | qt = torch.autograd.Variable(qlocs_t.data, requires_grad=True)
138 | wt = torch.nn.Parameter(weights_t.data, requires_grad=True)
139 | bt = torch.nn.Parameter(biases_t.data, requires_grad=True)
140 | # Use pyconvsp to allow for double precision when computing numeric grads.
141 |
142 | def func_numerical(l, d, w, b, q=None):
143 | return (torch.autograd.Variable(torch.from_numpy(
144 | pyconvsp((q.data.cpu().numpy() if use_qlocs else l.data.cpu().numpy()),
145 | l.data.cpu().numpy(),
146 | d.data.cpu().numpy(), w.data.cpu().numpy(), b.data.cpu().numpy(),
147 | kernel_fn, KERNEL_SIZE, RADIUS, DILATION, NKERNELS))),)
148 |
149 | def func_analytical(l, d, w, b, q=None):
150 | convsp.weight = w
151 | convsp.bias = b
152 | return (convsp(l, d, neighbors_t, (q if use_qlocs else None)),)
153 | assert gradcheck(func_analytical,
154 | ((lt, dt, wt, bt, qt)
155 | if use_qlocs else (lt, dt, wt, bt,)),
156 | eps=1e-4, atol=1e-3, rtol=1e-1, func_numerical=func_numerical, use_double=True)
157 |
158 |
159 | if __name__ == '__main__':
160 | import argparse
161 | parser = argparse.ArgumentParser()
162 | parser.add_argument('--cpu', dest='cpu', action="store_true", default=True)
163 | parser.add_argument('--no-cpu', dest='cpu', action="store_false")
164 | parser.add_argument('--cuda', dest='cuda',
165 | action="store_true", default=True)
166 | parser.add_argument('--no-cuda', dest='cuda', action="store_false")
167 | args = parser.parse_args()
168 | test_convsp(cpu=args.cpu, cuda=args.cuda)
169 |
--------------------------------------------------------------------------------
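
In the notation of the pyconvsp reference above (weights of shape NKERNELS x NCHANNELS x prod(KERNEL_SIZE), with each kernel-cell offset taken relative to the kernel center), the output it computes for batch b, query location i, and output kernel k is

out[b,i,k] = bias[k] + sum_j sum_o sum_c weight[k,c,o] * W(||qlocs[b,i] + (idx_o - center)*DILATION - locs[b,j]||, RADIUS) * data[b,j,c]

where o runs over the kernel cells, W is the smoothing kernel selected from kernels.py, and W is taken to be zero beyond RADIUS (the dd > RADIUS*RADIUS check plays the role of the max(0, .) clamp in the kernel docstrings).
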
/tests/test_imageprojection.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | # Add path to python source to path.
4 | sys.path.append(os.path.join(os.path.dirname(
5 | os.path.dirname(os.path.abspath(__file__))), "python"))
6 | import SmoothParticleNets as spn
7 |
8 | import itertools
9 | import numpy as np
10 | import torch
11 | import torch.autograd
12 |
13 | from gradcheck import gradcheck
14 | from test_convsdf import quaternionMult, quaternionConjugate
15 | from regular_grid_interpolater import RegularGridInterpolator
16 | try:
17 | import pytest_args
18 | except ImportError:
19 | print("Make sure to compile SmoothParticleNets before running tests.")
20 | raise
21 |
22 |
23 | def pyproject(locs, image, camera_fl, camera_pose,
24 | camera_rot, depth_mask=None, dtype=np.float32):
25 | batch_size = locs.shape[0]
26 | N = locs.shape[1]
27 | channels = image.shape[1]
28 | width = image.shape[3]
29 | height = image.shape[2]
30 | ret = np.zeros((batch_size, N, channels), dtype=dtype)
31 | if depth_mask is None:
32 | depth_mask = np.ones((batch_size, height, width),
33 | dtype=dtype)*np.finfo(np.float32).max
34 | depth_fns = [RegularGridInterpolator(
35 | [np.arange(0.5, width, 1), np.arange(0.5, height, 1)],
36 | depth_mask[b, ...].transpose(), bounds_error=False, fill_value=np.finfo(np.float32).max)
37 | for b in range(batch_size)]
38 | for b in range(batch_size):
39 | r = locs[b, ...] - camera_pose[b, ...]
40 | r = np.concatenate((r, np.zeros((N, 1), dtype=r.dtype)), axis=-1)
41 | r = np.array([quaternionMult(quaternionConjugate(camera_rot[b, :]),
42 | quaternionMult(r[i, ...], camera_rot[b, :])) for i in range(N)], dtype=dtype)
43 | ijs = np.concatenate((
44 | r[:, 0:1]*camera_fl/r[:, 2:3] + width/2.0,
45 | r[:, 1:2]*camera_fl/r[:, 2:3] + height/2.0,
46 | ), axis=-1)
47 | depths = depth_fns[b](ijs)
48 | mask = (r[:, 2] <= depths)*(r[:, 2] > 0)
49 | for c in range(channels):
50 | fn = RegularGridInterpolator(
51 | [np.arange(0.5, width, 1), np.arange(0.5, height, 1)],
52 | image[b, c, ...].transpose(), bounds_error=False, fill_value=0)
53 | ret[b, :, c] = fn(ijs)*mask
54 |
55 | return ret
56 |
57 |
58 | def test_imageprojection(cpu=True, cuda=True):
59 | if cpu:
60 | print("Testing CPU implementation of ImageProjection...")
61 | eval_imageprojection(cuda=False)
62 | print("CPU implementation passed!")
63 | print("")
64 |
65 | if cuda:
66 | if pytest_args.with_cuda:
67 | print("Testing CUDA implementation of ImageProjection...")
68 | eval_imageprojection(cuda=True)
69 | print("CUDA implementation passed!")
70 | else:
71 | print("Not compiled with CUDA, skipping CUDA test.")
72 |
73 |
74 | def eval_imageprojection(cuda=False):
75 | np.random.seed(1)
76 | BATCH_SIZE = 2
77 | N = 5
78 | CHANNELS = 2
79 | CAMERA_FOV = 45.0/180.0*np.pi
80 | CAMERA_SIZE = (30, 30)
81 | CAMERA_FL = CAMERA_SIZE[0]/2/(CAMERA_FOV/2.0)
82 | CAMERA_POSE = 5.0*(np.random.rand(BATCH_SIZE, 3).astype(np.float32) - 0.5)
83 | CAMERA_TARGET = np.array([(0.0, 0.0, 0.0)]*BATCH_SIZE, dtype=np.float32)
84 |
85 | CAMERA_ROT = np.zeros((BATCH_SIZE, 4), dtype=np.float32)
86 | for b in range(BATCH_SIZE):
87 | CAMERA_ROT[b, :] = pointAt(
88 | CAMERA_POSE[b, :], np.array([0, 0, 0], dtype=np.float32))
89 |
90 | locs = 2.0*(np.random.rand(BATCH_SIZE, N, 3).astype(np.float32) - 0.5)
91 | image = np.random.rand(BATCH_SIZE, CHANNELS,
92 | CAMERA_SIZE[1], CAMERA_SIZE[0])
93 | depth_mask = np.ones((BATCH_SIZE, CAMERA_SIZE[1], CAMERA_SIZE[0]),
94 | dtype=np.float32)*np.finfo(np.float32).max
95 | ir = (int(CAMERA_SIZE[0]/2 - CAMERA_SIZE[0]*0.2),
96 | int(CAMERA_SIZE[0]/2 + CAMERA_SIZE[0]*0.2) + 1)
97 | jr = (int(CAMERA_SIZE[1]/2 - CAMERA_SIZE[1]*0.2),
98 | int(CAMERA_SIZE[1]/2 + CAMERA_SIZE[1]*0.2) + 1)
99 | ul = 0.0
100 | lr = 10.0
101 | ur = 5.0
102 | ll = 3.5
103 | for i in range(ir[0], ir[1]):
104 | for j in range(jr[0], jr[1]):
105 | ii = 1.0*(i - ir[0])/(ir[1] - ir[0])
106 | jj = 1.0*(j - jr[0])/(jr[1] - jr[0])
107 | l = ul*(1 - jj) + ll*jj
108 | r = ur*(1 - jj) + lr*jj
109 | depth_mask[0, j, i] = l*(1 - ii) + r*ii
110 |
111 | def use_cuda(x):
112 | if cuda:
113 | return x.cuda()
114 | else:
115 | return x
116 |
117 | def undo_cuda(x):
118 | if cuda:
119 | return x.cpu()
120 | else:
121 | return x
122 |
123 | def np2var(t):
124 | return torch.autograd.Variable(use_cuda(torch.from_numpy(t)), requires_grad=False)
125 |
126 | locs_t = torch.autograd.Variable(
127 | use_cuda(torch.FloatTensor(locs)), requires_grad=True)
128 | image_t = torch.autograd.Variable(
129 | use_cuda(torch.FloatTensor(image)), requires_grad=True)
130 | depth_mask_t = torch.autograd.Variable(
131 | use_cuda(torch.FloatTensor(depth_mask)), requires_grad=False)
132 | camera_pose_t = torch.autograd.Variable(use_cuda(torch.FloatTensor(CAMERA_POSE)),
133 | requires_grad=False)
134 | camera_rot_t = torch.autograd.Variable(use_cuda(torch.FloatTensor(CAMERA_ROT)),
135 | requires_grad=False)
136 |
137 | imageProjection = spn.ImageProjection(CAMERA_FL)
138 |
139 | ground_truth = pyproject(locs, image, CAMERA_FL,
140 | CAMERA_POSE, CAMERA_ROT, depth_mask)
141 | pred_t = imageProjection(
142 | locs_t, image_t, camera_pose_t, camera_rot_t, depth_mask_t)
143 | pred = undo_cuda(pred_t).data.numpy()
144 | np.testing.assert_array_almost_equal(pred, ground_truth, decimal=3)
145 |
146 | # Use pyproject to allow for double precision when computing numeric grads.
147 | def func_numerical(l, i):
148 | ll = undo_cuda(l).data.numpy()
149 | ii = undo_cuda(i).data.numpy()
150 | return torch.autograd.Variable(use_cuda(torch.from_numpy(pyproject(ll, ii, CAMERA_FL, CAMERA_POSE,
151 | CAMERA_ROT, dtype=np.float64))), requires_grad=False)
152 |
153 | def func_analytical(l, i):
154 | return imageProjection(l, i, camera_pose_t, camera_rot_t)
155 | assert torch.autograd.gradcheck(func_analytical, (locs_t, image_t,),
156 | eps=1e-3, atol=1e-3, rtol=1e-1)
157 |
158 |
159 | def quaternionFromMatrix(matrix):
160 | M = matrix
161 | m00 = M[0, 0]
162 | m01 = M[0, 1]
163 | m02 = M[0, 2]
164 | m10 = M[1, 0]
165 | m11 = M[1, 1]
166 | m12 = M[1, 2]
167 | m20 = M[2, 0]
168 | m21 = M[2, 1]
169 | m22 = M[2, 2]
170 | # symmetric matrix K
171 | K = np.array([[m00-m11-m22, 0.0, 0.0, 0.0],
172 | [m01+m10, m11-m00-m22, 0.0, 0.0],
173 | [m02+m20, m12+m21, m22-m00-m11, 0.0],
174 | [m21-m12, m02-m20, m10-m01, m00+m11+m22]])
175 | K /= 3.0
176 | # quaternion is eigenvector of K that corresponds to largest eigenvalue
177 | w, V = np.linalg.eigh(K)
178 | q = V[[3, 0, 1, 2], np.argmax(w)]
179 | if q[0] < 0.0:
180 | np.negative(q, q)
181 | return [q[1], q[2], q[3], q[0]]
182 |
183 |
184 | def pointAt(pose, target):
185 | # Convention: +Z=out of camera, +Y=Down, +X=right
186 | z = target - pose
187 | z /= np.sqrt(np.sum(z**2))
188 | y = np.array([0, -1, 0], dtype=np.float32)
189 | x = np.cross(y, z)
190 | x /= np.sqrt(np.sum(x**2))
191 | y = np.cross(z, x)
192 | ret = quaternionFromMatrix(np.array([x, y, z]).transpose())
193 | return ret
194 |
195 |
196 | if __name__ == '__main__':
197 | import argparse
198 | parser = argparse.ArgumentParser()
199 | parser.add_argument('--cpu', dest='cpu', action="store_true", default=True)
200 | parser.add_argument('--no-cpu', dest='cpu', action="store_false")
201 | parser.add_argument('--cuda', dest='cuda',
202 | action="store_true", default=True)
203 | parser.add_argument('--no-cuda', dest='cuda', action="store_false")
204 | args = parser.parse_args()
205 | test_imageprojection(cpu=args.cpu, cuda=args.cuda)
206 |
--------------------------------------------------------------------------------
/tests/test_particlecollision.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | # Add path to python source to path.
4 | sys.path.append(os.path.join(os.path.dirname(os.path.dirname(
5 | os.path.abspath(__file__))), "python"))
6 | import SmoothParticleNets as spn
7 |
8 | import itertools
9 | import numpy as np
10 | import torch
11 | import torch.autograd
12 |
13 | from gradcheck import gradcheck
14 | try:
15 | import pytest_args
16 | except ImportError:
17 | print("Make sure to compile SmoothParticleNets before running tests.")
18 | raise
19 |
20 |
21 | def test_particlecollision(cpu=True, cuda=True):
22 | if cpu:
23 | print("Testing CPU implementation of ParticleCollision...")
24 | eval_particlecollision(cuda=False)
25 | print("CPU implementation passed!")
26 | print("")
27 |
28 | if cuda:
29 | if pytest_args.with_cuda:
30 | print("Testing CUDA implementation of ParticleCollision...")
31 | eval_particlecollision(cuda=True)
32 | print("CUDA implementation passed!")
33 | else:
34 | print("Not compiled with CUDA, skipping CUDA test.")
35 |
36 | def eval_particlecollision(cuda=False):
37 | BATCH_SIZE = 2
38 | N = 100
39 | M = 77
40 | NDIM = 2
41 | RADIUS = 0.2
42 | NCHANNELS = 2
43 |
44 | np.random.seed(0)
45 |
46 | locs = np.random.rand(BATCH_SIZE, N, NDIM).astype(np.float32)
47 | qlocs = np.random.rand(BATCH_SIZE, M, NDIM).astype(np.float32)
48 | data = np.random.rand(BATCH_SIZE, N, NCHANNELS).astype(np.float32)
49 |
50 | gt_neighbors = np.ones((BATCH_SIZE, M, N), dtype=int)*-1
51 | for b in range(BATCH_SIZE):
52 | for i in range(M):
53 | for j in range(N):
54 | d = np.square(qlocs[b, i, :] - locs[b, j, :]).sum()
55 | if d <= RADIUS*RADIUS:
56 | nc = min(np.where(gt_neighbors[b, i, :] < 0)[0])
57 | gt_neighbors[b, i, nc] = j
58 |
59 | def use_cuda(x):
60 | if cuda:
61 | return x.cuda()
62 | else:
63 | return x
64 | def undo_cuda(x):
65 | if cuda:
66 | return x.cpu()
67 | else:
68 | return x
69 |
70 | olocs = locs
71 | oqlocs = qlocs
72 | odata = data
73 | locs = torch.autograd.Variable(use_cuda(torch.FloatTensor(locs.copy())),
74 | requires_grad=False)
75 | qlocs = torch.autograd.Variable(use_cuda(torch.FloatTensor(qlocs.copy())),
76 | requires_grad=False)
77 | data = torch.autograd.Variable(use_cuda(torch.FloatTensor(data.copy())),
78 | requires_grad=False)
79 |
80 | coll = spn.ParticleCollision(NDIM, RADIUS, max_collisions=N)
81 | coll = use_cuda(coll)
82 |
83 | vlocs, vdata, vidxs, vneighbors = coll(locs, data, qlocs)
84 |
85 | idxs = undo_cuda(vidxs).data.numpy().astype(int)
86 | neighbors = undo_cuda(vneighbors).data.numpy().astype(int)
87 | nlocs = undo_cuda(vlocs).data.numpy()
88 | ndata = undo_cuda(vdata).data.numpy()
89 |
90 | # First make sure all the indexes are in idxs.
91 | for b in range(BATCH_SIZE):
92 | for i in range(N):
93 | assert i in idxs[b, :]
94 |
95 | # Next make sure locs and data are in the order idxs says they're in.
96 | for b in range(BATCH_SIZE):
97 | for i, j in enumerate(idxs[b, :]):
98 | assert all(olocs[b, j, :] == nlocs[b, i, :])
99 | assert all(odata[b, j, :] == ndata[b, i, :])
100 |
101 | # Make sure the input locs and data weren't altered.
102 | assert np.all(undo_cuda(locs).data.numpy() == olocs)
103 | assert np.all(undo_cuda(data).data.numpy() == odata)
104 |
105 | # Check the neighbor list.
106 | for b in range(BATCH_SIZE):
107 | for i in range(M):
108 | for j in neighbors[b, i, :]:
109 | if j < 0:
110 | break
111 | assert idxs[b, j] in gt_neighbors[b, i, :]
112 | for j in gt_neighbors[b, i, :]:
113 | if j < 0:
114 | break
115 | jj = np.where(idxs[b, :] == j)[0][0]
116 | assert jj in neighbors[b, i, :]
117 |
118 | # Finally put the locations and data back in their original order.
119 | reorder = use_cuda(spn.ReorderData(reverse=True))
120 | vlocs, vdata = reorder(vidxs, vlocs, vdata)
121 | assert np.all(undo_cuda(vlocs).data.numpy() == olocs)
122 | assert np.all(undo_cuda(vdata).data.numpy() == odata)
123 |
124 | # Test gradients.
125 | def func(l, d, q):
126 | return coll(l, d, q)[:2]
127 | assert gradcheck(func, (locs, data, qlocs), eps=1e-2, atol=1e-3)
128 |
129 |
130 |
131 | if __name__ == '__main__':
132 | import argparse
133 | parser = argparse.ArgumentParser()
134 | parser.add_argument('--cpu', dest='cpu', action="store_true", default=True)
135 | parser.add_argument('--no-cpu', dest='cpu', action="store_false")
136 | parser.add_argument('--cuda', dest='cuda', action="store_true", default=True)
137 | parser.add_argument('--no-cuda', dest='cuda', action="store_false")
138 | args = parser.parse_args()
139 | test_particlecollision(cpu=args.cpu, cuda=args.cuda)
--------------------------------------------------------------------------------