├── .gitignore ├── LICENSE ├── README.md ├── REQUIREMENTS.txt ├── _config.yml ├── docs ├── convsdf │ ├── README.md │ └── diagram.png ├── convsp │ ├── README.md │ ├── conv_diagram.png │ └── kernel_diagram.png ├── imageprojection │ └── README.md ├── particlecollision │ └── README.md ├── particleprojection │ └── README.md └── reorderdata │ └── README.md ├── examples ├── convsp_example.py ├── fluid_sim.py └── tblogger.py ├── external └── cub-1.3.2 │ └── cub │ ├── block │ ├── block_discontinuity.cuh │ ├── block_exchange.cuh │ ├── block_histogram.cuh │ ├── block_load.cuh │ ├── block_radix_rank.cuh │ ├── block_radix_sort.cuh │ ├── block_raking_layout.cuh │ ├── block_reduce.cuh │ ├── block_scan.cuh │ ├── block_shift.cuh │ ├── block_store.cuh │ └── specializations │ │ ├── block_histogram_atomic.cuh │ │ ├── block_histogram_sort.cuh │ │ ├── block_reduce_raking.cuh │ │ ├── block_reduce_raking_commutative_only.cuh │ │ ├── block_reduce_warp_reductions.cuh │ │ ├── block_scan_raking.cuh │ │ └── block_scan_warp_scans.cuh │ ├── block_range │ ├── block_range_histo.cuh │ ├── block_range_radix_sort_downsweep.cuh │ ├── block_range_radix_sort_upsweep.cuh │ ├── block_range_reduce.cuh │ ├── block_range_reduce_by_key.cuh │ ├── block_range_scan.cuh │ ├── block_range_select.cuh │ ├── block_scan_prefix_operators.cuh │ └── specializations │ │ ├── block_range_histo_gatomic.cuh │ │ ├── block_range_histo_satomic.cuh │ │ └── block_range_histo_sort.cuh │ ├── cub.cuh │ ├── device │ ├── device_histogram.cuh │ ├── device_partition.cuh │ ├── device_radix_sort.cuh │ ├── device_reduce.cuh │ ├── device_scan.cuh │ ├── device_select.cuh │ └── dispatch │ │ ├── device_histogram_dispatch.cuh │ │ ├── device_radix_sort_dispatch.cuh │ │ ├── device_reduce_by_key_dispatch.cuh │ │ ├── device_reduce_dispatch.cuh │ │ ├── device_scan_dispatch.cuh │ │ └── device_select_dispatch.cuh │ ├── grid │ ├── grid_barrier.cuh │ ├── grid_even_share.cuh │ ├── grid_mapping.cuh │ └── grid_queue.cuh │ ├── host │ └── spinlock.cuh │ ├── iterator │ ├── arg_index_input_iterator.cuh │ ├── cache_modified_input_iterator.cuh │ ├── cache_modified_output_iterator.cuh │ ├── constant_input_iterator.cuh │ ├── counting_input_iterator.cuh │ ├── tex_obj_input_iterator.cuh │ ├── tex_ref_input_iterator.cuh │ └── transform_input_iterator.cuh │ ├── thread │ ├── thread_load.cuh │ ├── thread_operators.cuh │ ├── thread_reduce.cuh │ ├── thread_scan.cuh │ └── thread_store.cuh │ ├── util_allocator.cuh │ ├── util_arch.cuh │ ├── util_debug.cuh │ ├── util_device.cuh │ ├── util_macro.cuh │ ├── util_namespace.cuh │ ├── util_ptx.cuh │ ├── util_type.cuh │ └── warp │ ├── specializations │ ├── warp_reduce_shfl.cuh │ ├── warp_reduce_smem.cuh │ ├── warp_scan_shfl.cuh │ └── warp_scan_smem.cuh │ ├── warp_reduce.cuh │ └── warp_scan.cuh ├── python └── SmoothParticleNets │ ├── ImageProjection.py │ ├── ParticleCollision.py │ ├── ParticleProjection.py │ ├── __init__.py │ ├── convsdf.py │ ├── convsp.py │ ├── error_checking.py │ └── kernels.py ├── setup.py ├── src ├── common_funcs.h ├── constants.h ├── cpu_layer_funcs.cpp ├── cuda_layer_funcs.cpp ├── gpu_kernels.cu └── gpu_kernels.h └── tests ├── gradcheck.py ├── regular_grid_interpolater.py ├── test_convsdf.py ├── test_convsp.py ├── test_imageprojection.py ├── test_particlecollision.py └── test_particleprojection.py /.gitignore: -------------------------------------------------------------------------------- 1 | lib/gpu_kernels.cu.o 2 | test/__pycache__/test_f_grid.cpython-27-PYTEST.pyc 3 | test/__pycache__/test_particles2grid.cpython-27-PYTEST.pyc 4 | 
python/SmoothParticleNets/_ext/_ext.so 5 | *.pyc 6 | python/SmoothParticleNets/_ext/__ext.so 7 | .cache/ 8 | test/.cache/ 9 | test/pytest_args.py 10 | ._timings_n2_shared.csv 11 | src/kernel_constants.h 12 | build 13 | *.so 14 | *.egg-info 15 | tests/pytest_args.py 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 cschenck 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SmoothParticleNets 2 | 3 | Smooth Particle Networks (SmoothParticleNets or SPNets) is a set of custom PyTorch layers to facilitate computation with unordered particle sets. 4 | They were created for the purpose of enabling particle-based fluid dynamics inside a deep network, but the layers can be used for other purposes. 5 | Broadly, the layers enable computing particle-particle interactions, particle-object interactions, and projections onto and out of a camera image. 6 | The interface to this library is in Python. 7 | This library contains 6 layers, listed below. 8 | Note that this library provides only the basic functionality and no additional utilities, e.g., the library does not include a particle visualizer and the library does not include a tool for processing 3D object mesh files into signed distance fields. 9 | 10 | ## Layers 11 | 12 | Below is the list of each layer contained in this library. 13 | Clicking on the layer's name will take you to a description of what that layer does and how to use it. 14 | 15 | * [ConvSP](https://cschenck.github.io/SmoothParticleNets/docs/convsp) 16 | * [ConvSDF](https://cschenck.github.io/SmoothParticleNets/docs/convsdf) 17 | * [ImageProjection](https://cschenck.github.io/SmoothParticleNets/docs/imageprojection) 18 | * [ParticleProjection](https://cschenck.github.io/SmoothParticleNets/docs/particleprojection) 19 | * [ParticleCollision](https://cschenck.github.io/SmoothParticleNets/docs/particlecollision) 20 | * [ReorderData](https://cschenck.github.io/SmoothParticleNets/docs/reorderdata) 21 | 22 | ## Requirements 23 | 24 | This library only requires PyTorch as a dependency. 25 | The current version of the library has been tested to work with PyTorch 0.4.1. 
26 | Furthermore, this library only supports Python 3, and does not support Python 2. 27 | 28 | Note that this library was developed only under linux and may or may not run directly without modification on other platforms. 29 | Specifically, this library is confirmed to work on Ubuntu 18.04 with PyTorch 0.4.1, Cuda 10.0, and the 410 Nvidia drivers (although that should not matter). 30 | 31 | ## Installation 32 | 33 | To install this library, download the source from github. 34 | Once downloaded, enter the root directory of the source and run 35 | ```bash 36 | sudo python3 setup.py install 37 | ``` 38 | 39 | Once installed, in Python you should be able to call 'import SmoothParticleNets', which will import the library. 40 | 41 | ## Citation 42 | 43 | In published works please cite this as 44 | > C. Schenck and D. Fox, "SPNets: Differentiable Fluid Dynamics for Deep Neural Networks," in *Proceedings of the Second Conference on Robot Learning (CoRL),* Zurich, Switzerland, 2018. 45 | 46 | ```bibtex 47 | @inproceedings{spnets2018, 48 | title={SPNets: Differentiable Fluid Dynamics for Deep Neural Networks}, 49 | author={Schenck, C. and Fox, D.}, 50 | booktitle={Proceedings of the Second Conference on Robot Learning (CoRL)}, 51 | year={2018}, 52 | address={Zurich, Switzerland} 53 | } 54 | ``` 55 | -------------------------------------------------------------------------------- /REQUIREMENTS.txt: -------------------------------------------------------------------------------- 1 | torch 0.4.1 2 | torchvision 3 | CUDA 10 4 | nvidia drivers 410 5 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /docs/convsdf/README.md: -------------------------------------------------------------------------------- 1 | # ConvSDF 2 | 3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets) 4 | 5 | ## Description 6 | 7 | The ConvSDF layer is the other primary layer in addition to the ConvSP layer. 8 | ConvSDF stands for Signed Distance Field Convolution. 9 | The purpose of this layer is to enable particle-object interactions. 10 | The particles are represented as a list of coordinate locations. 11 | The objects are represented as signed distance fields (SDFs). 12 | SDFs are functions that take in a point in space relative to the object and return the signed distance to the closest point on the surface of the object, where the sign indicates if the query point is inside the object (negative) or outside (positive). 13 | For ConvSDF, this function is represented as a lookup table in the form of a grid. 14 | ConvSDF accepts a grid with the SDF values for each grid cell filled in, then performs linear interpolation when looking up the SDF value for a specific point. 15 | 16 | ConvSDF works as follows. 17 | ConvSDF operates on sets of query locations, but for simplicity the following describes a single query location. 18 | For a given query point, ConvSDF places a convolutional kernel around that point's location in space. 19 | Then it looks up the SDF values at the center of each of the kernel cells. 20 | This is then convolved with a set of weights in the same manner as a standard convolutional layer, the values are multiplied by a set of weights and then summed. 21 | The following diagram illustrates this process. 
22 | 23 | ![](diagram.png) 24 | 25 | The SDF field is shown as a heatmap, with the object boundary shown in black. 26 | The large red dot is the query location, with the smaller red dots showing the kernel cell centers. 27 | The output of ConvSDF is the convolved value for the given query location. 28 | 29 | The ConvSDF layer is given the pre-computed SDF grids; it does not compute grids from mesh files. 30 | That must be done externally. 31 | SmoothParticleNets does not include any tools to do this (although some can be found by searching online). 32 | This was done intentionally to reduce the dependencies that this library requires. 33 | Furthermore, for simplicity, ConvSDF assumes the origin of all the SDF grids is the bottom corner of the grid. 34 | Ensure that when generating SDF grids you note whether the origin in the mesh file differs from the bottom corner of the grid, and update all poses to take this into account. 35 | SDFs in 1D or in 4+D are not really well-defined, so for now ConvSDF only supports 2D or 3D. 36 | 37 | One common use case for ConvSDF is to compute when particles are inside objects and how to move them away from the object. 38 | This can be done by using ConvSDF to first compute which particles have a negative SDF value, and then by using another ConvSDF layer with fixed +1/-1 weights to compute numerical gradients. 39 | Multiplying the gradients by the distance yields the vector to move the particle by. 40 | 41 | ConvSDF is implemented as a subclass of torch.nn.Module. 42 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d). 43 | ConvSDF is implemented with gradients for the query locations and the object poses so that it can be used during a backward call. 44 | ConvSDF is implemented in native code with Cuda support, so it can be evaluated efficiently. 45 | 46 | ## Example 47 | 48 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches. 49 | ```python 50 | # Let's make a simple SDF grid. 51 | sdf = torch.Tensor([[0.7, 0.5, 0.5, 0.7], [0.5, -0.5, -0.5, 0.5], [-0.5, 0.5, 0.5, -0.5], [0.7, 0.5, 0.5, 0.7]]) 52 | # Construct a ConvSDF layer with 5 kernels. 53 | conv = ConvSDF(sdfs=[sdf], sdf_sizes=[1.0], out_channels=5, ndim=2, kernel_size=1, dilation=0.1, max_distance=1.0, with_params=True, compute_pose_grads=True) 54 | # Convolve at the particle locations. Put the object at the origin with no rotation. 55 | new_data = conv(locs, torch.Tensor([[0]]*locs.shape[0]), torch.Tensor([[0.0, 0.0, 0.0, 0.0]]*locs.shape[0]), torch.Tensor([[1.0]]*locs.shape[0])) 56 | ``` 57 | 58 | 59 | ## Documentation 60 | 61 | ConvSDF provides three functions: a constructor, SetSDFs, and forward. 62 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer). 63 | 64 | * ### ConvSDF(sdfs, sdf_sizes, out_channels, ndim, kernel_size, dilation, max_distance, with_params=True, compute_pose_grads=False): 65 | * Arguments 66 | * **sdfs**[list of torch.Tensor]: The pre-computed SDF grids for every object that may be encountered. During the forward call, specific objects can be selected. When there are multiple objects in a scene, the SDFs are combined using the MIN operator (e.g., when evaluating each at a specific query location, the SDF with the smallest value is used). Each value in the grids should be the distance to the surface of the object and negative iff it is inside the object.
67 | * **sdf_sizes**[list of float]: The size of one side of a grid cell for each SDF. The grid cells are assumed to be hypercubes. 68 | * **out_channels**[int]: Similar to standard convolutions, this is the number of convolutional kernels to create. The output is then a feature vector for each query location. Unlike ConvSP, the input is not an arbitrary feature vector but an SDF, so there is no corresponding in_channels argument. 69 | * **ndim**[int]: The dimensionality of the coordinate space. 70 | * **kernel_size**[int or tuple]: The size of the kernel. If a tuple, then len(kernel_size) == ndim must be True. If an integer, the same size is used for each dimension. Kernel sizes must be odd. 71 | * **dilation**[float or tuple]: The size of a kernel cell. If a tuple, then len(dilation) == ndim must be True. If a float, then the same size is used for each dimension. Unlike standard convolutions, where the size of a kernel cell is fixed by the size of a grid cell (e.g., size of a pixel), the unordered particle sets do not provide that structure, so this size must be specified. 72 | * **max_distance**[float]: When looking up the SDF value in an SDF grid, if it is larger than this value, this value is used instead. This is useful when query locations may fall outside of the pre-computed SDF grids. 73 | * **with_params**[boolean]: (optional) If True (default), the parameters of the layer (weights and bias) will be instantiated as torch.nn.Parameters so that they are treated as parameters by PyTorch's built-in operators. If False, then they are added as torch.autograd.Variables and will not be modified by PyTorch directly. This can be useful if desiring fixed (non-trainable) parameters or for debugging. 74 | * **compute_pose_grads**[boolean]: (optional) If False, will not compute gradients with respect to the poses of the objects during backpropagation. This can speed up the backward pass when these gradients are not desired. 75 | 76 | * ### SetSDFs(sdfs, sdf_sizes): 77 | * Arguments 78 | * **sdfs**[list of torch.Tensor]: The pre-computed SDF grids for every object that may be encountered. During the forward call, specific objects can be selected. When there are multiple objects in a scene, the SDFs are combined using the MIN operator (e.g., when evaluating each at a specific query location, the SDF with the smallest value is used). Each value in the grids should be the distance to the surface of the object and negative iff it is inside the object. 79 | * **sdf_sizes**[list of float]: The size of one side of a grid cell for each SDF. The grid cells are assumed to be hypercubes. 80 | 81 | * ### forward(locs, idxs, poses, scales): 82 | * Arguments 83 | * **locs**[BxNxD torch.autograd.Variable]: The batched list of query locations. D must match the ndim argument to the constructor. 84 | * **idxs**[BxM torch.autograd.Variable]: The indices of the objects to use, where M is the number of objects in the scene. The indices index into the sdfs passed into the constructor. Not every element in the batch needs to have M objects. Any element that has fewer than M objects may simply set the unused indices to -1. 85 | * **poses**[BxMxDD torch.autograd.Variable]: The pose of each object in the scene. The first D values are the translation, and the remaining values are the rotation. For 2D, the rotation is a single angle. For 3D, the rotation is a quaternion in xyzw format. Only 2D and 3D are supported. The origin for all objects is the lower corner of its SDF grid.
86 | * **scales**[BxM torch.autograd.Variable]: The scale for each object, where 0.5 shrinks the object by half and 2.0 doubles the size of the object. 87 | * Returns 88 | * **new_data**[BxMxG torch.autograd.Variable]: The result of the convolutions. G is the out_channels argument passed to the constructor. This is a new feature vector for each of the query locations. 89 | 90 | 91 | -------------------------------------------------------------------------------- /docs/convsdf/diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschenck/SmoothParticleNets/1bfde9bd6ce00dcb8750a48f49ce03f4400fb8cc/docs/convsdf/diagram.png -------------------------------------------------------------------------------- /docs/convsp/README.md: -------------------------------------------------------------------------------- 1 | # ConvSP 2 | 3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets) 4 | 5 | ## Description 6 | 7 | The ConvSP layer is the main workhorse layer of SmoothParticleNets. 8 | ConvSP stands for Smooth Particle Convolution. 9 | The ConvSP layer operates on unordered particle sets. 10 | Each particle has a feature vector associated with it, and the ConvSP performs a convolution on these features, similar to how a Conv2D layer performs a convolution on the channels of a feature image. 11 | However, unlike in a standard convolution on a grid, the features associated with each particle here create a continuous vector field across space. 12 | 13 | More formally, a set of particles represents a continuous vector field in space. 14 | That is, at every point in space it is possible to evaluate the features represented by the particle set. 15 | This is illustrated in the following diagram and equation. 16 | 17 | ![](kernel_diagram.png) 18 | 19 | Given an arbitrary query location (the red dot), the features of each nearby particle (x_j) are averaged together, weighted based on their distance to the query point using a kernel function W. 20 | 21 | This is then used to perform convolutions. 22 | Unlike in the standard convolution, here there isn't a well-defined grid to convolve on. 23 | Instead, the ConvSP layer convolves in free space. 24 | This is illustrated in the following diagram. 25 | 26 | ![](conv_diagram.png) 27 | 28 | In the above 2D case, the kernel used is 3x3. 29 | Given a query location (the large red dot), the kernel is placed on top of that location. 30 | Then the above field lookup equation is used to evaluate the continuous vector field at the center of each kernel cell (small red dots). 31 | The resulting values are then multiplied by kernel weights and summed in the same manner as a standard convolution. 32 | The key difference between ConvSP and a standard convolution is the use of the smoothing kernel average above to allow evaluating the kernel at any arbitrary point in space. 33 | 34 | 35 | ConvSP is implemented as a subclass of torch.nn.Module. 36 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d). 37 | ConvSP is implemented with gradients so that it can be used during a backward call. 38 | ConvSP is implemented in native code with Cuda support, so it can be evaluated efficiently. 39 | 40 | ## Example 41 | 42 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches and data is a tensor containing a feature vector for each particle.
43 | ```python 44 | # Create a ConvSP layer with 5 output channels, a kernel size of 3, a dilation of 0.05, and a radius of 0.1. 45 | conv = ConvSP(in_channels=data.shape[2], out_channels=5, ndim=locs.shape[2], kernel_size=3, dilation=0.05, radius=0.1, dis_norm=False, with_params=True, kernel_fn='spiky') 46 | # The ConvSP layer requires a ParticleCollision layer to generate the neighbor list. The radius of the neighbor list should be the maximum distance a neighbor of any kernel cell could be from the center of the kernel, which is radius + (kernel_size//2)*dilation. 47 | coll = ParticleCollision(ndim=locs.shape[2], radius=(0.1 + 0.05)) 48 | # ParticleCollision reorders locs and data. 49 | locs, data, idxs, neighbors = coll(locs, data) 50 | # Get the new features. We'll use the particle locations as the query locations, so we won't be passing anything for qlocs. 51 | new_data = conv(locs, data, neighbors) 52 | # new_data is still reordered according to the reordered locs, but we might want them in the original order. 53 | reorder = ReorderData(reverse=True) 54 | locs, new_data = reorder(idxs, locs, new_data) 55 | ``` 56 | 57 | 58 | ## Documentation 59 | 60 | ConvSP provides two functions: a constructor and forward. 61 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer). 62 | 63 | * ### ConvSP(in_channels, out_channels, ndim, kernel_size, dilation, radius, dis_norm=False, kernel_fn='default', with_params=True): 64 | * Arguments 65 | * **in_channels**[int]: The dimensionality of the feature vectors associated with each particle. 66 | * **out_channels**[int]: Similar to standard convolutions, this is the number of convolutional kernels to create. The output is then a feature vector for each query location. 67 | * **ndim**[int]: The dimensionality of the particle's coordinate space. 68 | * **kernel_size**[int or tuple]: The size of the kernel. If a tuple, then len(kernel_size) == ndim must be True. If an integer, the same size is used for each dimension. Kernel sizes must be odd. 69 | * **dilation**[float or tuple]: The size of a kernel cell. If a tuple, then len(dilation) == ndim must be True. If a float, then the same size is used for each dimension. Unlike standard convolutions, where the size of a kernel cell is fixed by the size of a grid cell (e.g., size of a pixel), the unordered particle sets do not provide that structure, so this size must be specified. 70 | * **radius**[float]: The radius to use when computing the smoothing kernel average. Only particles within this distance of the query location are used in the average. 71 | * **dis_norm**[boolean]: (optional) If true, the features in the smoothing kernel average will be divided by the distance from the query location to the particle. This normalization can be useful for some computations. 72 | * **kernel_fn**[string]: (optional) The kernel function to use in the smoothing kernel average. SmoothParticleNets provides many options for the kernel. Refer to kernels.py for a complete list. 73 | * **with_params**[boolean]: (optional) If True (default), the parameters of the layer (weights and bias) will be instantiated as torch.nn.Parameters so that they are treated as parameters by PyTorch's built-in operators. If False, then they are added as torch.autograd.Variables and will not be modified by PyTorch directly. This can be useful if desiring fixed (non-trainable) parameters or for debugging.
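
For intuition, the computation ConvSP performs at each query location (the smoothing-kernel average evaluated at every kernel cell center, followed by a weighted sum, as described above) can be sketched in plain PyTorch. This is only an illustrative, unbatched sketch under assumptions made here: the helper names, the simple polynomial kernel, and the brute-force distance computation are not part of the library. The actual layer runs in native CUDA, restricts the average to the neighbor list from ParticleCollision, and defines its kernel functions and their normalization in kernels.py.

```python
import itertools
import torch

def smooth_field(qpts, locs, data, radius, kernel_fn):
    # Kernel-weighted combination of particle features evaluated at query points
    # (the smoothing-kernel average from the Description; normalization depends on the kernel).
    # qpts: MxD, locs: NxD, data: NxK -> MxK. Particles beyond radius get zero weight.
    dist = torch.cdist(qpts, locs)                          # MxN pairwise distances
    w = kernel_fn(dist, radius) * (dist < radius).float()
    return w @ data

def convsp_sketch(qlocs, locs, data, weights, bias, kernel_size, dilation, radius, kernel_fn):
    # qlocs: MxD query locations; locs/data: the particle set and its features.
    # weights: out_channels x K x kernel_size**D, bias: out_channels (as in a standard conv).
    ndim, half = qlocs.shape[1], kernel_size // 2
    # Offsets from the query location to the center of each kernel cell.
    offsets = torch.tensor(list(itertools.product(range(-half, half + 1), repeat=ndim)),
                           dtype=qlocs.dtype) * dilation
    out = bias.repeat(qlocs.shape[0], 1)
    for c, off in enumerate(offsets):
        field = smooth_field(qlocs + off, locs, data, radius, kernel_fn)  # MxK field at this cell
        out = out + field @ weights[:, :, c].t()                          # multiply by weights and sum
    return out  # M x out_channels

# Usage with an assumed, simple polynomial kernel (illustrative only).
kernel = lambda d, h: torch.clamp(1.0 - (d / h) ** 2, min=0.0) ** 2
locs, data = torch.rand(100, 3), torch.rand(100, 4)
weights, bias = torch.randn(5, 4, 27), torch.zeros(5)
new_data = convsp_sketch(locs, locs, data, weights, bias,
                         kernel_size=3, dilation=0.05, radius=0.1, kernel_fn=kernel)
```

With kernel_size=1 this collapses to a single smoothing-kernel average per query location, which is also why the neighbor-list radius used with ParticleCollision in the example above must cover radius + (kernel_size//2)*dilation.
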
74 | 75 | * ### forward(locs, data, neighbors, qlocs=None): 76 | * Arguments 77 | * **locs**[BxNxD torch.autograd.Variable]: The batched list of particle locations. D must match the ndim argument to the constructor. 78 | * **data**[BxNxK torch.autograd.Variable]: The feature vectors associated with each particle. K must be the same as the in_channels argument to the constructor. 79 | * **neighbors**[BxMxF torch.autograd.Variable]: The pre-computed neighbor list for each query location. This can be generated using the ParticleCollision layer. This is necessary for evaluating the kernel smoothing average. 80 | * **qlocs**[BxMxD torch.autograd.Variable]: (optional) The set of locations to perform convolutions around. Usually this will be the same as the particle locations, but not always. If this argument is not provided, locs is used. 81 | * Returns 82 | * **new_data**[BxMxG torch.autograd.Variable]: The result of the convolutions. G is the out_channels argument passed to the constructor. This is a new feature vector for each of the query locations. 83 | 84 | -------------------------------------------------------------------------------- /docs/convsp/conv_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschenck/SmoothParticleNets/1bfde9bd6ce00dcb8750a48f49ce03f4400fb8cc/docs/convsp/conv_diagram.png -------------------------------------------------------------------------------- /docs/convsp/kernel_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cschenck/SmoothParticleNets/1bfde9bd6ce00dcb8750a48f49ce03f4400fb8cc/docs/convsp/kernel_diagram.png -------------------------------------------------------------------------------- /docs/imageprojection/README.md: -------------------------------------------------------------------------------- 1 | # ImageProjection 2 | 3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets) 4 | 5 | ## Description 6 | 7 | The ImageProjection layer projects an image feature map onto a set of particles in the view frame of the camera. 8 | That is, given an image of C channels, it first projects each particle onto the image using given camera intrinsics (focal length, etc.) and extrinsics (pose). 9 | Then it uses bilinear interpolation between the 4 adjacent pixels to generate a feature vector for the given particle. 10 | The output is a C-length feature vector for each particle. 11 | The ImageProjection layer currently only supports 3D coordinate spaces. 12 | 13 | ImageProjection is implemented as a subclass of torch.nn.Module. 14 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d). 15 | ImageProjection can compute gradients with respect to the camera or particle poses and the image features, and is implemented with Cuda support for efficient computation. 16 | 17 | ## Example 18 | 19 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches and image is a [BxHxWxC] feature image. 20 | ```python 21 | # First create the ParticleProjection layer. 22 | proj = ImageProjection(camera_fl=540) 23 | # Setup the camera pose. 24 | camera_pose = torch.Tensor([0.0, 0.0, 0.0]) 25 | camera_rotation = torch.Tensor([0.0, 0.0, 0.0, 1.0]) 26 | new_data = proj(locs, image, camera_pose, camera_rotation) 27 | ``` 28 | 29 | 30 | ## Documentation 31 | 32 | ImageProjection provides two functions: a constructor and forward. 
33 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer). 34 | 35 | * ### ImageProjection(camera_fl): 36 | * Arguments 37 | * **camera_fl**[float]: The focal length of the camera. 38 | 39 | * ### forward(locs, image, camera_pose, camera_rot, depth_mask=None): 40 | * Arguments 41 | * **locs**[BxNx3 torch.autograd.Variable]: The batched list of particle locations. Only 3D particle loations are supported. 42 | * **image**[BxHxWxC torch.autograd.Variable]: The image to project onto the particles. H and W are the height and width, respectively, and C is the number of channels. 43 | * **camera_pose**[Bx3 torch.autograd.Variable]: The camera translation in the environment. 44 | * **camera_rot**[Bx4 torch.autograd.Variable]: The camera rotation in the environment, represented as a quaternion in xyzw format. 45 | * **depth_mask**[BxHxW torch.autograd.Variable]: (optional) If passed, this is used to mask particles that are obscured by obstructions in the environment. If the depth of a pixel is less than the depth of the particle, nothing is projected onto that particle. 46 | * Returns 47 | * **new_data**[BxNxC torch.autograd.Variable]: The set of features for each particle after projecting the image features onto them. 48 | 49 | 50 | -------------------------------------------------------------------------------- /docs/particlecollision/README.md: -------------------------------------------------------------------------------- 1 | # ParticleCollision 2 | 3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets) 4 | 5 | ## Description 6 | 7 | The ParticleCollision layer pre-computes neighbor lists (i.e., "colliding" particles) for each given particle. 8 | That is, given a list of particle positions and a fixed radius, this layer returns a short list for each particle with the index of all other particles that are within that radius of it. 9 | To do this, internally the ParticleCollision layer creates a hashgrid and performs lookups based on that grid. 10 | The resulting neighbor list is designed to be used by the ConvSP layer to compute particle-particle interactions. 11 | 12 | An important operation that this layer does alongside computing collisions is to reorder the particle list. 13 | The reordering places particles falling in the same grid cell in the hash grid next to each other in memory. 14 | By doing so, cache hits are increased dramatically during the computation of particle-particle interactions in ConvSP, resulting in a large speedup. 15 | Due to this reordering, the returned list of colliding neighbor indices are indices in the *reordered* list, not in the original. 16 | The standard use of this layer is to compute collisions, make as many calls to ConvSP as are desired, then use the ReorderData layer to return the particle list to its original order. 17 | It is important to emphasize that reordering the data according to the hash grid is critical for perfomance of the ConvSP layer. 18 | 19 | ParticleCollision is implemented as a subclass of torch.nn.Module. 20 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d). 21 | There are no gradients to compute for this layer, so it simply passes them through when calling backward. 22 | 23 | ## Example 24 | 25 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches and vel is a same size tensor containing the particle's velocities. 
26 | ```python 27 | coll = ParticleCollision(ndim, radius) 28 | # ParticleCollision reorders locs and vel. 29 | locs, vel, idxs, neighbors = coll(locs, vel) 30 | ``` 31 | 32 | 33 | ## Documentation 34 | 35 | ParticleCollision provides two functions: a constructor and forward. 36 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer). 37 | 38 | * ### ParticleCollision(ndim, radius, max_grid_dim=96, max_collisions=128, include_self=True): 39 | * Arguments 40 | * **ndim**[int]: The dimensionality of the particle's coordinate space. 41 | * **radius**[float]: The maximum distance a particle can be from another and still be colliding. 42 | * **max_grid_dim**[int]: (optional) The maximum size of the hash grid in any dimension. This is useful for limiting memory consumption in cases where the particles are very spread out relative to the collision radius. Particles that don't fall in the hash grid are placed in the cell closest to them. 43 | * **max_collisions**[int]: (optional) The maximum number of neighbors to return. The returned neighbor list for each particle will always be this length (although not necessarily entirely filled in), so selecting this parameter is a balance between memory consumption and ensuring all colliding particles are included. 44 | * **include_self**[boolean]: (optional) If True, the particle will be in its own list of neighbors. If False it will not be. 45 | 46 | * ### forward(locs, data=None, qlocs=None): 47 | * Arguments 48 | * **locs**[BxNxD torch.autograd.Variable]: The batched list of particle locations. D must match the ndim argument to the constructor. 49 | * **data**[BxNxK torch.autograd.Variable]: (optional) Additional data associated with each particle. This data is not used during the forward call; however, since the locs are reordered, any data associated with each particle must also be reordered. Technically this could also be accomplished instead by calling the ReorderData layer on the data after calling forward, but doing so here helps to prevent bugs when calling ConvSP with reordered locs but non-reordered data. 50 | * **qlocs**[BxMxD torch.autograd.Variable]: (optional) In the case where it is desired to compute collisions between two different particle sets, this is the second set. Rather than returning the neighbor list for particles in locs, if this argument is passed, the returned neighbor list is a list for each particle in qlocs of the indices of particles in locs (after reordering) that it collides with. 51 | * Returns 52 | * **locs**[BxNxD torch.autograd.Variable]: The reordered list of particle positions. 53 | * **data**[BxNxK torch.autograd.Variable]: (optional) If data was passed as an input, then the reordered data is returned. 54 | * **idxs**[BxN torch.autograd.Variable]: The index list for the reordered particle list. Each index value indicates the original index of that particle in the original locs, i.e., idxs[b, i] = j where i is the new index of the particle after reordering and j is its original index (b being the batch). 55 | * **neighbors**[Bx(N/M)xC torch.autograd.Variable]: The neighbor list for each particle. If qlocs was passed as an argument, then it is the neighbors of each particle in qlocs instead of locs. Each value indicates the index in locs (after reordering) of the neighboring particle. C is the value of max_collisions as passed to the constructor. Note that not all particles will have max_collisions neighbors.
In that event, the values in each particle's list are filled sequentially, with unfilled values in the list being set to -1. 56 | -------------------------------------------------------------------------------- /docs/particleprojection/README.md: -------------------------------------------------------------------------------- 1 | # ParticleProjection 2 | 3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets) 4 | 5 | ## Description 6 | 7 | The ParticleProjection layer is designed to allow comparison of the particle state with a camera image. 8 | It does this by projecting the particles onto a virtual camera image, which can then be compared to other camera images as desired. 9 | Each particle is projected onto the virtual image as a small Gaussian, which allows for smooth gradients with respect to the particle positions or camera pose. 10 | The layer computes the image coordinate of a given particle location using the pinhole camera model, not taking into account any distortions, e.g., radial distortion. 11 | ParticleProjection currently only supports 3D particle locations. 12 | 13 | ParticleProjection is implemented as a subclass of torch.nn.Module. 14 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d). 15 | ParticleProjection can compute gradients with respect to the camera or particle poses, and is implemented with Cuda support for efficient computation. 16 | 17 | ## Example 18 | 19 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches. 20 | ```python 21 | # First create the ParticleProjection layer. 22 | proj = ParticleProjection(camera_fl=540, camera_size=(480, 640), filter_std=5.0, filter_scale=10.0) 23 | # Setup the camera pose. 24 | camera_pose = torch.Tensor([0.0, 0.0, 0.0]) 25 | camera_rotation = torch.Tensor([0.0, 0.0, 0.0, 1.0]) 26 | image = proj(locs, camera_pose, camera_rotation) 27 | ``` 28 | 29 | 30 | ## Documentation 31 | 32 | ParticleProjection provides two functions: a constructor and forward. 33 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer). 34 | 35 | * ### ParticleProjection(camera_fl, camera_size, filter_std, filter_scale): 36 | * Arguments 37 | * **camera_fl**[float]: The focal length of the camera. 38 | * **camera_size**[tuple]: A tuple of the camera image height and width (in that order) in pixels. 39 | * **filter_std**[float]: The standard deviation (in pixels) of the Gaussian for each particle. The Gaussian will be added to all pixels within 2x of this to the particle's image coordinate. 40 | * **filter_scale**[float]: All values added to a pixel will be multiplied by this to allow control of the intensity of the Gaussians for each particle. This is equivalent to multiplying the output image by this value after the fact. 41 | 42 | * ### forward(locs, camera_pose, camera_rot, depth_mask=None): 43 | * Arguments 44 | * **locs**[BxNx3 torch.autograd.Variable]: The batched list of particle locations. Only 3D particle loations are supported. 45 | * **camera_pose**[Bx3 torch.autograd.Variable]: The camera translation in the environment. 46 | * **camera_rot**[Bx4 torch.autograd.Variable]: The camera rotation in the environment, represented as a quaternion in xyzw format. 47 | * **depth_mask**[BxHxW torch.autograd.Variable]: (optional) If passed, this is used to mask particles that are obscured by obstructions in the environment. 
If the depth of a pixel is less than the depth of the particle, the particle's contribution to that pixel is not added. H and W must match the camera image height and width passed to the constructor. 48 | * Returns 49 | * **image**[BxHxW torch.autograd.Variable]: The projected image. Particles appear as small Gaussians, and where particles overlap the Gaussians are added together. 50 | 51 | -------------------------------------------------------------------------------- /docs/reorderdata/README.md: -------------------------------------------------------------------------------- 1 | # ReorderData 2 | 3 | [SmoothParticleNets](https://cschenck.github.io/SmoothParticleNets) 4 | 5 | ## Description 6 | 7 | The ReorderData layer is fairly simple. 8 | The layer reorders a given tensor based on a tensor containing the indices for the data in the first tensor. 9 | More formally, assume that DATA is a BxNxD tensor containing N D-dimensional data points (e.g., XYZ particle locations) over B batches. 10 | Let IDXS be a BxN tensor, where each IDXS[i, :] contains the numbers 0 to N-1 in some arbitrary order. 11 | This layer then returns DATA where the second dimension has been rearranged according to IDXS. 12 | This is equivalent to 13 | ```python 14 | DATA[i, :, :] = DATA[i, IDXS[i, :], :] 15 | ``` 16 | in PyTorch syntax; however, this layer is specialized for this specific kind of indexing, resulting in a faster implementation. 17 | This layer is designed as a helper layer for the ParticleCollision layer. 18 | 19 | ReorderData is implemented as a subclass of torch.nn.Module. 20 | This allows it to be used in the same manner as any other PyTorch layer (e.g., conv2d). 21 | Additionally, this layer computes gradients, so it can be used in a backward pass. 22 | 23 | ## Example 24 | 25 | Assume *locs* is a BxNxD tensor containing the locations of N D-dimensional particles across B batches and *vel* is a same size tensor containing the particles' velocities. 26 | ```python 27 | # ReorderData is most commonly used in conjunction with ParticleCollision. 28 | coll = ParticleCollision(ndim, radius) 29 | # Set reverse=True. ParticleCollision calls ReorderData internally, so we want to undo that reordering when we're done. 30 | reorder = ReorderData(reverse=True) 31 | # ParticleCollision reorders locs and vel. 32 | locs, vel, idxs, neighbors = coll(locs, vel) 33 | # Perform desired operations with locs, vel, neighbors... 34 | # When we're done, return locs and vel to their original order using ReorderData. 35 | locs, vel = reorder(idxs, locs, vel) 36 | ``` 37 | 38 | 39 | ## Documentation 40 | 41 | ReorderData provides two functions: a constructor and forward. 42 | Forward is called by calling the layer object itself (in the same manner as any standard PyTorch layer). 43 | 44 | * ### ReorderData(reverse=True): 45 | * Arguments 46 | * **reverse**[boolean]: (optional) When False, behaves as normal, using the given indices to reorder the data. When True, this layer assumes that the given data was already reordered according to the given indices, and so reverses that process and returns the data to the original order. 47 | 48 | * ### forward(idxs, locs, data=None): 49 | * Arguments 50 | * **idxs**[BxN torch.autograd.Variable]: The list of indices to reorder the input by. 51 | * **locs**[BxNxD torch.autograd.Variable]: The main data to be reordered. It is called *locs* because ReorderData is primarily a helper for ParticleCollision, which reorders the locations of the particles.
52 | * **data**[BxNxK torch.autograd.Variable]: (optional) Additional data to reorder alongside locs. Calling forward with both locs and data is equivalent to calling it twice in a row with each individually. This argument is provided as a convenience. 53 | * Returns 54 | * **locs**[BxNxD torch.autograd.Variable]: A new tensor with the same values as in the locs argument reordered based in idxs. 55 | * **data**[BxNxK torch.autograd.Variable]: (optional) If the data argument is passed, then forward will return a pair of tensors, where the second has the same values as data but reordered according to idxs. -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/block/block_raking_layout.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #include "../util_macro.cuh" 38 | #include "../util_arch.cuh" 39 | #include "../util_namespace.cuh" 40 | 41 | /// Optional outer namespace(s) 42 | CUB_NS_PREFIX 43 | 44 | /// CUB namespace 45 | namespace cub { 46 | 47 | /** 48 | * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. 
![](raking.png) 49 | * \ingroup BlockModule 50 | * 51 | * \par Overview 52 | * This type facilitates a shared memory usage pattern where a block of CUDA 53 | * threads places elements into shared memory and then reduces the active 54 | * parallelism to one "raking" warp of threads for serially aggregating consecutive 55 | * sequences of shared items. Padding is inserted to eliminate bank conflicts 56 | * (for most data types). 57 | * 58 | * \tparam T The data type to be exchanged. 59 | * \tparam BLOCK_THREADS The thread block size in threads. 60 | * \tparam PTX_ARCH [optional] \ptxversion 61 | */ 62 | template < 63 | typename T, 64 | int BLOCK_THREADS, 65 | int PTX_ARCH = CUB_PTX_ARCH> 66 | struct BlockRakingLayout 67 | { 68 | //--------------------------------------------------------------------- 69 | // Constants and type definitions 70 | //--------------------------------------------------------------------- 71 | 72 | enum 73 | { 74 | /// The total number of elements that need to be cooperatively reduced 75 | SHARED_ELEMENTS = BLOCK_THREADS, 76 | 77 | /// Maximum number of warp-synchronous raking threads 78 | MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), 79 | 80 | /// Number of raking elements per warp-synchronous raking thread (rounded up) 81 | SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, 82 | 83 | /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) 84 | RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, 85 | 86 | /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) 87 | HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), 88 | 89 | /// Degree of bank conflicts (e.g., 4-way) 90 | CONFLICT_DEGREE = (HAS_CONFLICTS) ? 91 | (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : 92 | 1, 93 | 94 | /// Pad each segment length with one element if degree of bank conflicts is greater than 4-way (heuristic) 95 | SEGMENT_PADDING = (CONFLICT_DEGREE > CUB_PREFER_CONFLICT_OVER_PADDING(PTX_ARCH)) ? 1 : 0, 96 | // SEGMENT_PADDING = (HAS_CONFLICTS) ? 
1 : 0, 97 | 98 | /// Total number of elements in the raking grid 99 | GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + SEGMENT_PADDING), 100 | 101 | /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) 102 | UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), 103 | }; 104 | 105 | 106 | /** 107 | * \brief Shared memory storage type 108 | */ 109 | typedef T _TempStorage[BlockRakingLayout::GRID_ELEMENTS]; 110 | 111 | /// Alias wrapper allowing storage to be unioned 112 | struct TempStorage : Uninitialized<_TempStorage> {}; 113 | 114 | 115 | /** 116 | * \brief Returns the location for the calling thread to place data into the grid 117 | */ 118 | static __device__ __forceinline__ T* PlacementPtr( 119 | TempStorage &temp_storage, 120 | int linear_tid) 121 | { 122 | // Offset for partial 123 | unsigned int offset = linear_tid; 124 | 125 | // Add in one padding element for every segment 126 | if (SEGMENT_PADDING > 0) 127 | { 128 | offset += offset / SEGMENT_LENGTH; 129 | } 130 | 131 | // Incorporating a block of padding partials every shared memory segment 132 | return temp_storage.Alias() + offset; 133 | } 134 | 135 | 136 | /** 137 | * \brief Returns the location for the calling thread to begin sequential raking 138 | */ 139 | static __device__ __forceinline__ T* RakingPtr( 140 | TempStorage &temp_storage, 141 | int linear_tid) 142 | { 143 | return temp_storage.Alias() + (linear_tid * (SEGMENT_LENGTH + SEGMENT_PADDING)); 144 | } 145 | }; 146 | 147 | } // CUB namespace 148 | CUB_NS_POSTFIX // Optional outer namespace(s) 149 | 150 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/block/specializations/block_histogram_atomic.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 47 | */ 48 | template 49 | struct BlockHistogramAtomic 50 | { 51 | /// Shared memory storage layout type 52 | struct TempStorage {}; 53 | 54 | 55 | /// Constructor 56 | __device__ __forceinline__ BlockHistogramAtomic( 57 | TempStorage &temp_storage) 58 | {} 59 | 60 | 61 | /// Composite data onto an existing histogram 62 | template < 63 | typename T, 64 | typename HistoCounter, 65 | int ITEMS_PER_THREAD> 66 | __device__ __forceinline__ void Composite( 67 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram 68 | HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram 69 | { 70 | // Update histogram 71 | #pragma unroll 72 | for (int i = 0; i < ITEMS_PER_THREAD; ++i) 73 | { 74 | atomicAdd(histogram + items[i], 1); 75 | } 76 | } 77 | 78 | }; 79 | 80 | } // CUB namespace 81 | CUB_NS_POSTFIX // Optional outer namespace(s) 82 | 83 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/block/specializations/block_histogram_sort.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../block/block_radix_sort.cuh" 37 | #include "../../block/block_discontinuity.cuh" 38 | #include "../../util_ptx.cuh" 39 | #include "../../util_namespace.cuh" 40 | 41 | /// Optional outer namespace(s) 42 | CUB_NS_PREFIX 43 | 44 | /// CUB namespace 45 | namespace cub { 46 | 47 | 48 | 49 | /** 50 | * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 51 | */ 52 | template < 53 | typename T, ///< Sample type 54 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension 55 | int ITEMS_PER_THREAD, ///< The number of samples per thread 56 | int BINS, ///< The number of bins into which histogram samples may fall 57 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension 58 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension 59 | int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective 60 | struct BlockHistogramSort 61 | { 62 | /// Constants 63 | enum 64 | { 65 | /// The thread block size in threads 66 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, 67 | }; 68 | 69 | // Parameterize BlockRadixSort type for our thread block 70 | typedef BlockRadixSort< 71 | T, 72 | BLOCK_DIM_X, 73 | ITEMS_PER_THREAD, 74 | NullType, 75 | 4, 76 | (PTX_ARCH >= 350) ? true : false, 77 | BLOCK_SCAN_WARP_SCANS, 78 | (PTX_ARCH >= 350) ? 
cudaSharedMemBankSizeEightByte : cudaSharedMemBankSizeFourByte, 79 | BLOCK_DIM_Y, 80 | BLOCK_DIM_Z, 81 | PTX_ARCH> 82 | BlockRadixSortT; 83 | 84 | // Parameterize BlockDiscontinuity type for our thread block 85 | typedef BlockDiscontinuity< 86 | T, 87 | BLOCK_DIM_X, 88 | BLOCK_DIM_Y, 89 | BLOCK_DIM_Z, 90 | PTX_ARCH> 91 | BlockDiscontinuityT; 92 | 93 | /// Shared memory 94 | union _TempStorage 95 | { 96 | // Storage for sorting bin values 97 | typename BlockRadixSortT::TempStorage sort; 98 | 99 | struct 100 | { 101 | // Storage for detecting discontinuities in the tile of sorted bin values 102 | typename BlockDiscontinuityT::TempStorage flag; 103 | 104 | // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values 105 | unsigned int run_begin[BINS]; 106 | unsigned int run_end[BINS]; 107 | }; 108 | }; 109 | 110 | 111 | /// Alias wrapper allowing storage to be unioned 112 | struct TempStorage : Uninitialized<_TempStorage> {}; 113 | 114 | 115 | // Thread fields 116 | _TempStorage &temp_storage; 117 | int linear_tid; 118 | 119 | 120 | /// Constructor 121 | __device__ __forceinline__ BlockHistogramSort( 122 | TempStorage &temp_storage) 123 | : 124 | temp_storage(temp_storage.Alias()), 125 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) 126 | {} 127 | 128 | 129 | // Discontinuity functor 130 | struct DiscontinuityOp 131 | { 132 | // Reference to temp_storage 133 | _TempStorage &temp_storage; 134 | 135 | // Constructor 136 | __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : 137 | temp_storage(temp_storage) 138 | {} 139 | 140 | // Discontinuity predicate 141 | __device__ __forceinline__ bool operator()(const T &a, const T &b, unsigned int b_index) 142 | { 143 | if (a != b) 144 | { 145 | // Note the begin/end offsets in shared storage 146 | temp_storage.run_begin[b] = b_index; 147 | temp_storage.run_end[a] = b_index; 148 | 149 | return true; 150 | } 151 | else 152 | { 153 | return false; 154 | } 155 | } 156 | }; 157 | 158 | 159 | // Composite data onto an existing histogram 160 | template < 161 | typename HistoCounter> 162 | __device__ __forceinline__ void Composite( 163 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram 164 | HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram 165 | { 166 | enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; 167 | 168 | // Sort bytes in blocked arrangement 169 | BlockRadixSortT(temp_storage.sort).Sort(items); 170 | 171 | __syncthreads(); 172 | 173 | // Initialize the shared memory's run_begin and run_end for each bin 174 | int histo_offset = 0; 175 | 176 | #pragma unroll 177 | for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) 178 | { 179 | temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; 180 | temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; 181 | } 182 | // Finish up with guarded initialization if necessary 183 | if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) 184 | { 185 | temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; 186 | temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; 187 | } 188 | 189 | __syncthreads(); 190 | 191 | int flags[ITEMS_PER_THREAD]; // unused 192 | 193 | // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile 194 | DiscontinuityOp flag_op(temp_storage); 195 | BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); 196 | 197 | // Update begin for first item 198 | if 
(linear_tid == 0) temp_storage.run_begin[items[0]] = 0; 199 | 200 | __syncthreads(); 201 | 202 | // Composite into histogram 203 | histo_offset = 0; 204 | 205 | #pragma unroll 206 | for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) 207 | { 208 | int thread_offset = histo_offset + linear_tid; 209 | HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; 210 | histogram[thread_offset] += count; 211 | } 212 | 213 | // Finish up with guarded composition if necessary 214 | if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) 215 | { 216 | int thread_offset = histo_offset + linear_tid; 217 | HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; 218 | histogram[thread_offset] += count; 219 | } 220 | } 221 | 222 | }; 223 | 224 | } // CUB namespace 225 | CUB_NS_POSTFIX // Optional outer namespace(s) 226 | 227 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/block/specializations/block_reduce_raking_commutative_only.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. 
32 | */ 33 | 34 | #pragma once 35 | 36 | #include "block_reduce_raking.cuh" 37 | #include "../../warp/warp_reduce.cuh" 38 | #include "../../thread/thread_reduce.cuh" 39 | #include "../../util_ptx.cuh" 40 | #include "../../util_namespace.cuh" 41 | 42 | /// Optional outer namespace(s) 43 | CUB_NS_PREFIX 44 | 45 | /// CUB namespace 46 | namespace cub { 47 | 48 | 49 | /** 50 | * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size. 51 | */ 52 | template < 53 | typename T, ///< Data type being reduced 54 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension 55 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension 56 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension 57 | int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective 58 | struct BlockReduceRakingCommutativeOnly 59 | { 60 | /// Constants 61 | enum 62 | { 63 | /// The thread block size in threads 64 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, 65 | }; 66 | 67 | // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values 68 | typedef BlockReduceRaking FallBack; 69 | 70 | /// Constants 71 | enum 72 | { 73 | /// Number of warp threads 74 | WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), 75 | 76 | /// Whether or not to use fall-back 77 | USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)), 78 | 79 | /// Number of raking threads 80 | RAKING_THREADS = WARP_THREADS, 81 | 82 | /// Number of threads actually sharing items with the raking threads 83 | SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS), 84 | 85 | /// Number of raking elements per warp synchronous raking thread 86 | SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS, 87 | }; 88 | 89 | /// WarpReduce utility type 90 | typedef WarpReduce WarpReduce; 91 | 92 | /// Layout type for padded thread block raking grid 93 | typedef BlockRakingLayout BlockRakingLayout; 94 | 95 | /// Shared memory storage layout type 96 | struct _TempStorage 97 | { 98 | union 99 | { 100 | struct 101 | { 102 | typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction 103 | typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid 104 | }; 105 | typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan 106 | }; 107 | }; 108 | 109 | 110 | /// Alias wrapper allowing storage to be unioned 111 | struct TempStorage : Uninitialized<_TempStorage> {}; 112 | 113 | 114 | // Thread fields 115 | _TempStorage &temp_storage; 116 | int linear_tid; 117 | 118 | 119 | /// Constructor 120 | __device__ __forceinline__ BlockReduceRakingCommutativeOnly( 121 | TempStorage &temp_storage) 122 | : 123 | temp_storage(temp_storage.Alias()), 124 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) 125 | {} 126 | 127 | 128 | /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
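// Illustrative sketch (not part of the original CUB source): the raking
// specializations in this header back the public cub::BlockReduce collective.
// A minimal kernel built on that public API (assuming a 128-thread block;
// names such as SumKernel, d_in and d_out are hypothetical) might look like:
//
//   #include <cub/block/block_reduce.cuh>
//
//   __global__ void SumKernel(const int *d_in, int *d_out)
//   {
//       typedef cub::BlockReduce<int, 128> BlockReduceT;
//       __shared__ typename BlockReduceT::TempStorage temp_storage;
//
//       int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];
//       int block_sum   = BlockReduceT(temp_storage).Sum(thread_data);
//
//       if (threadIdx.x == 0)
//           d_out[blockIdx.x] = block_sum;   // the aggregate is only valid in thread0
//   }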
129 | template 130 | __device__ __forceinline__ T Sum( 131 | T partial, ///< [in] Calling thread's input partial reductions 132 | int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 133 | { 134 | if (USE_FALLBACK || !FULL_TILE) 135 | { 136 | return FallBack(temp_storage.fallback_storage).template Sum(partial, num_valid); 137 | } 138 | else 139 | { 140 | // Place partial into shared memory grid 141 | if (linear_tid >= RAKING_THREADS) 142 | *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; 143 | 144 | __syncthreads(); 145 | 146 | // Reduce parallelism to one warp 147 | if (linear_tid < RAKING_THREADS) 148 | { 149 | // Raking reduction in grid 150 | T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); 151 | partial = ThreadReduce(raking_segment, cub::Sum(), partial); 152 | 153 | // Warpscan 154 | partial = WarpReduce(temp_storage.warp_storage).Sum(partial); 155 | } 156 | } 157 | 158 | return partial; 159 | } 160 | 161 | 162 | /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 163 | template < 164 | bool FULL_TILE, 165 | typename ReductionOp> 166 | __device__ __forceinline__ T Reduce( 167 | T partial, ///< [in] Calling thread's input partial reductions 168 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 169 | ReductionOp reduction_op) ///< [in] Binary reduction operator 170 | { 171 | if (USE_FALLBACK || !FULL_TILE) 172 | { 173 | return FallBack(temp_storage.fallback_storage).template Reduce(partial, num_valid, reduction_op); 174 | } 175 | else 176 | { 177 | // Place partial into shared memory grid 178 | if (linear_tid >= RAKING_THREADS) 179 | *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; 180 | 181 | __syncthreads(); 182 | 183 | // Reduce parallelism to one warp 184 | if (linear_tid < RAKING_THREADS) 185 | { 186 | // Raking reduction in grid 187 | T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); 188 | partial = ThreadReduce(raking_segment, reduction_op, partial); 189 | 190 | // Warpscan 191 | partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op); 192 | } 193 | } 194 | 195 | return partial; 196 | } 197 | 198 | }; 199 | 200 | } // CUB namespace 201 | CUB_NS_POSTFIX // Optional outer namespace(s) 202 | 203 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/block_range/specializations/block_range_histo_gatomic.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | 38 | #include "../../util_type.cuh" 39 | #include "../../util_namespace.cuh" 40 | 41 | /// Optional outer namespace(s) 42 | CUB_NS_PREFIX 43 | 44 | /// CUB namespace 45 | namespace cub { 46 | 47 | 48 | 49 | /** 50 | * BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics 51 | */ 52 | template < 53 | typename BlockRangeHistogramPolicy, ///< Tuning policy 54 | int BINS, ///< Number of histogram bins per channel 55 | int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) 56 | int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed 57 | typename InputIterator, ///< The input iterator type \iterator. 
Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1] 58 | typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin 59 | typename Offset> ///< Signed integer type for global offsets 60 | struct BlockRangeHistogramGlobalAtomic 61 | { 62 | //--------------------------------------------------------------------- 63 | // Types and constants 64 | //--------------------------------------------------------------------- 65 | 66 | // Sample type 67 | typedef typename std::iterator_traits::value_type SampleT; 68 | 69 | // Constants 70 | enum 71 | { 72 | BLOCK_THREADS = BlockRangeHistogramPolicy::BLOCK_THREADS, 73 | ITEMS_PER_THREAD = BlockRangeHistogramPolicy::ITEMS_PER_THREAD, 74 | TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, 75 | TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, 76 | }; 77 | 78 | // Shared memory type required by this thread block 79 | typedef NullType TempStorage; 80 | 81 | 82 | //--------------------------------------------------------------------- 83 | // Per-thread fields 84 | //--------------------------------------------------------------------- 85 | 86 | /// Reference to output histograms 87 | HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; 88 | 89 | /// Input data to reduce 90 | InputIterator d_in; 91 | 92 | 93 | //--------------------------------------------------------------------- 94 | // Interface 95 | //--------------------------------------------------------------------- 96 | 97 | /** 98 | * Constructor 99 | */ 100 | __device__ __forceinline__ BlockRangeHistogramGlobalAtomic( 101 | TempStorage &temp_storage, ///< Reference to temp_storage 102 | InputIterator d_in, ///< Input data to reduce 103 | HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms 104 | : 105 | d_in(d_in), 106 | d_out_histograms(d_out_histograms) 107 | {} 108 | 109 | 110 | /** 111 | * Process a single tile of input 112 | */ 113 | template 114 | __device__ __forceinline__ void ConsumeTile( 115 | Offset block_offset, ///< The offset the tile to consume 116 | int valid_items = TILE_ITEMS) ///< The number of valid items in the tile 117 | { 118 | if (FULL_TILE) 119 | { 120 | // Full tile of samples to read and composite 121 | SampleT items[ITEMS_PER_THREAD][CHANNELS]; 122 | 123 | #pragma unroll 124 | for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) 125 | { 126 | #pragma unroll 127 | for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) 128 | { 129 | if (CHANNEL < ACTIVE_CHANNELS) 130 | { 131 | items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; 132 | } 133 | } 134 | } 135 | 136 | __threadfence_block(); 137 | 138 | #pragma unroll 139 | for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) 140 | { 141 | #pragma unroll 142 | for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) 143 | { 144 | if (CHANNEL < ACTIVE_CHANNELS) 145 | { 146 | atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1); 147 | } 148 | } 149 | } 150 | } 151 | else 152 | { 153 | // Only a partially-full tile of samples to read and composite 154 | int bounds = valid_items - (threadIdx.x * CHANNELS); 155 | 156 | #pragma unroll 157 | for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) 158 | { 159 | #pragma unroll 160 | for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) 161 | { 162 | if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) 163 | { 164 | SampleT item = 
d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; 165 | atomicAdd(d_out_histograms[CHANNEL] + item, 1); 166 | } 167 | } 168 | } 169 | 170 | } 171 | } 172 | 173 | 174 | /** 175 | * Aggregate results into output 176 | */ 177 | __device__ __forceinline__ void AggregateOutput() 178 | {} 179 | }; 180 | 181 | 182 | } // CUB namespace 183 | CUB_NS_POSTFIX // Optional outer namespace(s) 184 | 185 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/cub.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
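// Illustrative sketch (not part of the original CUB source): stripped of the
// channel and tile machinery, the global-atomic histogram strategy in
// block_range_histo_gatomic.cuh above reduces to one atomicAdd per sample into
// a zero-initialized device bin array (HistoGlobalAtomic and its parameters
// are hypothetical names):
//
//   __global__ void HistoGlobalAtomic(const unsigned char *d_samples,
//                                     int num_samples,
//                                     unsigned int *d_bins)   // 256 counters, zeroed
//   {
//       int i = blockIdx.x * blockDim.x + threadIdx.x;
//       if (i < num_samples)
//           atomicAdd(d_bins + d_samples[i], 1u);   // sample value doubles as bin index
//   }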
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * CUB umbrella include file 32 | */ 33 | 34 | #pragma once 35 | 36 | 37 | // Block 38 | #include "block/block_histogram.cuh" 39 | #include "block/block_discontinuity.cuh" 40 | #include "block/block_exchange.cuh" 41 | #include "block/block_load.cuh" 42 | #include "block/block_radix_rank.cuh" 43 | #include "block/block_radix_sort.cuh" 44 | #include "block/block_reduce.cuh" 45 | #include "block/block_scan.cuh" 46 | #include "block/block_store.cuh" 47 | #include "block/block_shift.cuh" 48 | 49 | // Device 50 | #include "device/device_histogram.cuh" 51 | #include "device/device_partition.cuh" 52 | #include "device/device_radix_sort.cuh" 53 | #include "device/device_reduce.cuh" 54 | #include "device/device_scan.cuh" 55 | #include "device/device_select.cuh" 56 | 57 | // Grid 58 | //#include "grid/grid_barrier.cuh" 59 | #include "grid/grid_even_share.cuh" 60 | #include "grid/grid_mapping.cuh" 61 | #include "grid/grid_queue.cuh" 62 | 63 | // Host 64 | #include "host/spinlock.cuh" 65 | 66 | // Thread 67 | #include "thread/thread_load.cuh" 68 | #include "thread/thread_operators.cuh" 69 | #include "thread/thread_reduce.cuh" 70 | #include "thread/thread_scan.cuh" 71 | #include "thread/thread_store.cuh" 72 | 73 | // Warp 74 | #include "warp/warp_reduce.cuh" 75 | #include "warp/warp_scan.cuh" 76 | 77 | // Iterator 78 | #include "iterator/arg_index_input_iterator.cuh" 79 | #include "iterator/cache_modified_input_iterator.cuh" 80 | #include "iterator/cache_modified_output_iterator.cuh" 81 | #include "iterator/constant_input_iterator.cuh" 82 | #include "iterator/counting_input_iterator.cuh" 83 | #include "iterator/tex_obj_input_iterator.cuh" 84 | #include "iterator/tex_ref_input_iterator.cuh" 85 | #include "iterator/transform_input_iterator.cuh" 86 | 87 | // Util 88 | #include "util_allocator.cuh" 89 | #include "util_arch.cuh" 90 | #include "util_debug.cuh" 91 | #include "util_device.cuh" 92 | #include "util_macro.cuh" 93 | #include "util_ptx.cuh" 94 | #include "util_type.cuh" 95 | 96 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/grid/grid_barrier.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_debug.cuh" 37 | #include "../util_namespace.cuh" 38 | #include "../thread/thread_load.cuh" 39 | 40 | /// Optional outer namespace(s) 41 | CUB_NS_PREFIX 42 | 43 | /// CUB namespace 44 | namespace cub { 45 | 46 | 47 | /** 48 | * \addtogroup GridModule 49 | * @{ 50 | */ 51 | 52 | 53 | /** 54 | * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid 55 | */ 56 | class GridBarrier 57 | { 58 | protected : 59 | 60 | typedef unsigned int SyncFlag; 61 | 62 | // Counters in global device memory 63 | SyncFlag* d_sync; 64 | 65 | public: 66 | 67 | /** 68 | * Constructor 69 | */ 70 | GridBarrier() : d_sync(NULL) {} 71 | 72 | 73 | /** 74 | * Synchronize 75 | */ 76 | __device__ __forceinline__ void Sync() const 77 | { 78 | volatile SyncFlag *d_vol_sync = d_sync; 79 | 80 | // Threadfence and syncthreads to make sure global writes are visible before 81 | // thread-0 reports in with its sync counter 82 | __threadfence(); 83 | __syncthreads(); 84 | 85 | if (blockIdx.x == 0) 86 | { 87 | // Report in ourselves 88 | if (threadIdx.x == 0) 89 | { 90 | d_vol_sync[blockIdx.x] = 1; 91 | } 92 | 93 | __syncthreads(); 94 | 95 | // Wait for everyone else to report in 96 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 97 | { 98 | while (ThreadLoad(d_sync + peer_block) == 0) 99 | { 100 | __threadfence_block(); 101 | } 102 | } 103 | 104 | __syncthreads(); 105 | 106 | // Let everyone know it's safe to proceed 107 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 108 | { 109 | d_vol_sync[peer_block] = 0; 110 | } 111 | } 112 | else 113 | { 114 | if (threadIdx.x == 0) 115 | { 116 | // Report in 117 | d_vol_sync[blockIdx.x] = 1; 118 | 119 | // Wait for acknowledgment 120 | while (ThreadLoad(d_sync + blockIdx.x) == 1) 121 | { 122 | __threadfence_block(); 123 | } 124 | } 125 | 126 | __syncthreads(); 127 | } 128 | } 129 | }; 130 | 131 | 132 | /** 133 | * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. 134 | * 135 | * Uses RAII for lifetime, i.e., device resources are reclaimed when 136 | * the destructor is called. 
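// Illustrative sketch (not part of the original CUB source): typical use pairs
// GridBarrierLifetime on the host with GridBarrier::Sync() in the kernel. The
// grid must be small enough that all thread blocks are co-resident, otherwise
// the software barrier cannot make progress (MyKernel, grid_size and
// block_size are hypothetical names):
//
//   __global__ void MyKernel(cub::GridBarrier barrier)
//   {
//       // ... phase 1: produce data visible to other blocks ...
//       barrier.Sync();          // all thread blocks rendezvous here
//       // ... phase 2: consume data produced by other blocks ...
//   }
//
//   cub::GridBarrierLifetime barrier;
//   barrier.Setup(grid_size);    // lazily allocates and zeroes the sync counters
//   MyKernel<<<grid_size, block_size>>>(barrier);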
137 | */ 138 | class GridBarrierLifetime : public GridBarrier 139 | { 140 | protected: 141 | 142 | // Number of bytes backed by d_sync 143 | size_t sync_bytes; 144 | 145 | public: 146 | 147 | /** 148 | * Constructor 149 | */ 150 | GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} 151 | 152 | 153 | /** 154 | * DeviceFrees and resets the progress counters 155 | */ 156 | cudaError_t HostReset() 157 | { 158 | cudaError_t retval = cudaSuccess; 159 | if (d_sync) 160 | { 161 | CubDebug(retval = cudaFree(d_sync)); 162 | d_sync = NULL; 163 | } 164 | sync_bytes = 0; 165 | return retval; 166 | } 167 | 168 | 169 | /** 170 | * Destructor 171 | */ 172 | virtual ~GridBarrierLifetime() 173 | { 174 | HostReset(); 175 | } 176 | 177 | 178 | /** 179 | * Sets up the progress counters for the next kernel launch (lazily 180 | * allocating and initializing them if necessary) 181 | */ 182 | cudaError_t Setup(int sweep_grid_size) 183 | { 184 | cudaError_t retval = cudaSuccess; 185 | do { 186 | size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); 187 | if (new_sync_bytes > sync_bytes) 188 | { 189 | if (d_sync) 190 | { 191 | if (CubDebug(retval = cudaFree(d_sync))) break; 192 | } 193 | 194 | sync_bytes = new_sync_bytes; 195 | 196 | // Allocate and initialize to zero 197 | if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; 198 | if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; 199 | } 200 | } while (0); 201 | 202 | return retval; 203 | } 204 | }; 205 | 206 | 207 | /** @} */ // end group GridModule 208 | 209 | } // CUB namespace 210 | CUB_NS_POSTFIX // Optional outer namespace(s) 211 | 212 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/grid/grid_even_share.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #include "../util_namespace.cuh" 38 | #include "../util_macro.cuh" 39 | 40 | /// Optional outer namespace(s) 41 | CUB_NS_PREFIX 42 | 43 | /// CUB namespace 44 | namespace cub { 45 | 46 | 47 | /** 48 | * \addtogroup GridModule 49 | * @{ 50 | */ 51 | 52 | 53 | /** 54 | * \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). 55 | * 56 | * \par Overview 57 | * GridEvenShare indicates which sections of input are to be mapped onto which threadblocks. 58 | * Threadblocks may receive one of three different amounts of work: "big", "normal", 59 | * and "last". The "big" workloads are one scheduling grain larger than "normal". The "last" work unit 60 | * for the last threadblock may be partially-full if the input is not an even multiple of 61 | * the scheduling grain size. 62 | * 63 | * \par 64 | * Before invoking a child grid, a parent thread will typically construct an instance of 65 | * GridEvenShare. The instance can be passed to child threadblocks which can 66 | * initialize their per-threadblock offsets using \p BlockInit(). 67 | * 68 | * \tparam Offset Signed integer type for global offsets 69 | */ 70 | template 71 | struct GridEvenShare 72 | { 73 | Offset total_grains; 74 | int big_blocks; 75 | Offset big_share; 76 | Offset normal_share; 77 | Offset normal_base_offset; 78 | 79 | /// Total number of input items 80 | Offset num_items; 81 | 82 | /// Grid size in threadblocks 83 | int grid_size; 84 | 85 | /// Offset into input marking the beginning of the owning thread block's segment of input tiles 86 | Offset block_offset; 87 | 88 | /// Offset into input of marking the end (one-past) of the owning thread block's segment of input tiles 89 | Offset block_end; 90 | 91 | /** 92 | * \brief Default constructor. Zero-initializes block-specific fields. 93 | */ 94 | __host__ __device__ __forceinline__ GridEvenShare() : 95 | num_items(0), 96 | grid_size(0), 97 | block_offset(0), 98 | block_end(0) {} 99 | 100 | /** 101 | * \brief Constructor. Initializes the grid-specific members \p num_items and \p grid_size. To be called prior prior to kernel launch) 102 | */ 103 | __host__ __device__ __forceinline__ GridEvenShare( 104 | Offset num_items, ///< Total number of input items 105 | int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) 106 | int schedule_granularity) ///< Granularity by which the input can be parcelled into and distributed among threablocks. 
Usually the thread block's native tile size (or a multiple thereof. 107 | { 108 | this->num_items = num_items; 109 | this->block_offset = num_items; 110 | this->block_end = num_items; 111 | this->total_grains = (num_items + schedule_granularity - 1) / schedule_granularity; 112 | this->grid_size = CUB_MIN(total_grains, max_grid_size); 113 | Offset grains_per_block = total_grains / grid_size; 114 | this->big_blocks = total_grains - (grains_per_block * grid_size); // leftover grains go to big blocks 115 | this->normal_share = grains_per_block * schedule_granularity; 116 | this->normal_base_offset = big_blocks * schedule_granularity; 117 | this->big_share = normal_share + schedule_granularity; 118 | } 119 | 120 | 121 | 122 | /** 123 | * \brief Initializes ranges for the specified partition index 124 | */ 125 | __device__ __forceinline__ void Init(int partition_id) 126 | { 127 | if (partition_id < big_blocks) 128 | { 129 | // This threadblock gets a big share of grains (grains_per_block + 1) 130 | block_offset = (partition_id * big_share); 131 | block_end = block_offset + big_share; 132 | } 133 | else if (partition_id < total_grains) 134 | { 135 | // This threadblock gets a normal share of grains (grains_per_block) 136 | block_offset = normal_base_offset + (partition_id * normal_share); 137 | block_end = CUB_MIN(num_items, block_offset + normal_share); 138 | } 139 | } 140 | 141 | 142 | /** 143 | * \brief Initializes ranges for the current thread block (e.g., to be called by each threadblock after startup) 144 | */ 145 | __device__ __forceinline__ void BlockInit() 146 | { 147 | Init(blockIdx.x); 148 | } 149 | 150 | 151 | /** 152 | * Print to stdout 153 | */ 154 | __host__ __device__ __forceinline__ void Print() 155 | { 156 | printf( 157 | #if (CUB_PTX_ARCH > 0) 158 | "\tthreadblock(%d) " 159 | "block_offset(%lu) " 160 | "block_end(%lu) " 161 | #endif 162 | "num_items(%lu) " 163 | "total_grains(%lu) " 164 | "big_blocks(%lu) " 165 | "big_share(%lu) " 166 | "normal_share(%lu)\n", 167 | #if (CUB_PTX_ARCH > 0) 168 | blockIdx.x, 169 | (unsigned long) block_offset, 170 | (unsigned long) block_end, 171 | #endif 172 | (unsigned long) num_items, 173 | (unsigned long) total_grains, 174 | (unsigned long) big_blocks, 175 | (unsigned long) big_share, 176 | (unsigned long) normal_share); 177 | } 178 | }; 179 | 180 | 181 | 182 | /** @} */ // end group GridModule 183 | 184 | } // CUB namespace 185 | CUB_NS_POSTFIX // Optional outer namespace(s) 186 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/grid/grid_mapping.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \addtogroup GridModule 47 | * @{ 48 | */ 49 | 50 | 51 | /****************************************************************************** 52 | * Mapping policies 53 | *****************************************************************************/ 54 | 55 | 56 | /** 57 | * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 58 | */ 59 | enum GridMappingStrategy 60 | { 61 | /** 62 | * \brief An "even-share" strategy for assigning input tiles to thread blocks. 63 | * 64 | * \par Overview 65 | * The input is evenly partitioned into \p p segments, where \p p is 66 | * constant and corresponds loosely to the number of thread blocks that may 67 | * actively reside on the target device. Each segment is comprised of 68 | * consecutive tiles, where a tile is a small, constant-sized unit of input 69 | * to be processed to completion before the thread block terminates or 70 | * obtains more work. The kernel invokes \p p thread blocks, each 71 | * of which iteratively consumes a segment of n/p elements 72 | * in tile-size increments. 73 | */ 74 | GRID_MAPPING_EVEN_SHARE, 75 | 76 | /** 77 | * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. 78 | * 79 | * \par Overview 80 | * The input is treated as a queue to be dynamically consumed by a grid of 81 | * thread blocks. Work is atomically dequeued in tiles, where a tile is a 82 | * unit of input to be processed to completion before the thread block 83 | * terminates or obtains more work. The grid size \p p is constant, 84 | * loosely corresponding to the number of thread blocks that may actively 85 | * reside on the target device. 
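// Worked example (illustrative, not part of the original CUB source), for
// contrast with this dynamic strategy: the even-share math implemented by
// GridEvenShare above, with num_items = 1000, schedule_granularity = 128 and
// max_grid_size = 3, gives
//
//   total_grains     = ceil(1000 / 128)  = 8
//   grid_size        = min(8, 3)         = 3
//   grains per block = 8 / 3             = 2, leaving 8 - 2*3 = 2 leftover grains
//
// so two "big" blocks get 3 grains (384 items) and one "normal" block gets
// 2 grains, and Init() yields the ranges [0, 384), [384, 768) and
// [768, 1000), the last one clamped to num_items.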
86 | */ 87 | GRID_MAPPING_DYNAMIC, 88 | }; 89 | 90 | 91 | /** @} */ // end group GridModule 92 | 93 | } // CUB namespace 94 | CUB_NS_POSTFIX // Optional outer namespace(s) 95 | 96 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/grid/grid_queue.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridQueue is a descriptor utility for dynamic queue management. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | #include "../util_debug.cuh" 38 | 39 | /// Optional outer namespace(s) 40 | CUB_NS_PREFIX 41 | 42 | /// CUB namespace 43 | namespace cub { 44 | 45 | 46 | /** 47 | * \addtogroup GridModule 48 | * @{ 49 | */ 50 | 51 | 52 | /** 53 | * \brief GridQueue is a descriptor utility for dynamic queue management. 54 | * 55 | * \par Overview 56 | * GridQueue descriptors provides abstractions for "filling" or 57 | * "draining" globally-shared vectors. 58 | * 59 | * \par 60 | * A "filling" GridQueue works by atomically-adding to a zero-initialized counter, 61 | * returning a unique offset for the calling thread to write its items. 62 | * The GridQueue maintains the total "fill-size". The fill counter must be reset 63 | * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that 64 | * will be filling. 65 | * 66 | * \par 67 | * Similarly, a "draining" GridQueue works by works by atomically-incrementing a 68 | * zero-initialized counter, returning a unique offset for the calling thread to 69 | * read its items. 
Threads can safely drain until the array's logical fill-size is 70 | * exceeded. The drain counter must be reset using GridQueue::ResetDrain or 71 | * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that 72 | * will be filling. (For dynamic work distribution of existing data, the corresponding fill-size 73 | * is simply the number of elements in the array.) 74 | * 75 | * \par 76 | * Iterative work management can be implemented simply with a pair of flip-flopping 77 | * work buffers, each with an associated set of fill and drain GridQueue descriptors. 78 | * 79 | * \tparam Offset Signed integer type for global offsets 80 | */ 81 | template 82 | class GridQueue 83 | { 84 | private: 85 | 86 | /// Counter indices 87 | enum 88 | { 89 | FILL = 0, 90 | DRAIN = 1, 91 | }; 92 | 93 | /// Pair of counters 94 | Offset *d_counters; 95 | 96 | public: 97 | 98 | /// Returns the device allocation size in bytes needed to construct a GridQueue instance 99 | __host__ __device__ __forceinline__ 100 | static size_t AllocationSize() 101 | { 102 | return sizeof(Offset) * 2; 103 | } 104 | 105 | 106 | /// Constructs an invalid GridQueue descriptor 107 | __host__ __device__ __forceinline__ GridQueue() 108 | : 109 | d_counters(NULL) 110 | {} 111 | 112 | 113 | /// Constructs a GridQueue descriptor around the device storage allocation 114 | __host__ __device__ __forceinline__ GridQueue( 115 | void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize(). 116 | : 117 | d_counters((Offset*) d_storage) 118 | {} 119 | 120 | 121 | /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. 122 | __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain( 123 | Offset fill_size, 124 | cudaStream_t stream = 0) 125 | { 126 | #if (CUB_PTX_ARCH > 0) 127 | d_counters[FILL] = fill_size; 128 | d_counters[DRAIN] = 0; 129 | return cudaSuccess; 130 | #else 131 | Offset counters[2]; 132 | counters[FILL] = fill_size; 133 | counters[DRAIN] = 0; 134 | return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(Offset) * 2, cudaMemcpyHostToDevice, stream)); 135 | #endif 136 | } 137 | 138 | 139 | /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining. 140 | __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) 141 | { 142 | #if (CUB_PTX_ARCH > 0) 143 | d_counters[DRAIN] = 0; 144 | return cudaSuccess; 145 | #else 146 | return FillAndResetDrain(0, stream); 147 | #endif 148 | } 149 | 150 | 151 | /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. 152 | __host__ __device__ __forceinline__ cudaError_t ResetFill() 153 | { 154 | #if (CUB_PTX_ARCH > 0) 155 | d_counters[FILL] = 0; 156 | return cudaSuccess; 157 | #else 158 | return CubDebug(cudaMemset(d_counters + FILL, 0, sizeof(Offset))); 159 | #endif 160 | } 161 | 162 | 163 | /// Returns the fill-size established by the parent or by the previous kernel. 
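// Illustrative sketch (not part of the original CUB source): a typical drain
// round trip using the methods of this class (d_storage, DrainKernel and
// tile_items are hypothetical names):
//
//   // Host: allocate the counter pair and publish the fill-size
//   void *d_storage;
//   cudaMalloc(&d_storage, cub::GridQueue<int>::AllocationSize());
//   cub::GridQueue<int> queue(d_storage);
//   queue.FillAndResetDrain(num_items);
//
//   // Device: thread0 of each block atomically dequeues a tile of work
//   __global__ void DrainKernel(cub::GridQueue<int> queue, int num_items, int tile_items)
//   {
//       __shared__ int tile_offset;
//       if (threadIdx.x == 0)
//           tile_offset = queue.Drain(tile_items);
//       __syncthreads();
//
//       if (tile_offset < num_items)
//       {
//           // process items in [tile_offset, min(num_items, tile_offset + tile_items))
//       }
//   }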
164 | __host__ __device__ __forceinline__ cudaError_t FillSize( 165 | Offset &fill_size, 166 | cudaStream_t stream = 0) 167 | { 168 | #if (CUB_PTX_ARCH > 0) 169 | fill_size = d_counters[FILL]; 170 | return cudaSuccess; 171 | #else 172 | return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(Offset), cudaMemcpyDeviceToHost, stream)); 173 | #endif 174 | } 175 | 176 | 177 | /// Drain num_items. Returns offset from which to read items. 178 | __device__ __forceinline__ Offset Drain(Offset num_items) 179 | { 180 | return atomicAdd(d_counters + DRAIN, num_items); 181 | } 182 | 183 | 184 | /// Fill num_items. Returns offset from which to write items. 185 | __device__ __forceinline__ Offset Fill(Offset num_items) 186 | { 187 | return atomicAdd(d_counters + FILL, num_items); 188 | } 189 | }; 190 | 191 | 192 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 193 | 194 | 195 | /** 196 | * Reset grid queue (call with 1 block of 1 thread) 197 | */ 198 | template 199 | __global__ void FillAndResetDrainKernel( 200 | GridQueue grid_queue, 201 | Offset num_items) 202 | { 203 | grid_queue.FillAndResetDrain(num_items); 204 | } 205 | 206 | 207 | 208 | #endif // DOXYGEN_SHOULD_SKIP_THIS 209 | 210 | 211 | /** @} */ // end group GridModule 212 | 213 | } // CUB namespace 214 | CUB_NS_POSTFIX // Optional outer namespace(s) 215 | 216 | 217 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/host/spinlock.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple x86/x64 atomic spinlock, portable across MS Windows (cl.exe) & Linux (g++) 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #if defined(_WIN32) || defined(_WIN64) 38 | #include 39 | #include 40 | #undef small // Windows is terrible for polluting macro namespace 41 | 42 | /** 43 | * Compiler read/write barrier 44 | */ 45 | #pragma intrinsic(_ReadWriteBarrier) 46 | 47 | #endif 48 | 49 | #include "../util_namespace.cuh" 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | 58 | #if defined(_MSC_VER) 59 | 60 | // Microsoft VC++ 61 | typedef long Spinlock; 62 | 63 | #else 64 | 65 | // GNU g++ 66 | typedef int Spinlock; 67 | 68 | /** 69 | * Compiler read/write barrier 70 | */ 71 | __forceinline__ void _ReadWriteBarrier() 72 | { 73 | __sync_synchronize(); 74 | } 75 | 76 | /** 77 | * Atomic exchange 78 | */ 79 | __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) 80 | { 81 | // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier 82 | _ReadWriteBarrier(); 83 | return __sync_lock_test_and_set(Target, Value); 84 | } 85 | 86 | /** 87 | * Pause instruction to prevent excess processor bus usage 88 | */ 89 | __forceinline__ void YieldProcessor() 90 | { 91 | #ifndef __arm__ 92 | asm volatile("pause\n": : :"memory"); 93 | #endif // __arm__ 94 | } 95 | 96 | #endif // defined(_MSC_VER) 97 | 98 | /** 99 | * Return when the specified spinlock has been acquired 100 | */ 101 | __forceinline__ void Lock(volatile Spinlock *lock) 102 | { 103 | while (1) 104 | { 105 | if (!_InterlockedExchange(lock, 1)) return; 106 | while (*lock) YieldProcessor(); 107 | } 108 | } 109 | 110 | 111 | /** 112 | * Release the specified spinlock 113 | */ 114 | __forceinline__ void Unlock(volatile Spinlock *lock) 115 | { 116 | _ReadWriteBarrier(); 117 | *lock = 0; 118 | } 119 | 120 | 121 | } // CUB namespace 122 | CUB_NS_POSTFIX // Optional outer namespace(s) 123 | 124 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/iterator/cache_modified_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
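// Illustrative sketch (not part of the original CUB source): the host-side
// spinlock defined in host/spinlock.cuh above is used by bracketing a critical
// section with Lock and Unlock on a zero-initialized flag (g_lock and
// ThreadSafeUpdate are hypothetical names):
//
//   static cub::Spinlock g_lock = 0;
//
//   void ThreadSafeUpdate()
//   {
//       cub::Lock(&g_lock);
//       // ... critical section ...
//       cub::Unlock(&g_lock);
//   }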
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | #if (THRUST_VERSION >= 100700) 45 | // This iterator is compatible with Thrust API 1.7 and newer 46 | #include 47 | #include 48 | #endif // THRUST_VERSION 49 | 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | 58 | 59 | /** 60 | * \addtogroup UtilIterator 61 | * @{ 62 | */ 63 | 64 | 65 | /** 66 | * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. 67 | * 68 | * \par Overview 69 | * - CacheModifiedInputIterator is a random-access input iterator that wraps a native 70 | * device pointer of type ValueType*. \p ValueType references are 71 | * made by reading \p ValueType values through loads modified by \p MODIFIER. 72 | * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", 73 | * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). 74 | * - Can be constructed, manipulated, and exchanged within and between host and device 75 | * functions, but can only be dereferenced within device functions. 76 | * - Compatible with Thrust API v1.7 or newer. 77 | * 78 | * \par Snippet 79 | * The code snippet below illustrates the use of \p CacheModifiedInputIterator to 80 | * dereference a device array of double using the "ldg" PTX load modifier 81 | * (i.e., load values through texture cache). 
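// A fully parameterized form of the snippet that follows (template arguments
// spelled out; illustrative, not verbatim from the original header) would read:
//
//   #include <cub/iterator/cache_modified_input_iterator.cuh>   // or <cub/cub.cuh>
//
//   // Declare, allocate, and initialize a device array
//   double *d_in;   // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
//
//   // Create an iterator wrapper that loads through the read-only data cache
//   cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> itr(d_in);
//
//   // Within device code:
//   printf("%f\n", itr[0]);   // 8.0
//   printf("%f\n", itr[6]);   // 9.0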
82 | * \par 83 | * \code 84 | * #include // or equivalently 85 | * 86 | * // Declare, allocate, and initialize a device array 87 | * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] 88 | * 89 | * // Create an iterator wrapper 90 | * cub::CacheModifiedInputIterator itr(d_in); 91 | * 92 | * // Within device code: 93 | * printf("%f\n", itr[0]); // 8.0 94 | * printf("%f\n", itr[1]); // 6.0 95 | * printf("%f\n", itr[6]); // 9.0 96 | * 97 | * \endcode 98 | * 99 | * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data 100 | * \tparam ValueType The value type of this iterator 101 | * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) 102 | */ 103 | template < 104 | CacheLoadModifier MODIFIER, 105 | typename ValueType, 106 | typename Offset = ptrdiff_t> 107 | class CacheModifiedInputIterator 108 | { 109 | public: 110 | 111 | // Required iterator traits 112 | typedef CacheModifiedInputIterator self_type; ///< My own type 113 | typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another 114 | typedef ValueType value_type; ///< The type of the element the iterator can point to 115 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 116 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 117 | 118 | #if (THRUST_VERSION >= 100700) 119 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 120 | typedef typename thrust::detail::iterator_facade_category< 121 | thrust::device_system_tag, 122 | thrust::random_access_traversal_tag, 123 | value_type, 124 | reference 125 | >::type iterator_category; ///< The iterator category 126 | #else 127 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 128 | #endif // THRUST_VERSION 129 | 130 | 131 | private: 132 | 133 | ValueType* ptr; 134 | 135 | public: 136 | 137 | /// Constructor 138 | __host__ __device__ __forceinline__ CacheModifiedInputIterator( 139 | ValueType* ptr) ///< Native pointer to wrap 140 | : 141 | ptr(ptr) 142 | {} 143 | 144 | /// Postfix increment 145 | __host__ __device__ __forceinline__ self_type operator++(int) 146 | { 147 | self_type retval = *this; 148 | ptr++; 149 | return retval; 150 | } 151 | 152 | /// Prefix increment 153 | __host__ __device__ __forceinline__ self_type operator++() 154 | { 155 | ptr++; 156 | return *this; 157 | } 158 | 159 | /// Indirection 160 | __host__ __device__ __forceinline__ reference operator*() const 161 | { 162 | return ThreadLoad(ptr); 163 | } 164 | 165 | /// Addition 166 | template 167 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 168 | { 169 | self_type retval(ptr + n); 170 | return retval; 171 | } 172 | 173 | /// Addition assignment 174 | template 175 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 176 | { 177 | ptr += n; 178 | return *this; 179 | } 180 | 181 | /// Subtraction 182 | template 183 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 184 | { 185 | self_type retval(ptr - n); 186 | return retval; 187 | } 188 | 189 | /// Subtraction assignment 190 | template 191 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 192 | { 193 | ptr -= n; 194 | return *this; 195 | } 196 | 197 | /// Distance 198 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 199 | { 200 | return ptr - other.ptr; 201 | } 202 | 
203 | /// Array subscript 204 | template 205 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 206 | { 207 | return ThreadLoad(ptr + n); 208 | } 209 | 210 | /// Structure dereference 211 | __host__ __device__ __forceinline__ pointer operator->() 212 | { 213 | return &ThreadLoad(ptr); 214 | } 215 | 216 | /// Equal to 217 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 218 | { 219 | return (ptr == rhs.ptr); 220 | } 221 | 222 | /// Not equal to 223 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 224 | { 225 | return (ptr != rhs.ptr); 226 | } 227 | 228 | /// ostream operator 229 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 230 | { 231 | return os; 232 | } 233 | }; 234 | 235 | 236 | 237 | /** @} */ // end group UtilIterator 238 | 239 | } // CUB namespace 240 | CUB_NS_POSTFIX // Optional outer namespace(s) 241 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/iterator/constant_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_namespace.cuh" 42 | 43 | #if (THRUST_VERSION >= 100700) 44 | // This iterator is compatible with Thrust API 1.7 and newer 45 | #include 46 | #include 47 | #endif // THRUST_VERSION 48 | 49 | 50 | /// Optional outer namespace(s) 51 | CUB_NS_PREFIX 52 | 53 | /// CUB namespace 54 | namespace cub { 55 | 56 | 57 | /** 58 | * \addtogroup UtilIterator 59 | * @{ 60 | */ 61 | 62 | 63 | /** 64 | * \brief A random-access input generator for dereferencing a sequence of homogeneous values 65 | * 66 | * \par Overview 67 | * - Read references to a ConstantInputIterator iterator always return the supplied constant 68 | * of type \p ValueType. 69 | * - Can be used with any data type. 70 | * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device 71 | * functions. 72 | * - Compatible with Thrust API v1.7 or newer. 73 | * 74 | * \par Snippet 75 | * The code snippet below illustrates the use of \p ConstantInputIterator to 76 | * dereference a sequence of homogeneous doubles. 77 | * \par 78 | * \code 79 | * #include // or equivalently 80 | * 81 | * cub::ConstantInputIterator itr(5.0); 82 | * 83 | * printf("%f\n", itr[0]); // 5.0 84 | * printf("%f\n", itr[1]); // 5.0 85 | * printf("%f\n", itr[2]); // 5.0 86 | * printf("%f\n", itr[50]); // 5.0 87 | * 88 | * \endcode 89 | * 90 | * \tparam ValueType The value type of this iterator 91 | * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) 92 | */ 93 | template < 94 | typename ValueType, 95 | typename Offset = ptrdiff_t> 96 | class ConstantInputIterator 97 | { 98 | public: 99 | 100 | // Required iterator traits 101 | typedef ConstantInputIterator self_type; ///< My own type 102 | typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another 103 | typedef ValueType value_type; ///< The type of the element the iterator can point to 104 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 105 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 106 | 107 | #if (THRUST_VERSION >= 100700) 108 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 109 | typedef typename thrust::detail::iterator_facade_category< 110 | thrust::any_system_tag, 111 | thrust::random_access_traversal_tag, 112 | value_type, 113 | reference 114 | >::type iterator_category; ///< The iterator category 115 | #else 116 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 117 | #endif // THRUST_VERSION 118 | 119 | private: 120 | 121 | ValueType val; 122 | Offset offset; 123 | #ifdef _WIN32 124 | Offset pad[CUB_MAX(1, (16 / sizeof(Offset) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) 125 | #endif 126 | 127 | public: 128 | 129 | /// Constructor 130 | __host__ __device__ __forceinline__ ConstantInputIterator( 131 | ValueType val, ///< Starting value for the iterator instance to report 132 | Offset offset = 0) ///< Base offset 133 | : 134 | val(val), 135 | offset(offset) 136 | {} 137 | 138 | /// Postfix increment 139 | __host__ __device__ __forceinline__ self_type 
operator++(int) 140 | { 141 | self_type retval = *this; 142 | offset++; 143 | return retval; 144 | } 145 | 146 | /// Prefix increment 147 | __host__ __device__ __forceinline__ self_type operator++() 148 | { 149 | offset++; 150 | return *this; 151 | } 152 | 153 | /// Indirection 154 | __host__ __device__ __forceinline__ reference operator*() const 155 | { 156 | return val; 157 | } 158 | 159 | /// Addition 160 | template 161 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 162 | { 163 | self_type retval(val, offset + n); 164 | return retval; 165 | } 166 | 167 | /// Addition assignment 168 | template 169 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 170 | { 171 | offset += n; 172 | return *this; 173 | } 174 | 175 | /// Subtraction 176 | template 177 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 178 | { 179 | self_type retval(val, offset - n); 180 | return retval; 181 | } 182 | 183 | /// Subtraction assignment 184 | template 185 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 186 | { 187 | offset -= n; 188 | return *this; 189 | } 190 | 191 | /// Distance 192 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 193 | { 194 | return offset - other.offset; 195 | } 196 | 197 | /// Array subscript 198 | template 199 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 200 | { 201 | return val; 202 | } 203 | 204 | /// Structure dereference 205 | __host__ __device__ __forceinline__ pointer operator->() 206 | { 207 | return &val; 208 | } 209 | 210 | /// Equal to 211 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 212 | { 213 | return (offset == rhs.offset) && ((val == rhs.val)); 214 | } 215 | 216 | /// Not equal to 217 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 218 | { 219 | return (offset != rhs.offset) || (val!= rhs.val); 220 | } 221 | 222 | /// ostream operator 223 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 224 | { 225 | os << "[" << itr.val << "," << itr.offset << "]"; 226 | return os; 227 | } 228 | 229 | }; 230 | 231 | 232 | /** @} */ // end group UtilIterator 233 | 234 | } // CUB namespace 235 | CUB_NS_POSTFIX // Optional outer namespace(s) 236 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/iterator/counting_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | #if (THRUST_VERSION >= 100700) 45 | // This iterator is compatible with Thrust API 1.7 and newer 46 | #include 47 | #include 48 | #endif // THRUST_VERSION 49 | 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | /** 58 | * \addtogroup UtilIterator 59 | * @{ 60 | */ 61 | 62 | /** 63 | * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. 64 | * 65 | * \par Overview 66 | * - After initializing a CountingInputIterator to a certain integer \p base, read references 67 | * at \p offset will return the value \p base + \p offset. 68 | * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device 69 | * functions. 70 | * - Compatible with Thrust API v1.7 or newer. 71 | * 72 | * \par Snippet 73 | * The code snippet below illustrates the use of \p CountingInputIterator to 74 | * dereference a sequence of incrementing integers. 
75 | * \par 76 | * \code 77 | * #include // or equivalently 78 | * 79 | * cub::CountingInputIterator itr(5); 80 | * 81 | * printf("%d\n", itr[0]); // 5 82 | * printf("%d\n", itr[1]); // 6 83 | * printf("%d\n", itr[2]); // 7 84 | * printf("%d\n", itr[50]); // 55 85 | * 86 | * \endcode 87 | * 88 | * \tparam ValueType The value type of this iterator 89 | * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) 90 | */ 91 | template < 92 | typename ValueType, 93 | typename Offset = ptrdiff_t> 94 | class CountingInputIterator 95 | { 96 | public: 97 | 98 | // Required iterator traits 99 | typedef CountingInputIterator self_type; ///< My own type 100 | typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another 101 | typedef ValueType value_type; ///< The type of the element the iterator can point to 102 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 103 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 104 | 105 | #if (THRUST_VERSION >= 100700) 106 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 107 | typedef typename thrust::detail::iterator_facade_category< 108 | thrust::any_system_tag, 109 | thrust::random_access_traversal_tag, 110 | value_type, 111 | reference 112 | >::type iterator_category; ///< The iterator category 113 | #else 114 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 115 | #endif // THRUST_VERSION 116 | 117 | private: 118 | 119 | ValueType val; 120 | 121 | public: 122 | 123 | /// Constructor 124 | __host__ __device__ __forceinline__ CountingInputIterator( 125 | const ValueType &val) ///< Starting value for the iterator instance to report 126 | : 127 | val(val) 128 | {} 129 | 130 | /// Postfix increment 131 | __host__ __device__ __forceinline__ self_type operator++(int) 132 | { 133 | self_type retval = *this; 134 | val++; 135 | return retval; 136 | } 137 | 138 | /// Prefix increment 139 | __host__ __device__ __forceinline__ self_type operator++() 140 | { 141 | val++; 142 | return *this; 143 | } 144 | 145 | /// Indirection 146 | __host__ __device__ __forceinline__ reference operator*() const 147 | { 148 | return val; 149 | } 150 | 151 | /// Addition 152 | template 153 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 154 | { 155 | self_type retval(val + n); 156 | return retval; 157 | } 158 | 159 | /// Addition assignment 160 | template 161 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 162 | { 163 | val += n; 164 | return *this; 165 | } 166 | 167 | /// Subtraction 168 | template 169 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 170 | { 171 | self_type retval(val - n); 172 | return retval; 173 | } 174 | 175 | /// Subtraction assignment 176 | template 177 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 178 | { 179 | val -= n; 180 | return *this; 181 | } 182 | 183 | /// Distance 184 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 185 | { 186 | return val - other.val; 187 | } 188 | 189 | /// Array subscript 190 | template 191 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 192 | { 193 | return val + n; 194 | } 195 | 196 | /// Structure dereference 197 | __host__ __device__ __forceinline__ pointer operator->() 198 | { 199 | return &val; 200 | } 201 | 202 | /// 
Equal to 203 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 204 | { 205 | return (val == rhs.val); 206 | } 207 | 208 | /// Not equal to 209 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 210 | { 211 | return (val != rhs.val); 212 | } 213 | 214 | /// ostream operator 215 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 216 | { 217 | os << "[" << itr.val << "]"; 218 | return os; 219 | } 220 | 221 | }; 222 | 223 | 224 | 225 | /** @} */ // end group UtilIterator 226 | 227 | } // CUB namespace 228 | CUB_NS_POSTFIX // Optional outer namespace(s) 229 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/thread/thread_operators.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple binary operator functor types 32 | */ 33 | 34 | /****************************************************************************** 35 | * Simple functor operators 36 | ******************************************************************************/ 37 | 38 | #pragma once 39 | 40 | #include "../util_macro.cuh" 41 | #include "../util_type.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | /// Optional outer namespace(s) 45 | CUB_NS_PREFIX 46 | 47 | /// CUB namespace 48 | namespace cub { 49 | 50 | 51 | /** 52 | * \addtogroup UtilModule 53 | * @{ 54 | */ 55 | 56 | /** 57 | * \brief Default equality functor 58 | */ 59 | struct Equality 60 | { 61 | /// Boolean equality operator, returns (a == b) 62 | template 63 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const 64 | { 65 | return a == b; 66 | } 67 | }; 68 | 69 | 70 | /** 71 | * \brief Default inequality functor 72 | */ 73 | struct Inequality 74 | { 75 | /// Boolean inequality operator, returns (a != b) 76 | template 77 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const 78 | { 79 | return a != b; 80 | } 81 | }; 82 | 83 | 84 | /** 85 | * \brief Inequality functor (wraps equality functor) 86 | */ 87 | template 88 | struct InequalityWrapper 89 | { 90 | /// Wrapped equality operator 91 | EqualityOp op; 92 | 93 | /// Constructor 94 | __host__ __device__ __forceinline__ 95 | InequalityWrapper(EqualityOp op) : op(op) {} 96 | 97 | /// Boolean inequality operator, returns (a != b) 98 | template 99 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const 100 | { 101 | return !op(a, b); 102 | } 103 | }; 104 | 105 | 106 | /** 107 | * \brief Default sum functor 108 | */ 109 | struct Sum 110 | { 111 | /// Boolean sum operator, returns a + b 112 | template 113 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const 114 | { 115 | return a + b; 116 | } 117 | }; 118 | 119 | 120 | /** 121 | * \brief Default max functor 122 | */ 123 | struct Max 124 | { 125 | /// Boolean max operator, returns (a > b) ? a : b 126 | template 127 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const 128 | { 129 | return CUB_MAX(a, b); 130 | } 131 | }; 132 | 133 | 134 | /** 135 | * \brief Arg max functor (keeps the value and offset of the first occurrence of the l item) 136 | */ 137 | struct ArgMax 138 | { 139 | /// Boolean max operator, preferring the item having the smaller offset in case of ties 140 | template 141 | __host__ __device__ __forceinline__ ItemOffsetPair operator()( 142 | const ItemOffsetPair &a, 143 | const ItemOffsetPair &b) const 144 | { 145 | if (a.value == b.value) 146 | return (b.offset < a.offset) ? b : a; 147 | 148 | return (b.value > a.value) ? b : a; 149 | } 150 | }; 151 | 152 | 153 | /** 154 | * \brief Default min functor 155 | */ 156 | struct Min 157 | { 158 | /// Boolean min operator, returns (a < b) ? 
a : b 159 | template 160 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const 161 | { 162 | return CUB_MIN(a, b); 163 | } 164 | }; 165 | 166 | 167 | /** 168 | * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item) 169 | */ 170 | struct ArgMin 171 | { 172 | /// Boolean min operator, preferring the item having the smaller offset in case of ties 173 | template 174 | __host__ __device__ __forceinline__ ItemOffsetPair operator()( 175 | const ItemOffsetPair &a, 176 | const ItemOffsetPair &b) const 177 | { 178 | if (a.value == b.value) 179 | return (b.offset < a.offset) ? b : a; 180 | 181 | return (b.value < a.value) ? b : a; 182 | } 183 | }; 184 | 185 | 186 | /** 187 | * \brief Default cast functor 188 | */ 189 | template 190 | struct Cast 191 | { 192 | /// Boolean max operator, returns (a > b) ? a : b 193 | template 194 | __host__ __device__ __forceinline__ B operator()(const A &a) const 195 | { 196 | return (B) a; 197 | } 198 | }; 199 | 200 | 201 | 202 | /** @} */ // end group UtilModule 203 | 204 | 205 | } // CUB namespace 206 | CUB_NS_POSTFIX // Optional outer namespace(s) 207 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/thread/thread_reduce.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential reduction over statically-sized array types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../thread/thread_operators.cuh" 37 | #include "../util_namespace.cuh" 38 | 39 | /// Optional outer namespace(s) 40 | CUB_NS_PREFIX 41 | 42 | /// CUB namespace 43 | namespace cub { 44 | 45 | /** 46 | * \addtogroup UtilModule 47 | * @{ 48 | */ 49 | 50 | /** 51 | * \name Sequential reduction over statically-sized array types 52 | * @{ 53 | */ 54 | 55 | 56 | template < 57 | int LENGTH, 58 | typename T, 59 | typename ReductionOp> 60 | __device__ __forceinline__ T ThreadReduce( 61 | T* input, ///< [in] Input array 62 | ReductionOp reduction_op, ///< [in] Binary reduction operator 63 | T prefix, ///< [in] Prefix to seed reduction with 64 | Int2Type length) 65 | { 66 | T addend = *input; 67 | prefix = reduction_op(prefix, addend); 68 | 69 | return ThreadReduce(input + 1, reduction_op, prefix, Int2Type()); 70 | } 71 | 72 | template < 73 | typename T, 74 | typename ReductionOp> 75 | __device__ __forceinline__ T ThreadReduce( 76 | T* input, ///< [in] Input array 77 | ReductionOp reduction_op, ///< [in] Binary reduction operator 78 | T prefix, ///< [in] Prefix to seed reduction with 79 | Int2Type<0> length) 80 | { 81 | return prefix; 82 | } 83 | 84 | 85 | /** 86 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. 87 | * 88 | * \tparam LENGTH Length of input array 89 | * \tparam T [inferred] The data type to be reduced. 90 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 91 | */ 92 | template < 93 | int LENGTH, 94 | typename T, 95 | typename ReductionOp> 96 | __device__ __forceinline__ T ThreadReduce( 97 | T* input, ///< [in] Input array 98 | ReductionOp reduction_op, ///< [in] Binary reduction operator 99 | T prefix) ///< [in] Prefix to seed reduction with 100 | { 101 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 102 | } 103 | 104 | 105 | /** 106 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. 107 | * 108 | * \tparam LENGTH Length of input array 109 | * \tparam T [inferred] The data type to be reduced. 110 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 111 | */ 112 | template < 113 | int LENGTH, 114 | typename T, 115 | typename ReductionOp> 116 | __device__ __forceinline__ T ThreadReduce( 117 | T* input, ///< [in] Input array 118 | ReductionOp reduction_op) ///< [in] Binary reduction operator 119 | { 120 | T prefix = input[0]; 121 | return ThreadReduce(input + 1, reduction_op, prefix); 122 | } 123 | 124 | 125 | /** 126 | * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. 127 | * 128 | * \tparam LENGTH [inferred] Length of \p input array 129 | * \tparam T [inferred] The data type to be reduced. 
130 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 131 | */ 132 | template < 133 | int LENGTH, 134 | typename T, 135 | typename ReductionOp> 136 | __device__ __forceinline__ T ThreadReduce( 137 | T (&input)[LENGTH], ///< [in] Input array 138 | ReductionOp reduction_op, ///< [in] Binary reduction operator 139 | T prefix) ///< [in] Prefix to seed reduction with 140 | { 141 | return ThreadReduce(input, reduction_op, prefix); 142 | } 143 | 144 | 145 | /** 146 | * \brief Serial reduction with the specified operator 147 | * 148 | * \tparam LENGTH [inferred] Length of \p input array 149 | * \tparam T [inferred] The data type to be reduced. 150 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 151 | */ 152 | template < 153 | int LENGTH, 154 | typename T, 155 | typename ReductionOp> 156 | __device__ __forceinline__ T ThreadReduce( 157 | T (&input)[LENGTH], ///< [in] Input array 158 | ReductionOp reduction_op) ///< [in] Binary reduction operator 159 | { 160 | return ThreadReduce((T*) input, reduction_op); 161 | } 162 | 163 | 164 | //@} end member group 165 | 166 | /** @} */ // end group UtilModule 167 | 168 | } // CUB namespace 169 | CUB_NS_POSTFIX // Optional outer namespace(s) 170 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/util_arch.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Static architectural properties by SM version. 
32 | */ 33 | 34 | #pragma once 35 | 36 | #include "util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \addtogroup UtilMgmt 47 | * @{ 48 | */ 49 | 50 | 51 | /// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). 52 | #ifndef __CUDA_ARCH__ 53 | #define CUB_PTX_ARCH 0 54 | #else 55 | #define CUB_PTX_ARCH __CUDA_ARCH__ 56 | #endif 57 | 58 | 59 | /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. 60 | #if (CUB_PTX_ARCH == 0) || defined(CUB_CDP) 61 | #define CUB_RUNTIME_ENABLED 62 | #define CUB_RUNTIME_FUNCTION __host__ __device__ 63 | #else 64 | #define CUB_RUNTIME_FUNCTION __host__ 65 | #endif 66 | 67 | 68 | 69 | /// Number of threads per warp (log) 70 | #define CUB_LOG_WARP_THREADS(arch) \ 71 | (5) 72 | 73 | /// Number of threads per warp 74 | #define CUB_WARP_THREADS(arch) \ 75 | (1 << CUB_LOG_WARP_THREADS(arch)) 76 | 77 | /// Number of smem banks (log) 78 | #define CUB_LOG_SMEM_BANKS(arch) \ 79 | ((arch >= 200) ? \ 80 | (5) : \ 81 | (4)) 82 | 83 | /// Number of smem banks 84 | #define CUB_SMEM_BANKS(arch) \ 85 | (1 << CUB_LOG_SMEM_BANKS(arch)) 86 | 87 | /// Number of bytes per smem bank 88 | #define CUB_SMEM_BANK_BYTES(arch) \ 89 | (4) 90 | 91 | /// Number of smem bytes provisioned per SM 92 | #define CUB_SMEM_BYTES(arch) \ 93 | ((arch >= 200) ? \ 94 | (48 * 1024) : \ 95 | (16 * 1024)) 96 | 97 | /// Smem allocation size in bytes 98 | #define CUB_SMEM_ALLOC_UNIT(arch) \ 99 | ((arch >= 300) ? \ 100 | (256) : \ 101 | ((arch >= 200) ? \ 102 | (128) : \ 103 | (512))) 104 | 105 | /// Whether or not the architecture allocates registers by block (or by warp) 106 | #define CUB_REGS_BY_BLOCK(arch) \ 107 | ((arch >= 200) ? \ 108 | (false) : \ 109 | (true)) 110 | 111 | /// Number of registers allocated at a time per block (or by warp) 112 | #define CUB_REG_ALLOC_UNIT(arch) \ 113 | ((arch >= 300) ? \ 114 | (256) : \ 115 | ((arch >= 200) ? \ 116 | (64) : \ 117 | ((arch >= 120) ? \ 118 | (512) : \ 119 | (256)))) 120 | 121 | /// Granularity of warps for which registers are allocated 122 | #define CUB_WARP_ALLOC_UNIT(arch) \ 123 | ((arch >= 300) ? \ 124 | (4) : \ 125 | (2)) 126 | 127 | /// Maximum number of threads per SM 128 | #define CUB_MAX_SM_THREADS(arch) \ 129 | ((arch >= 300) ? \ 130 | (2048) : \ 131 | ((arch >= 200) ? \ 132 | (1536) : \ 133 | ((arch >= 120) ? \ 134 | (1024) : \ 135 | (768)))) 136 | 137 | /// Maximum number of thread blocks per SM 138 | #define CUB_MAX_SM_BLOCKS(arch) \ 139 | ((arch >= 300) ? \ 140 | (16) : \ 141 | (8)) 142 | 143 | /// Maximum number of threads per thread block 144 | #define CUB_MAX_BLOCK_THREADS(arch) \ 145 | ((arch >= 200) ? \ 146 | (1024) : \ 147 | (512)) 148 | 149 | /// Maximum number of registers per SM 150 | #define CUB_MAX_SM_REGISTERS(arch) \ 151 | ((arch >= 300) ? \ 152 | (64 * 1024) : \ 153 | ((arch >= 200) ? \ 154 | (32 * 1024) : \ 155 | ((arch >= 120) ? \ 156 | (16 * 1024) : \ 157 | (8 * 1024)))) 158 | 159 | /// Oversubscription factor 160 | #define CUB_SUBSCRIPTION_FACTOR(arch) \ 161 | ((arch >= 300) ? \ 162 | (5) : \ 163 | ((arch >= 200) ? \ 164 | (3) : \ 165 | (10))) 166 | 167 | /// Prefer padding overhead vs X-way conflicts greater than this threshold 168 | #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ 169 | ((arch >= 300) ? 
\ 170 | (1) : \ 171 | (4)) 172 | 173 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 174 | 175 | #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) 176 | #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) 177 | #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) 178 | #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) 179 | #define CUB_PTX_SMEM_BANK_BYTES CUB_SMEM_BANK_BYTES(CUB_PTX_ARCH) 180 | #define CUB_PTX_SMEM_BYTES CUB_SMEM_BYTES(CUB_PTX_ARCH) 181 | #define CUB_PTX_SMEM_ALLOC_UNIT CUB_SMEM_ALLOC_UNIT(CUB_PTX_ARCH) 182 | #define CUB_PTX_REGS_BY_BLOCK CUB_REGS_BY_BLOCK(CUB_PTX_ARCH) 183 | #define CUB_PTX_REG_ALLOC_UNIT CUB_REG_ALLOC_UNIT(CUB_PTX_ARCH) 184 | #define CUB_PTX_WARP_ALLOC_UNIT CUB_WARP_ALLOC_UNIT(CUB_PTX_ARCH) 185 | #define CUB_PTX_MAX_SM_THREADS CUB_MAX_SM_THREADS(CUB_PTX_ARCH) 186 | #define CUB_PTX_MAX_SM_BLOCKS CUB_MAX_SM_BLOCKS(CUB_PTX_ARCH) 187 | #define CUB_PTX_MAX_BLOCK_THREADS CUB_MAX_BLOCK_THREADS(CUB_PTX_ARCH) 188 | #define CUB_PTX_MAX_SM_REGISTERS CUB_MAX_SM_REGISTERS(CUB_PTX_ARCH) 189 | #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) 190 | 191 | #endif // Do not document 192 | 193 | 194 | /** @} */ // end group UtilMgmt 195 | 196 | } // CUB namespace 197 | CUB_NS_POSTFIX // Optional outer namespace(s) 198 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/util_debug.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Error and event logging routines. 
32 | * 33 | * The following macros definitions are supported: 34 | * - \p CUB_LOG. Simple event messages are printed to \p stdout. 35 | */ 36 | 37 | #pragma once 38 | 39 | #include 40 | #include "util_namespace.cuh" 41 | #include "util_arch.cuh" 42 | 43 | /// Optional outer namespace(s) 44 | CUB_NS_PREFIX 45 | 46 | /// CUB namespace 47 | namespace cub { 48 | 49 | 50 | /** 51 | * \addtogroup UtilMgmt 52 | * @{ 53 | */ 54 | 55 | 56 | /// CUB error reporting macro (prints error messages to stderr) 57 | #if (defined(DEBUG) || defined(_DEBUG)) 58 | #define CUB_STDERR 59 | #endif 60 | 61 | 62 | 63 | /** 64 | * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. 65 | * 66 | * \return The CUDA error. 67 | */ 68 | __host__ __device__ __forceinline__ cudaError_t Debug( 69 | cudaError_t error, 70 | const char* filename, 71 | int line) 72 | { 73 | #ifdef CUB_STDERR 74 | if (error) 75 | { 76 | #if (CUB_PTX_ARCH == 0) 77 | fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); 78 | fflush(stderr); 79 | #elif (CUB_PTX_ARCH >= 200) 80 | printf("CUDA error %d [block %d, thread %d, %s, %d]\n", error, blockIdx.x, threadIdx.x, filename, line); 81 | #endif 82 | } 83 | #endif 84 | return error; 85 | } 86 | 87 | 88 | /** 89 | * \brief Debug macro 90 | */ 91 | #define CubDebug(e) cub::Debug((e), __FILE__, __LINE__) 92 | 93 | 94 | /** 95 | * \brief Debug macro with exit 96 | */ 97 | #define CubDebugExit(e) if (cub::Debug((e), __FILE__, __LINE__)) { exit(1); } 98 | 99 | 100 | /** 101 | * \brief Log macro for printf statements. 102 | */ 103 | #if (CUB_PTX_ARCH == 0) 104 | #define CubLog(format, ...) printf(format,__VA_ARGS__); 105 | #elif (CUB_PTX_ARCH >= 200) 106 | #define CubLog(format, ...) printf("[block %d, thread %d]: " format, blockIdx.x, threadIdx.x, __VA_ARGS__); 107 | #endif 108 | 109 | 110 | 111 | 112 | /** @} */ // end group UtilMgmt 113 | 114 | } // CUB namespace 115 | CUB_NS_POSTFIX // Optional outer namespace(s) 116 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/util_macro.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Common C/C++ macro utilities 31 | ******************************************************************************/ 32 | 33 | #pragma once 34 | 35 | #include "util_namespace.cuh" 36 | 37 | /// Optional outer namespace(s) 38 | CUB_NS_PREFIX 39 | 40 | /// CUB namespace 41 | namespace cub { 42 | 43 | 44 | /** 45 | * \addtogroup UtilModule 46 | * @{ 47 | */ 48 | 49 | /** 50 | * Align struct 51 | */ 52 | #if defined(_WIN32) || defined(_WIN64) 53 | #define CUB_ALIGN(bytes) __declspec(align(32)) 54 | #else 55 | #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) 56 | #endif 57 | 58 | /** 59 | * Select maximum(a, b) 60 | */ 61 | #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) 62 | 63 | /** 64 | * Select minimum(a, b) 65 | */ 66 | #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a)) 67 | 68 | /** 69 | * Quotient of x/y rounded down to nearest integer 70 | */ 71 | #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) 72 | 73 | /** 74 | * Quotient of x/y rounded up to nearest integer 75 | */ 76 | #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) 77 | 78 | /** 79 | * x rounded up to the nearest multiple of y 80 | */ 81 | #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) 82 | 83 | /** 84 | * x rounded down to the nearest multiple of y 85 | */ 86 | #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) 87 | 88 | /** 89 | * Return character string for given type 90 | */ 91 | #define CUB_TYPE_STRING(type) ""#type 92 | 93 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 94 | #define CUB_CAT_(a, b) a ## b 95 | #define CUB_CAT(a, b) CUB_CAT_(a, b) 96 | #endif // DOXYGEN_SHOULD_SKIP_THIS 97 | 98 | /** 99 | * Static assert 100 | */ 101 | #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] 102 | 103 | 104 | /** @} */ // end group UtilModule 105 | 106 | } // CUB namespace 107 | CUB_NS_POSTFIX // Optional outer namespace(s) 108 | -------------------------------------------------------------------------------- /external/cub-1.3.2/cub/util_namespace.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Place-holder for prefixing the cub namespace 32 | */ 33 | 34 | #pragma once 35 | 36 | // For example: 37 | //#define CUB_NS_PREFIX namespace thrust{ namespace detail { 38 | //#define CUB_NS_POSTFIX } } 39 | 40 | #define CUB_NS_PREFIX 41 | #define CUB_NS_POSTFIX 42 | -------------------------------------------------------------------------------- /python/SmoothParticleNets/ImageProjection.py: -------------------------------------------------------------------------------- 1 | 2 | import numbers 3 | import numpy as np 4 | 5 | import torch 6 | import torch.autograd 7 | 8 | import _ext 9 | import _extc 10 | import error_checking as ec 11 | from kernels import KERNELS, KERNEL_NAMES 12 | 13 | MAX_FLOAT = float(np.finfo(np.float32).max) 14 | 15 | 16 | class ImageProjection(torch.nn.Module): 17 | """ 18 | """ 19 | 20 | def __init__(self, camera_fl): 21 | """ Initialize a ParticleProjection layer. 22 | TODO 23 | 24 | Arguments: 25 | -camera_fl: The camera focal length in pixels (all pixels are 26 | assumed to be square. This layer does not simulate 27 | any image warping e.g. radial distortion). 
28 | """ 29 | super(ImageProjection, self).__init__() 30 | 31 | self.camera_fl = ec.check_conditions(camera_fl, "camera_fl", 32 | "%s > 0", "isinstance(%s, numbers.Real)") 33 | 34 | self.register_buffer("empty_depth_mask", 35 | torch.ones(1, 1, 1)*MAX_FLOAT) 36 | 37 | def _rotationMatrixFromQuaternion(self, quat): 38 | """ 39 | 1 - 2*qy2 - 2*qz2 2*qx*qy - 2*qz*qw 2*qx*qz + 2*qy*qw 40 | 2*qx*qy + 2*qz*qw 1 - 2*qx2 - 2*qz2 2*qy*qz - 2*qx*qw 41 | 2*qx*qz - 2*qy*qw 2*qy*qz + 2*qx*qw 1 - 2*qx2 - 2*qy2 42 | """ 43 | quat = quat.data 44 | qx = quat[:, 0] 45 | qy = quat[:, 1] 46 | qz = quat[:, 2] 47 | qw = quat[:, 3] 48 | qx2 = qx*qx 49 | qxqy = qx*qy 50 | qxqz = qx*qz 51 | qxqw = qx*qw 52 | qy2 = qy*qy 53 | qyqz = qy*qz 54 | qyqw = qy*qw 55 | qz2 = qz*qz 56 | qzqw = qz*qw 57 | ret = quat.new(quat.size()[0], 3, 3) 58 | ret[:, 0, 0] = 1 - 2*qy2 - 2*qz2 59 | ret[:, 1, 0] = 2*qxqy - 2*qzqw 60 | ret[:, 2, 0] = 2*qxqz + 2*qyqw 61 | ret[:, 0, 1] = 2*qxqy + 2*qzqw 62 | ret[:, 1, 1] = 1 - 2*qx2 - 2*qz2 63 | ret[:, 2, 1] = 2*qyqz - 2*qxqw 64 | ret[:, 0, 2] = 2*qxqz - 2*qyqw 65 | ret[:, 1, 2] = 2*qyqz + 2*qxqw 66 | ret[:, 2, 2] = 1 - 2*qx2 - 2*qy2 67 | return torch.autograd.Variable(ret, requires_grad=False) 68 | 69 | def forward(self, locs, image, camera_pose, camera_rot, depth_mask=None): 70 | """ Forwad pass for the particle projection. Takes in the set of 71 | particles and outputs an image. 72 | TODO 73 | 74 | Arguments: 75 | -locs: A BxNx3 tensor where B is the batch size, N is the number 76 | of particles, and 3 is the dimensionality of the 77 | particles' coordinate space (this layer currently only 78 | supports 3D projections). 79 | -camera_pose: A Bx3 tensor containing the camera translation. 80 | -camera_rot: A Bx4 tensor containing the camera rotation as a 81 | quaternion in xyzw format. 82 | -depth_mask: An optional BxHxW tensor where W and H are the 83 | camera image width and height respectively. If not 84 | None, then this is used to compute occlusions. The 85 | value in each pixel in the depth_mask should be 86 | the distance to the first object. Any particles 87 | further away than that value will not be projected 88 | onto the output image. 89 | 90 | Returns: A BxHxW tensor of the projected particles. 91 | """ 92 | 93 | # Error checking. 
94 | batch_size = locs.size()[0] 95 | N = locs.size()[1] 96 | width = image.size()[3] 97 | height = image.size()[2] 98 | channels = image.size()[1] 99 | ec.check_tensor_dims(locs, "locs", (batch_size, N, 3)) 100 | ec.check_tensor_dims( 101 | image, "image", (batch_size, channels, height, width)) 102 | ec.check_tensor_dims(camera_pose, "camera_pose", (batch_size, 3)) 103 | ec.check_tensor_dims(camera_rot, "camera_rot", (batch_size, 4)) 104 | 105 | ec.check_nans(locs, "locs") 106 | ec.check_nans(image, "image") 107 | ec.check_nans(camera_pose, "camera_pose") 108 | ec.check_nans(camera_rot, "camera_rot") 109 | 110 | if depth_mask is not None: 111 | ec.check_tensor_dims(depth_mask, "depth_mask", (batch_size, 112 | height, width)) 113 | ec.check_nans(depth_mask, "depth_mask") 114 | depth_mask = depth_mask.contiguous() 115 | else: 116 | if (self.empty_depth_mask.size()[0] != batch_size or 117 | self.empty_depth_mask.size()[1] != height or 118 | self.empty_depth_mask.size()[2] != width): 119 | self.empty_depth_mask.resize_(batch_size, height, width) 120 | self.empty_depth_mask.fill_(MAX_FLOAT) 121 | depth_mask = torch.autograd.Variable( 122 | self.empty_depth_mask, requires_grad=False) 123 | if locs.is_cuda: 124 | depth_mask = depth_mask.cuda() 125 | 126 | # Let's transform the particles to camera space here. 127 | locs = locs - camera_pose.unsqueeze(1) 128 | # Ensure the rotation quaternion is normalized. 129 | camera_rot = camera_rot / \ 130 | torch.sqrt(torch.sum(camera_rot**2, 1, keepdim=True)) 131 | # Invert the rotation. 132 | inv = camera_rot.data.new(1, 4) 133 | inv[0, 0] = -1 134 | inv[0, 1] = -1 135 | inv[0, 2] = -1 136 | inv[0, 3] = 1 137 | inv = torch.autograd.Variable(inv, requires_grad=False) 138 | camera_rot = camera_rot*inv 139 | rot = self._rotationMatrixFromQuaternion(camera_rot) 140 | if (rot != rot).data.any(): 141 | raise ValueError("No NaNs found in camera_rot argument, but NaNs created when" 142 | " constructing a rotation matrix from it.") 143 | # Rotate the locs into camera space. 144 | try: 145 | # There's a bug that causes this to fail on the first call when using cuda. 146 | # To fix that, just call it again. 
147 | locs = torch.bmm(locs, rot) 148 | except RuntimeError: 149 | locs = torch.bmm(locs, rot) 150 | if (locs != locs).data.any(): 151 | raise ValueError( 152 | "Rotating locs by rotation matrix resulted in NaNs.") 153 | 154 | locs = locs.contiguous() 155 | image = image.contiguous() 156 | proj = _ImageProjectionFunction(self.camera_fl) 157 | ret = proj(locs, image, depth_mask) 158 | return ret 159 | 160 | 161 | """ 162 | 163 | INTERNAL FUNCTIONS 164 | 165 | """ 166 | 167 | 168 | class _ImageProjectionFunction(torch.autograd.Function): 169 | 170 | def __init__(self, camera_fl): 171 | super(_ImageProjectionFunction, self).__init__() 172 | self.camera_fl = camera_fl 173 | 174 | def forward(self, locs, image, depth_mask): 175 | self.save_for_backward(locs, image, depth_mask) 176 | batch_size = locs.size()[0] 177 | N = locs.size()[1] 178 | channels = image.size()[1] 179 | ret = locs.new(batch_size, N, channels) 180 | ret.fill_(0) 181 | if locs.is_cuda: 182 | if not _extc.spnc_imageprojection_forward(locs, image, 183 | self.camera_fl, depth_mask, ret): 184 | raise Exception("Cuda error") 185 | else: 186 | _ext.spn_imageprojection_forward(locs, image, 187 | self.camera_fl, depth_mask, ret) 188 | 189 | return ret 190 | 191 | def backward(self, grad_output): 192 | locs, image, depth_mask = self.saved_tensors 193 | ret_locs = grad_output.new(locs.size()) 194 | ret_locs.fill_(0) 195 | ret_image = grad_output.new(image.size()) 196 | ret_image.fill_(0) 197 | ret_depth_mask = grad_output.new(depth_mask.size()) 198 | ret_depth_mask.fill_(0) 199 | if grad_output.is_cuda: 200 | if not _extc.spnc_imageprojection_backward(locs, image, 201 | self.camera_fl, depth_mask, grad_output, ret_locs, ret_image): 202 | raise Exception("Cuda error") 203 | else: 204 | _ext.spn_imageprojection_backward(locs, image, 205 | self.camera_fl, depth_mask, grad_output, ret_locs, ret_image) 206 | 207 | return (ret_locs, 208 | ret_image, 209 | ret_depth_mask,) 210 | -------------------------------------------------------------------------------- /python/SmoothParticleNets/ParticleProjection.py: -------------------------------------------------------------------------------- 1 | 2 | import numbers 3 | import numpy as np 4 | 5 | import torch 6 | import torch.autograd 7 | 8 | import _ext 9 | import _extc 10 | import error_checking as ec 11 | from kernels import KERNELS, KERNEL_NAMES 12 | 13 | MAX_FLOAT = float(np.finfo(np.float32).max) 14 | 15 | 16 | class ParticleProjection(torch.nn.Module): 17 | """ The particle projection layer. Projects the given set of particles onto 18 | a camera image plane. For each particle, this layer finds its location on 19 | the image plane, then adds a small circular Gaussian centered at that location 20 | to the image. The contributions from all particles are added together into 21 | a final image. Note that unlike the other layers in this package, this layer 22 | only works with 3D particles. 23 | """ 24 | 25 | def __init__(self, camera_fl, camera_size, filter_std, filter_scale): 26 | """ Initialize a ParticleProjection layer. 27 | 28 | Arguments: 29 | -camera_fl: The camera focal length in pixels (all pixels are 30 | assumed to be square. This layer does not simulate 31 | any image warping e.g. radial distortion). 32 | -camera_size: 2-tuple with the image width and height in pixels. 33 | -filter_std: The standard deviation of the Gaussian that is 34 | added at each pixel location. 35 | -filter_scale: Before adding the Gaussian for an individual 36 | particle, it is scaled by this value. 
37 | """ 38 | super(ParticleProjection, self).__init__() 39 | 40 | self.camera_size = ec.make_list(camera_size, 2, "camera_size", 41 | "%s > 0", "isinstance(%s, numbers.Integral)") 42 | 43 | self.camera_fl = ec.check_conditions(camera_fl, "camera_fl", 44 | "%s > 0", "isinstance(%s, numbers.Real)") 45 | self.filter_std = ec.check_conditions(filter_std, "filter_std", 46 | "%s > 0", "isinstance(%s, numbers.Real)") 47 | self.filter_scale = ec.check_conditions(filter_scale, "filter_scale", 48 | "%s > 0", "isinstance(%s, numbers.Real)") 49 | 50 | self.register_buffer("empty_depth_mask", 51 | torch.ones(1, self.camera_size[1], self.camera_size[0])*MAX_FLOAT) 52 | 53 | def _rotationMatrixFromQuaternion(self, quat): 54 | """ 55 | 1 - 2*qy2 - 2*qz2 2*qx*qy - 2*qz*qw 2*qx*qz + 2*qy*qw 56 | 2*qx*qy + 2*qz*qw 1 - 2*qx2 - 2*qz2 2*qy*qz - 2*qx*qw 57 | 2*qx*qz - 2*qy*qw 2*qy*qz + 2*qx*qw 1 - 2*qx2 - 2*qy2 58 | """ 59 | quat = quat.data 60 | qx = quat[:, 0] 61 | qy = quat[:, 1] 62 | qz = quat[:, 2] 63 | qw = quat[:, 3] 64 | qx2 = qx*qx 65 | qxqy = qx*qy 66 | qxqz = qx*qz 67 | qxqw = qx*qw 68 | qy2 = qy*qy 69 | qyqz = qy*qz 70 | qyqw = qy*qw 71 | qz2 = qz*qz 72 | qzqw = qz*qw 73 | ret = quat.new(quat.size()[0], 3, 3) 74 | ret[:, 0, 0] = 1 - 2*qy2 - 2*qz2 75 | ret[:, 1, 0] = 2*qxqy - 2*qzqw 76 | ret[:, 2, 0] = 2*qxqz + 2*qyqw 77 | ret[:, 0, 1] = 2*qxqy + 2*qzqw 78 | ret[:, 1, 1] = 1 - 2*qx2 - 2*qz2 79 | ret[:, 2, 1] = 2*qyqz - 2*qxqw 80 | ret[:, 0, 2] = 2*qxqz - 2*qyqw 81 | ret[:, 1, 2] = 2*qyqz + 2*qxqw 82 | ret[:, 2, 2] = 1 - 2*qx2 - 2*qy2 83 | return torch.autograd.Variable(ret, requires_grad=False) 84 | 85 | def forward(self, locs, camera_pose, camera_rot, depth_mask=None): 86 | """ Forwad pass for the particle projection. Takes in the set of 87 | particles and outputs an image. 88 | 89 | Arguments: 90 | -locs: A BxNx3 tensor where B is the batch size, N is the number 91 | of particles, and 3 is the dimensionality of the 92 | particles' coordinate space (this layer currently only 93 | supports 3D projections). 94 | -camera_pose: A Bx3 tensor containing the camera translation. 95 | -camera_rot: A Bx4 tensor containing the camera rotation as a 96 | quaternion in xyzw format. 97 | -depth_mask: An optional BxHxW tensor where W and H are the 98 | camera image width and height respectively. If not 99 | None, then this is used to compute occlusions. The 100 | value in each pixel in the depth_mask should be 101 | the distance to the first object. Any particles 102 | further away than that value will not be projected 103 | onto the output image. 104 | 105 | Returns: A BxHxW tensor of the projected particles. 106 | """ 107 | 108 | # Error checking. 
109 | batch_size = locs.size()[0] 110 | N = locs.size()[1] 111 | ec.check_tensor_dims(locs, "locs", (batch_size, N, 3)) 112 | ec.check_tensor_dims(camera_pose, "camera_pose", (batch_size, 3)) 113 | ec.check_tensor_dims(camera_rot, "camera_rot", (batch_size, 4)) 114 | 115 | if depth_mask is not None: 116 | ec.check_tensor_dims(depth_mask, "depth_mask", (batch_size, 117 | self.camera_size[1], self.camera_size[0])) 118 | depth_mask = depth_mask.contiguous() 119 | else: 120 | if self.empty_depth_mask.size()[0] != batch_size: 121 | self.empty_depth_mask.resize_( 122 | batch_size, self.camera_size[1], self.camera_size[0]) 123 | self.empty_depth_mask.fill_(MAX_FLOAT) 124 | depth_mask = torch.autograd.Variable( 125 | self.empty_depth_mask, requires_grad=False) 126 | if locs.is_cuda: 127 | depth_mask = depth_mask.cuda() 128 | 129 | # Let's transform the particles to camera space here. 130 | locs = locs - camera_pose.unsqueeze(1) 131 | # Ensure the rotation quaternion is normalized. 132 | camera_rot = camera_rot / \ 133 | torch.sqrt(torch.sum(camera_rot**2, 1, keepdim=True)) 134 | # Invert the rotation. 135 | inv = camera_rot.data.new(1, 4) 136 | inv[0, 0] = -1 137 | inv[0, 1] = -1 138 | inv[0, 2] = -1 139 | inv[0, 3] = 1 140 | inv = torch.autograd.Variable(inv, requires_grad=False) 141 | camera_rot = camera_rot*inv 142 | rot = self._rotationMatrixFromQuaternion(camera_rot) 143 | # Rotate the locs into camera space. 144 | try: 145 | # There's a bug that causes this to fail on the first call when using cuda. 146 | # To fix that, just call it again. 147 | locs = torch.bmm(locs, rot) 148 | except RuntimeError: 149 | locs = torch.bmm(locs, rot) 150 | 151 | locs = locs.contiguous() 152 | proj = _ParticleProjectionFunction(self.camera_fl, self.camera_size, self.filter_std, 153 | self.filter_scale) 154 | ret = proj(locs, depth_mask) 155 | return ret 156 | 157 | 158 | """ 159 | 160 | INTERNAL FUNCTIONS 161 | 162 | """ 163 | 164 | 165 | class _ParticleProjectionFunction(torch.autograd.Function): 166 | 167 | def __init__(self, camera_fl, camera_size, filter_std, filter_scale): 168 | super(_ParticleProjectionFunction, self).__init__() 169 | self.camera_fl = camera_fl 170 | self.camera_size = camera_size 171 | self.filter_std = filter_std 172 | self.filter_scale = filter_scale 173 | 174 | def forward(self, locs, depth_mask): 175 | self.save_for_backward(locs, depth_mask) 176 | batch_size = locs.size()[0] 177 | ret = locs.new(batch_size, self.camera_size[1], self.camera_size[0]) 178 | ret.fill_(0) 179 | if locs.is_cuda: 180 | if not _extc.spnc_particleprojection_forward(locs, self.camera_fl, 181 | self.filter_std, self.filter_scale, depth_mask, ret): 182 | raise Exception("Cuda error") 183 | else: 184 | _ext.spn_particleprojection_forward(locs, self.camera_fl, 185 | self.filter_std, self.filter_scale, depth_mask, ret) 186 | 187 | return ret 188 | 189 | def backward(self, grad_output): 190 | locs, depth_mask = self.saved_tensors 191 | ret_locs = grad_output.new(locs.size()) 192 | ret_locs.fill_(0) 193 | ret_depth_mask = grad_output.new(depth_mask.size()) 194 | ret_depth_mask.fill_(0) 195 | if grad_output.is_cuda: 196 | if not _extc.spnc_particleprojection_backward(locs, 197 | self.camera_fl, self.filter_std, self.filter_scale, depth_mask, grad_output, ret_locs): 198 | raise Exception("Cuda error") 199 | else: 200 | _ext.spn_particleprojection_backward(locs, 201 | self.camera_fl, self.filter_std, self.filter_scale, depth_mask, grad_output, ret_locs) 202 | 203 | return (ret_locs, 204 | ret_depth_mask,) 205 | 
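For reference, a minimal usage sketch of the ParticleProjection layer defined above, assuming the package has been built and installed via setup.py and a PyTorch version matching the README (0.4.x). The camera, filter, and particle values below are illustrative only and do not come from this repository:

    import torch
    import SmoothParticleNets as spn

    # Illustrative camera: 64x48 image, 30 px focal length, unit Gaussian splats.
    proj = spn.ParticleProjection(camera_fl=30.0, camera_size=(64, 48),
                                  filter_std=1.0, filter_scale=1.0)

    batch_size, n_particles = 2, 100
    locs = torch.rand(batch_size, n_particles, 3)   # BxNx3 particle positions
    locs[:, :, 2] += 1.0                            # keep particles in front of the camera (+Z)
    camera_pose = torch.zeros(batch_size, 3)        # Bx3 camera translation
    camera_rot = torch.zeros(batch_size, 4)         # Bx4 quaternion in xyzw format
    camera_rot[:, 3] = 1.0                          # identity rotation
    image = proj(locs, camera_pose, camera_rot)     # BxHxW, here 2x48x64

Passing the optional depth_mask argument (a BxHxW tensor of distances to the nearest surface) suppresses particles that lie behind that surface, as described in the forward docstring.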
-------------------------------------------------------------------------------- /python/SmoothParticleNets/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from os.path import dirname, basename, isfile 3 | import glob 4 | import sys 5 | sys.path.append(dirname(__file__)) 6 | modules = glob.glob(dirname(__file__)+"/*.py") 7 | __all__ = [basename(f)[:-3] for f in modules if isfile(f)] 8 | for f in modules: 9 | if isfile(f) and "__init__" not in f and "install" not in f: 10 | exec('from %s import *' % basename(f)[:-3]) 11 | -------------------------------------------------------------------------------- /python/SmoothParticleNets/error_checking.py: -------------------------------------------------------------------------------- 1 | 2 | import numbers 3 | import numpy as np 4 | 5 | import torch 6 | 7 | def check_nans(v, name): 8 | if (v != v).data.any(): 9 | raise ValueError("Found NaNs in %s" % name) 10 | 11 | def throws_exception(exception_type, func, *args, **kwargs): 12 | try: 13 | func(*args, **kwargs) 14 | return False 15 | except exception_type: 16 | return True 17 | 18 | def check_conditions(v, name, *conditions): 19 | for condition in conditions: 20 | if not eval(condition % "v"): 21 | raise ValueError(("%s must meet the following condition: " + condition) 22 | % (name, name)) 23 | return v 24 | 25 | def make_list(l, length, name, *conditions): 26 | if throws_exception(TypeError, list, l): 27 | l = [l]*length 28 | else: 29 | l = list(l) 30 | if len(l) != length: 31 | raise ValueError("%s must be a list of length %d." % (name, length)) 32 | for i, ll in enumerate(l): 33 | l[i] = check_conditions(ll, name, *conditions) 34 | return l 35 | 36 | def check_tensor_dims(t, name, dims): 37 | s = t.size() 38 | if len(s) != len(dims): 39 | raise ValueError("%s must be a %d-dimensional tensor." % (name, len(dims))) 40 | for i in range(len(dims)): 41 | if dims[i] >= 0 and s[i] != dims[i]: 42 | raise ValueError("The %dth dimension of %s must have size %d, not %d." 
43 | % (i, name, dims[i], s[i])) 44 | 45 | def list2tensor(l): 46 | return torch.from_numpy(np.array(l, dtype=np.float32)) -------------------------------------------------------------------------------- /python/SmoothParticleNets/kernels.py: -------------------------------------------------------------------------------- 1 | 2 | KERNELS = {} 3 | DKERNELS = {} 4 | 5 | 6 | """ DEFAULT: 7 | \eta * \sigma * max(0, H^2 - d^2)^3 8 | H = radius 9 | d = distance 10 | \sigma = 1/pi (dim norm) 11 | \eta = 315/(64*H^9) (norm) 12 | """ 13 | KERNELS["default"] = ( 14 | "(315.0f/(64.0f*M_PI*H*H*H*H*H*H*H*H*H))*(H*H-d*d)*(H*H-d*d)*(H*H-d*d)") 15 | 16 | """ DDEFAULT: 17 | \eta * \sigma * d * max(0, H^2 - d^2)^2 18 | H = radius 19 | d = distance 20 | \sigma = 1/pi (dim norm) 21 | \eta = -945/(32*H^9) (norm) 22 | """ 23 | KERNELS["ddefault"] = "(-945.0f/(32.0f*M_PI*H*H*H*H*H*H*H*H*H))*(H*H-d*d)*(H*H-d*d)*d" 24 | DKERNELS["default"] = KERNELS["ddefault"] 25 | 26 | """ DDEFAULT2: 27 | \eta * \sigma * (H^4 - 6*H^2*d^2 + 5d^4) 28 | H = radius 29 | d = distance 30 | \sigma = 1/pi (dim norm) 31 | \eta = -945/(32*H^9) (norm) 32 | """ 33 | KERNELS["ddefault2"] = "(-945.0f/(32.0f*M_PI*H*H*H*H*H*H*H*H*H))*(H*H*H*H - 6*H*H*d*d + 5*d*d*d*d)" 34 | DKERNELS["ddefault"] = KERNELS["ddefault2"] 35 | DKERNELS["ddefault2"] = "(-945.0f/(32.0f*M_PI*H*H*H*H*H*H*H*H*H))*(20*d*d*d - 12*H*H*d)" 36 | 37 | """ PRESSURE: 38 | \eta * \sigma * max(0, H - d)^3 39 | H = radius 40 | d = distance 41 | \sigma = 1/pi (dim norm) 42 | \eta = 15/(H^6) (norm) 43 | """ 44 | KERNELS["pressure"] = "(15.0f/(M_PI*H*H*H*H*H*H))*(H-d)*(H-d)*(H-d)" 45 | 46 | """ DPRESSURE: 47 | \eta * \sigma * max(0, H - d)^2 48 | H = radius 49 | d = distance 50 | \sigma = 1/pi (dim norm) 51 | \eta = -45/(H^6) (norm) 52 | """ 53 | KERNELS["dpressure"] = "(-45.0f/(M_PI*H*H*H*H*H*H))*(H-d)*(H-d)" 54 | DKERNELS["pressure"] = KERNELS["dpressure"] 55 | 56 | """ DPRESSURE2: 57 | \eta * \sigma * max(0, H - d) * (H - 2*d)/2 58 | H = radius 59 | d = distance 60 | \sigma = 1/pi (dim norm) 61 | \eta = -90/(H^6) (norm) 62 | """ 63 | KERNELS["dpressure2"] = "(90.0f/(M_PI*H*H*H*H*H*H))*(H-d)" 64 | DKERNELS["dpressure"] = KERNELS["dpressure2"] 65 | DKERNELS["dpressure2"] = "(-90.0f/(M_PI*H*H*H*H*H*H))" 66 | 67 | """ INDIRECT: 68 | H - d 69 | H = radius 70 | d = distance 71 | """ 72 | KERNELS["indirect"] = "H - d" 73 | DKERNELS["indirect"] = "-1.0f" 74 | 75 | """ CONSTANT: 76 | 1 77 | """ 78 | KERNELS["constant"] = "1.0f" 79 | DKERNELS["constant"] = "0.0f" 80 | 81 | """ SPIKY: 82 | \eta * \sigma * (1 - d/H)^2 83 | H = radius 84 | d = distance 85 | \sigma = 1/pi (dim norm) 86 | \eta = 15/(H^3) (norm) 87 | """ 88 | KERNELS["spiky"] = "15.0f/(M_PI*H*H*H)*(1.0f-d/H)*(1.0f-d/H)" 89 | 90 | """ DSPIKY: 91 | \eta * \sigma * 2 * (1 - d/H)/H 92 | H = radius 93 | d = distance 94 | \sigma = 1/pi (dim norm) 95 | \eta = 15/(H^3) (norm) 96 | """ 97 | KERNELS["dspiky"] = "-15.0f/(M_PI*H*H*H)*2.0f*(1.0f - d/H)/H" 98 | DKERNELS["spiky"] = KERNELS["dspiky"] 99 | DKERNELS["dspiky"] = "-15.0f/(M_PI*H*H*H)*2.0f*(-1.0f/H)/H" 100 | 101 | """ COHESION: 102 | -(1.0f + \eta)/\eta^2*(d/H)^3 + (\eta^2 + \eta + 1)/\eta^2*(d/H)^2 - 1 103 | \eta * \sigma * (1 - d/H)^2 104 | H = radius 105 | d = distance 106 | \eta = 0.5 (rest) 107 | """ 108 | KERNELS["cohesion"] = "-6.0f*(d/H)*(d/H)*(d/H) + 7*(d/H)*(d/H) - 1" 109 | DKERNELS["cohesion"] = "2.0f*d*(7.0f*H - 9.0f*d)/(H*H*H)" 110 | 111 | """ SIGMOID: 112 | 1/(1 + exp((d - C*H)*S/H)) 113 | H = radius 114 | d = distance 115 | S = 20 (sharpness) 116 | C = 0.2 (center ratio) 
117 | """ 118 | KERNELS["sigmoid"] = "1.0f/(1.0f + expf((d - 0.2f*H)*20.0f/H))" 119 | # -S*expf((d - C*H)*S/H)/(H*(expf((d - C*H)*S/H) + 1.0f)*(expf((d - C*H)*S/H) + 1.0f)) 120 | DKERNELS["sigmoid"] = ("-20.0f*expf((d - 0.2f*H)*20.0f/H)/" + 121 | "(H*(expf((d - 0.2f*H)*20.0f/H) + 1.0f)*(expf((d - 0.2f*H)*20.0f/H) + 1.0f))") 122 | 123 | KERNEL_NAMES = sorted(KERNELS.keys()) 124 | 125 | import math 126 | KERNEL_FN = {k : eval("lambda d, H: " + v 127 | .replace("M_PI", "math.pi") 128 | .replace("fmaxf", "max") 129 | .replace("expf", "math.exp") 130 | .replace("f", "")) 131 | for k,v in KERNELS.items()} -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | 5 | from setuptools import setup 6 | import torch 7 | from torch.utils.cpp_extension import CppExtension, BuildExtension, CUDAExtension 8 | 9 | # Parse command line args. 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--with_cuda', action="store_true", default=None) 12 | parser.add_argument('--without_cuda', action="store_true", default=None) 13 | args, unknown = parser.parse_known_args() 14 | sys.argv = sys.argv[:2] + unknown 15 | 16 | if args.with_cuda is None: 17 | if args.without_cuda is not None: 18 | args.with_cuda = not args.without_cuda 19 | else: 20 | print("--with_cuda or --without_cuda not specified, using PyTorch to decide...") 21 | args.with_cuda = torch.cuda.is_available() 22 | if args.with_cuda: 23 | print("torch.cuda.is_available says True, proceeding to build with cuda.") 24 | else: 25 | print("torch.cuda.is_available says False, proceeding to build without cuda.") 26 | 27 | 28 | # Setup global variables. 29 | root_dir = os.path.dirname(os.path.abspath(__file__)) 30 | test_dir = os.path.join(root_dir, "tests") 31 | py_dir = os.path.join(root_dir, "python", "SmoothParticleNets") 32 | src_dir = os.path.join(root_dir, "src") 33 | 34 | # Create pytest args. 35 | pytest_args = { 36 | 'with_cuda': args.with_cuda, 37 | } 38 | fp = open(os.path.join(test_dir, "pytest_args.py"), "w") 39 | for k, v in pytest_args.items(): 40 | if isinstance(v, str): 41 | v = "'" + v + "'" 42 | fp.write("%s = %s\n" % (k, str(v))) 43 | fp.close() 44 | 45 | # Build kernel_constants.h 46 | # Add path to python source to path. 47 | sys.path.append(py_dir) 48 | from kernels import KERNELS, KERNEL_NAMES, DKERNELS 49 | fp = open(os.path.join(src_dir, "kernel_constants.h"), "w") 50 | fp.write("// THIS FILE IS AUTOGENERATED. 
DO NOT ALTER.\n") 51 | fp.write("#ifndef __kernel_constants_h__\n") 52 | fp.write("#define __kernel_constants_h__\n") 53 | fp.write("#ifdef __cplusplus\n") 54 | fp.write("extern \"C\" {\n") 55 | fp.write("#endif\n") 56 | fp.write("\n") 57 | fp.write("#include \n") 58 | fp.write("#include \n") 59 | fp.write("\n") 60 | fp.write("#ifdef CUDA\n") 61 | fp.write("__host__ __device__\n") 62 | fp.write("#endif\n") 63 | fp.write("inline\n") 64 | fp.write("float KERNEL_W(float d, float H, int fn) {\n") 65 | fp.write(" float ret = 0.0f;\n") 66 | for i, k in enumerate(KERNEL_NAMES): 67 | fp.write(" if(fn == %d) { ret = (%s); }\n" % (i, KERNELS[k])) 68 | fp.write(" return ret;\n") 69 | fp.write("}\n\n") 70 | fp.write("#ifdef CUDA\n") 71 | fp.write("__host__ __device__\n") 72 | fp.write("#endif\n") 73 | fp.write("inline\n") 74 | fp.write("float KERNEL_DW(float d, float H, int fn) {\n") 75 | fp.write(" float ret = 0.0f;\n") 76 | for i, k in enumerate(KERNEL_NAMES): 77 | fp.write(" if(fn == %d) { ret = (%s); }\n" % (i, DKERNELS[k])) 78 | fp.write(" return ret;\n") 79 | fp.write("}\n\n") 80 | fp.write("#define VALIDATE_KERNEL_ID(fn) (fn >= 0 && fn < %d)" % len(KERNELS)) 81 | fp.write("\n") 82 | fp.write("#ifdef __cplusplus\n") 83 | fp.write("}\n") 84 | fp.write("#endif\n") 85 | fp.write("#endif\n") 86 | fp.flush() 87 | fp.close() 88 | 89 | # Define extensions. 90 | ext_modules = [ 91 | CppExtension('SmoothParticleNets._ext', [ 92 | os.path.join(src_dir, 'cpu_layer_funcs.cpp'), 93 | ]), 94 | ] 95 | if args.with_cuda: 96 | ext_modules.append(CUDAExtension('SmoothParticleNets._extc', [ 97 | os.path.join(src_dir, 'cuda_layer_funcs.cpp'), 98 | os.path.join(src_dir, 'gpu_kernels.cu'), 99 | ])) 100 | 101 | # The main setup call. 102 | setup( 103 | name='SmoothParticleNets', 104 | package_dir={'': 'python'}, 105 | packages=['SmoothParticleNets'], 106 | ext_modules=ext_modules, 107 | cmdclass={ 108 | 'build_ext': BuildExtension 109 | }) 110 | -------------------------------------------------------------------------------- /src/constants.h: -------------------------------------------------------------------------------- 1 | #ifndef __constants_h__ 2 | #define __constants_h__ 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | #define MAX_CARTESIAN_DIM 20 8 | 9 | #ifdef __cplusplus 10 | } 11 | #endif 12 | 13 | #endif -------------------------------------------------------------------------------- /src/gpu_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef __gpu_kernels_h__ 2 | #define __gpu_kernels_h__ 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | int cuda_convsp( 8 | const float* qlocs, 9 | const float* locs, 10 | const float* data, 11 | const float* neighbors, 12 | const float* weight, 13 | const float* bias, 14 | const int batch_size, 15 | const int M, 16 | const int N, 17 | const int nchannels, 18 | const int ndims, 19 | const int max_neighbors, 20 | const int nkernels, 21 | const int ncells, 22 | const float radius, 23 | const float* kernel_size, 24 | const float* dilation, 25 | const int dis_norm, 26 | const int kernel_fn, 27 | float* out, 28 | float* dqlocs, 29 | float* dlocs, 30 | float* ddata, 31 | float* dweight, 32 | cudaStream_t stream, 33 | const size_t nshared_device_mem); 34 | 35 | int cuda_convsdf( 36 | const float* locs, 37 | const int batch_size, 38 | const int N, 39 | const int ndims, 40 | const float* idxs, 41 | const float* poses, 42 | const float* scales, 43 | const int M, 44 | const int pose_len, 45 | const float* sdfs, 46 
| const float* sdf_offsets, 47 | const float* sdf_shapes, 48 | const float* weight, 49 | const float* bias, 50 | const int nkernels, 51 | const int ncells, 52 | const float* kernel_size, 53 | const float* dilation, 54 | const float max_distance, 55 | float* out, 56 | float* dlocs, 57 | float* dweight, 58 | float* dposes, 59 | cudaStream_t stream); 60 | 61 | size_t GetSharedMemPerBlock(int device); 62 | 63 | int cuda_hashgrid_order( 64 | float* locs, 65 | const float* low, 66 | const float* grid_dims, 67 | float* cellIDs, 68 | float* idxs, 69 | float* buffer, 70 | const int batch_size, 71 | const int N, 72 | const int ndims, 73 | const float cellEdge, 74 | cudaStream_t stream); 75 | 76 | int cuda_compute_collisions( 77 | const float* qlocs, 78 | const float* locs, 79 | const float* low, 80 | const float* grid_dims, 81 | const float* cellIDs, 82 | float* cellStarts, 83 | float* cellEnds, 84 | float* collisions, 85 | const int batch_size, 86 | const int M, 87 | const int N, 88 | const int ndims, 89 | const int max_collisions, 90 | const int ncells, 91 | const float cellEdge, 92 | const float radius, 93 | const int include_self, 94 | cudaStream_t stream); 95 | 96 | int cuda_reorder_data( 97 | float* locs, 98 | float* data, 99 | float* idxs, 100 | float* nlocs, 101 | float* ndata, 102 | const int batch_size, 103 | const int N, 104 | const int ndims, 105 | const int nchannels, 106 | const int reverse, 107 | cudaStream_t stream); 108 | 109 | size_t get_radixsort_buffer_size(cudaStream_t stream); 110 | 111 | int cuda_particleprojection( 112 | const float* locs, 113 | const float camera_fl, 114 | const float filter_std, 115 | const float filter_scale, 116 | const float* depth_mask, 117 | const int batch_size, 118 | const int N, 119 | const int width, 120 | const int height, 121 | float* out, 122 | float* dlocs, 123 | cudaStream_t stream); 124 | 125 | int cuda_imageprojection( 126 | const float* locs, 127 | const float* image, 128 | const float camera_fl, 129 | const float* depth_mask, 130 | const int batch_size, 131 | const int N, 132 | const int width, 133 | const int height, 134 | const int channels, 135 | float* out, 136 | float* dlocs, 137 | float* dimage, 138 | cudaStream_t stream); 139 | 140 | #ifdef __cplusplus 141 | } 142 | #endif 143 | 144 | #endif -------------------------------------------------------------------------------- /tests/test_convsp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | # Add path to python source to path. 
4 | sys.path.append(os.path.join(os.path.dirname( 5 | os.path.dirname(os.path.abspath(__file__))), "python")) 6 | import SmoothParticleNets as spn 7 | 8 | import itertools 9 | import numpy as np 10 | import torch 11 | import torch.autograd 12 | 13 | from gradcheck import gradcheck 14 | try: 15 | import pytest_args 16 | except ImportError: 17 | print("Make sure to compile SmoothParticleNets before running tests.") 18 | raise 19 | 20 | 21 | def pyconvsp(qlocs, locs, data, weights, biases, kernel_fn, KERNEL_SIZE, RADIUS, DILATION, NKERNELS): 22 | w = spn.KERNEL_FN[kernel_fn] 23 | 24 | BATCH_SIZE = locs.shape[0] 25 | M = qlocs.shape[1] 26 | N = locs.shape[1] 27 | NDIM = locs.shape[-1] 28 | 29 | kernel_centers = (np.array(KERNEL_SIZE) - 1)/2 30 | ground_truth = np.zeros((BATCH_SIZE, M, NKERNELS), dtype=data.dtype) 31 | for b in range(BATCH_SIZE): 32 | for i in range(M): 33 | for j in range(N): 34 | dd = np.square(qlocs[b, i, :] - locs[b, j, :]).sum() 35 | nr = DILATION*max(KERNEL_SIZE)/2 + RADIUS 36 | if dd > nr*nr: 37 | continue 38 | for k, idxs in enumerate(itertools.product(*[range(x) for x in KERNEL_SIZE[::-1]])): 39 | dd = np.square(qlocs[b, i, :] + (idxs[::-1] - kernel_centers)*DILATION 40 | - locs[b, j, :]).sum() 41 | if dd > RADIUS*RADIUS: 42 | continue 43 | ground_truth[b, i, :] += weights[:, :, k].dot( 44 | w(np.sqrt(dd), RADIUS)*data[b, j, :]) 45 | ground_truth += biases[np.newaxis, np.newaxis, :] 46 | return ground_truth 47 | 48 | 49 | def test_convsp(cpu=True, cuda=True): 50 | if cpu: 51 | print("Testing CPU implementation of ConvSP...") 52 | eval_convsp(cuda=False) 53 | print("CPU implementation passed!") 54 | print("") 55 | 56 | if cuda: 57 | if pytest_args.with_cuda: 58 | print("Testing CUDA implementation of ConvSP...") 59 | eval_convsp(cuda=True) 60 | print("CUDA implementation passed!") 61 | else: 62 | print("Not compiled with CUDA, skipping CUDA test.") 63 | 64 | 65 | def eval_convsp(cuda=False): 66 | BATCH_SIZE = 2 67 | N = 5 68 | M = 3 69 | NDIM = 2 70 | KERNEL_SIZE = (3, 1) 71 | RADIUS = 1.0 72 | DILATION = 0.05 73 | NCHANNELS = 2 74 | NKERNELS = 3 75 | 76 | np.random.seed(0) 77 | 78 | locs = np.random.rand(BATCH_SIZE, N, NDIM).astype(np.float32) 79 | qlocs = np.random.rand(BATCH_SIZE, M, NDIM).astype(np.float32) 80 | data = np.random.rand(BATCH_SIZE, N, NCHANNELS).astype(np.float32) 81 | weights = np.random.rand(NKERNELS, NCHANNELS, np.prod( 82 | KERNEL_SIZE)).astype(np.float32) 83 | biases = np.random.rand(NKERNELS).astype(np.float32) 84 | 85 | def use_cuda(x): 86 | if cuda: 87 | return x.cuda() 88 | else: 89 | return x 90 | 91 | def undo_cuda(x): 92 | if cuda: 93 | return x.cpu() 94 | else: 95 | return x 96 | 97 | for use_qlocs in (True, False): 98 | 99 | locs_t = torch.autograd.Variable( 100 | use_cuda(torch.FloatTensor(locs)), requires_grad=True) 101 | if use_qlocs: 102 | qlocs_t = torch.autograd.Variable( 103 | use_cuda(torch.FloatTensor(qlocs)), requires_grad=True) 104 | else: 105 | qlocs_t = None 106 | data_t = torch.autograd.Variable( 107 | use_cuda(torch.FloatTensor(data)), requires_grad=True) 108 | weights_t = torch.nn.Parameter( 109 | torch.FloatTensor(weights), requires_grad=True) 110 | biases_t = torch.nn.Parameter( 111 | torch.FloatTensor(biases), requires_grad=True) 112 | 113 | coll = use_cuda(spn.ParticleCollision(NDIM, 114 | RADIUS + DILATION*max((k - 1)/2 for k in KERNEL_SIZE))) 115 | locs_t, data_t, idxs_t, neighbors_t = coll( 116 | locs_t, data_t, (qlocs_t if use_qlocs else None)) 117 | 118 | for kernel_fn in spn.KERNEL_NAMES: 119 | print("\tTesting 
kernel %s (%s query locations)..." % 120 | (kernel_fn, "with" if use_qlocs else "without")) 121 | ground_truth = pyconvsp((qlocs if use_qlocs else locs), locs, data, weights, biases, 122 | kernel_fn, KERNEL_SIZE, RADIUS, DILATION, NKERNELS) 123 | 124 | convsp = spn.ConvSP(NCHANNELS, NKERNELS, NDIM, KERNEL_SIZE, DILATION, RADIUS, 125 | kernel_fn=kernel_fn) 126 | convsp.weight = weights_t 127 | convsp.bias = biases_t 128 | convsp = use_cuda(convsp) 129 | 130 | pred_t = undo_cuda(convsp(locs_t, data_t, neighbors_t, qlocs_t)) 131 | np.testing.assert_array_almost_equal( 132 | pred_t.data.numpy(), ground_truth, decimal=3) 133 | 134 | dt = torch.autograd.Variable(data_t.data, requires_grad=True) 135 | lt = torch.autograd.Variable(locs_t.data, requires_grad=True) 136 | if use_qlocs: 137 | qt = torch.autograd.Variable(qlocs_t.data, requires_grad=True) 138 | wt = torch.nn.Parameter(weights_t.data, requires_grad=True) 139 | bt = torch.nn.Parameter(biases_t.data, requires_grad=True) 140 | # Use pyconvsp to allow for double precision when computing numeric grads. 141 | 142 | def func_numerical(l, d, w, b, q=None): 143 | return (torch.autograd.Variable(torch.from_numpy( 144 | pyconvsp((q.data.cpu().numpy() if use_qlocs else l.data.cpu().numpy()), 145 | l.data.cpu().numpy(), 146 | d.data.cpu().numpy(), w.data.cpu().numpy(), b.data.cpu().numpy(), 147 | kernel_fn, KERNEL_SIZE, RADIUS, DILATION, NKERNELS))),) 148 | 149 | def func_analytical(l, d, w, b, q=None): 150 | convsp.weight = w 151 | convsp.bias = b 152 | return (convsp(l, d, neighbors_t, (q if use_qlocs else None)),) 153 | assert gradcheck(func_analytical, 154 | ((lt, dt, wt, bt, qt) 155 | if use_qlocs else (lt, dt, wt, bt,)), 156 | eps=1e-4, atol=1e-3, rtol=1e-1, func_numerical=func_numerical, use_double=True) 157 | 158 | 159 | if __name__ == '__main__': 160 | import argparse 161 | parser = argparse.ArgumentParser() 162 | parser.add_argument('--cpu', dest='cpu', action="store_true", default=True) 163 | parser.add_argument('--no-cpu', dest='cpu', action="store_false") 164 | parser.add_argument('--cuda', dest='cuda', 165 | action="store_true", default=True) 166 | parser.add_argument('--no-cuda', dest='cuda', action="store_false") 167 | args = parser.parse_args() 168 | test_convsp(cpu=args.cpu, cuda=args.cuda) 169 | -------------------------------------------------------------------------------- /tests/test_imageprojection.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | # Add path to python source to path. 
4 | sys.path.append(os.path.join(os.path.dirname( 5 | os.path.dirname(os.path.abspath(__file__))), "python")) 6 | import SmoothParticleNets as spn 7 | 8 | import itertools 9 | import numpy as np 10 | import torch 11 | import torch.autograd 12 | 13 | from gradcheck import gradcheck 14 | from test_convsdf import quaternionMult, quaternionConjugate 15 | from regular_grid_interpolater import RegularGridInterpolator 16 | try: 17 | import pytest_args 18 | except ImportError: 19 | print("Make sure to compile SmoothParticleNets before running tests.") 20 | raise 21 | 22 | 23 | def pyproject(locs, image, camera_fl, camera_pose, 24 | camera_rot, depth_mask=None, dtype=np.float32): 25 | batch_size = locs.shape[0] 26 | N = locs.shape[1] 27 | channels = image.shape[1] 28 | width = image.shape[3] 29 | height = image.shape[2] 30 | ret = np.zeros((batch_size, N, channels), dtype=dtype) 31 | if depth_mask is None: 32 | depth_mask = np.ones((batch_size, height, width), 33 | dtype=dtype)*np.finfo(np.float32).max 34 | depth_fns = [RegularGridInterpolator( 35 | [np.arange(0.5, width, 1), np.arange(0.5, height, 1)], 36 | depth_mask[b, ...].transpose(), bounds_error=False, fill_value=np.finfo(np.float32).max) 37 | for b in range(batch_size)] 38 | for b in range(batch_size): 39 | r = locs[b, ...] - camera_pose[b, ...] 40 | r = np.concatenate((r, np.zeros((N, 1), dtype=r.dtype)), axis=-1) 41 | r = np.array([quaternionMult(quaternionConjugate(camera_rot[b, :]), 42 | quaternionMult(r[i, ...], camera_rot[b, :])) for i in range(N)], dtype=dtype) 43 | ijs = np.concatenate(( 44 | r[:, 0:1]*camera_fl/r[:, 2:3] + width/2.0, 45 | r[:, 1:2]*camera_fl/r[:, 2:3] + height/2.0, 46 | ), axis=-1) 47 | depths = depth_fns[b](ijs) 48 | mask = (r[:, 2] <= depths)*(r[:, 2] > 0) 49 | for c in range(channels): 50 | fn = RegularGridInterpolator( 51 | [np.arange(0.5, width, 1), np.arange(0.5, height, 1)], 52 | image[b, c, ...].transpose(), bounds_error=False, fill_value=0) 53 | ret[b, :, c] = fn(ijs)*mask 54 | 55 | return ret 56 | 57 | 58 | def test_imageprojection(cpu=True, cuda=True): 59 | if cpu: 60 | print("Testing CPU implementation of ImageProjection...") 61 | eval_imageprojection(cuda=False) 62 | print("CPU implementation passed!") 63 | print("") 64 | 65 | if cuda: 66 | if pytest_args.with_cuda: 67 | print("Testing CUDA implementation of ImageProjection...") 68 | eval_imageprojection(cuda=True) 69 | print("CUDA implementation passed!") 70 | else: 71 | print("Not compiled with CUDA, skipping CUDA test.") 72 | 73 | 74 | def eval_imageprojection(cuda=False): 75 | np.random.seed(1) 76 | BATCH_SIZE = 2 77 | N = 5 78 | CHANNELS = 2 79 | CAMERA_FOV = 45.0/180.0*np.pi 80 | CAMERA_SIZE = (30, 30) 81 | CAMERA_FL = CAMERA_SIZE[0]/2/(CAMERA_FOV/2.0) 82 | CAMERA_POSE = 5.0*(np.random.rand(BATCH_SIZE, 3).astype(np.float32) - 0.5) 83 | CAMERA_TARGET = np.array([(0.0, 0.0, 0.0)]*BATCH_SIZE, dtype=np.float32) 84 | 85 | CAMERA_ROT = np.zeros((BATCH_SIZE, 4), dtype=np.float32) 86 | for b in range(BATCH_SIZE): 87 | CAMERA_ROT[b, :] = pointAt( 88 | CAMERA_POSE[b, :], np.array([0, 0, 0], dtype=np.float32)) 89 | 90 | locs = 2.0*(np.random.rand(BATCH_SIZE, N, 3).astype(np.float32) - 0.5) 91 | image = np.random.rand(BATCH_SIZE, CHANNELS, 92 | CAMERA_SIZE[1], CAMERA_SIZE[0]) 93 | depth_mask = np.ones((BATCH_SIZE, CAMERA_SIZE[1], CAMERA_SIZE[0]), 94 | dtype=np.float32)*np.finfo(np.float32).max 95 | ir = (int(CAMERA_SIZE[0]/2 - CAMERA_SIZE[0]*0.2), 96 | int(CAMERA_SIZE[0]/2 + CAMERA_SIZE[0]*0.2) + 1) 97 | jr = (int(CAMERA_SIZE[1]/2 - CAMERA_SIZE[1]*0.2), 98 | 
int(CAMERA_SIZE[1]/2 + CAMERA_SIZE[1]*0.2) + 1) 99 | ul = 0.0 100 | lr = 10.0 101 | ur = 5.0 102 | ll = 3.5 103 | for i in range(ir[0], ir[1]): 104 | for j in range(jr[0], jr[1]): 105 | ii = 1.0*(i - ir[0])/(ir[1] - ir[0]) 106 | jj = 1.0*(j - jr[0])/(jr[1] - jr[0]) 107 | l = ul*(1 - jj) + ll*jj 108 | r = ur*(1 - jj) + lr*jj 109 | depth_mask[0, j, i] = l*(1 - ii) + r*ii 110 | 111 | def use_cuda(x): 112 | if cuda: 113 | return x.cuda() 114 | else: 115 | return x 116 | 117 | def undo_cuda(x): 118 | if cuda: 119 | return x.cpu() 120 | else: 121 | return x 122 | 123 | def np2var(t): 124 | return torch.autograd.Variable(use_cuda(torch.from_numpy(t)), requires_grad=False) 125 | 126 | locs_t = torch.autograd.Variable( 127 | use_cuda(torch.FloatTensor(locs)), requires_grad=True) 128 | image_t = torch.autograd.Variable( 129 | use_cuda(torch.FloatTensor(image)), requires_grad=True) 130 | depth_mask_t = torch.autograd.Variable( 131 | use_cuda(torch.FloatTensor(depth_mask)), requires_grad=False) 132 | camera_pose_t = torch.autograd.Variable(use_cuda(torch.FloatTensor(CAMERA_POSE)), 133 | requires_grad=False) 134 | camera_rot_t = torch.autograd.Variable(use_cuda(torch.FloatTensor(CAMERA_ROT)), 135 | requires_grad=False) 136 | 137 | imageProjection = spn.ImageProjection(CAMERA_FL) 138 | 139 | ground_truth = pyproject(locs, image, CAMERA_FL, 140 | CAMERA_POSE, CAMERA_ROT, depth_mask) 141 | pred_t = imageProjection( 142 | locs_t, image_t, camera_pose_t, camera_rot_t, depth_mask_t) 143 | pred = undo_cuda(pred_t).data.numpy() 144 | np.testing.assert_array_almost_equal(pred, ground_truth, decimal=3) 145 | 146 | # Use pyproject to allow for double precision when computing numeric grads. 147 | def func_numerical(l, i): 148 | ll = undo_cuda(l).data.numpy() 149 | ii = undo_cuda(i).data.numpy() 150 | return torch.autograd.Variable(use_cuda(torch.from_numpy(pyproject(ll, ii, CAMERA_FL, CAMERA_POSE, 151 | CAMERA_ROT, dtype=np.float64))), requires_grad=False) 152 | 153 | def func_analytical(l, i): 154 | return imageProjection(l, i, camera_pose_t, camera_rot_t) 155 | assert torch.autograd.gradcheck(func_analytical, (locs_t, image_t,), 156 | eps=1e-3, atol=1e-3, rtol=1e-1) 157 | 158 | 159 | def quaternionFromMatrix(matrix): 160 | M = matrix 161 | m00 = M[0, 0] 162 | m01 = M[0, 1] 163 | m02 = M[0, 2] 164 | m10 = M[1, 0] 165 | m11 = M[1, 1] 166 | m12 = M[1, 2] 167 | m20 = M[2, 0] 168 | m21 = M[2, 1] 169 | m22 = M[2, 2] 170 | # symmetric matrix K 171 | K = np.array([[m00-m11-m22, 0.0, 0.0, 0.0], 172 | [m01+m10, m11-m00-m22, 0.0, 0.0], 173 | [m02+m20, m12+m21, m22-m00-m11, 0.0], 174 | [m21-m12, m02-m20, m10-m01, m00+m11+m22]]) 175 | K /= 3.0 176 | # quaternion is eigenvector of K that corresponds to largest eigenvalue 177 | w, V = np.linalg.eigh(K) 178 | q = V[[3, 0, 1, 2], np.argmax(w)] 179 | if q[0] < 0.0: 180 | np.negative(q, q) 181 | return [q[1], q[2], q[3], q[0]] 182 | 183 | 184 | def pointAt(pose, target): 185 | # Convention: +Z=out of camera, +Y=Down, +X=right 186 | z = target - pose 187 | z /= np.sqrt(np.sum(z**2)) 188 | y = np.array([0, -1, 0], dtype=np.float32) 189 | x = np.cross(y, z) 190 | x /= np.sqrt(np.sum(x**2)) 191 | y = np.cross(z, x) 192 | ret = quaternionFromMatrix(np.array([x, y, z]).transpose()) 193 | return ret 194 | 195 | 196 | if __name__ == '__main__': 197 | import argparse 198 | parser = argparse.ArgumentParser() 199 | parser.add_argument('--cpu', dest='cpu', action="store_true", default=True) 200 | parser.add_argument('--no-cpu', dest='cpu', action="store_false") 201 | 
parser.add_argument('--cuda', dest='cuda', 202 | action="store_true", default=True) 203 | parser.add_argument('--no-cuda', dest='cuda', action="store_false") 204 | args = parser.parse_args() 205 | test_imageprojection(cpu=args.cpu, cuda=args.cuda) 206 | -------------------------------------------------------------------------------- /tests/test_particlecollision.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | # Add path to python source to path. 4 | sys.path.append(os.path.join(os.path.dirname(os.path.dirname( 5 | os.path.abspath(__file__))), "python")) 6 | import SmoothParticleNets as spn 7 | 8 | import itertools 9 | import numpy as np 10 | import torch 11 | import torch.autograd 12 | 13 | from gradcheck import gradcheck 14 | try: 15 | import pytest_args 16 | except ImportError: 17 | print("Make sure to compile SmoothParticleNets before running tests.") 18 | raise 19 | 20 | 21 | def test_particlecollision(cpu=True, cuda=True): 22 | if cpu: 23 | print("Testing CPU implementation of ParticleCollision...") 24 | eval_particlecollision(cuda=False) 25 | print("CPU implementation passed!") 26 | print("") 27 | 28 | if cuda: 29 | if pytest_args.with_cuda: 30 | print("Testing CUDA implementation of ParticleCollision...") 31 | eval_particlecollision(cuda=True) 32 | print("CUDA implementation passed!") 33 | else: 34 | print("Not compiled with CUDA, skipping CUDA test.") 35 | 36 | def eval_particlecollision(cuda=False): 37 | BATCH_SIZE = 2 38 | N = 100 39 | M = 77 40 | NDIM = 2 41 | RADIUS = 0.2 42 | NCHANNELS = 2 43 | 44 | np.random.seed(0) 45 | 46 | locs = np.random.rand(BATCH_SIZE, N, NDIM).astype(np.float32) 47 | qlocs = np.random.rand(BATCH_SIZE, M, NDIM).astype(np.float32) 48 | data = np.random.rand(BATCH_SIZE, N, NCHANNELS).astype(np.float32) 49 | 50 | gt_neighbors = np.ones((BATCH_SIZE, M, N), dtype=int)*-1 51 | for b in range(BATCH_SIZE): 52 | for i in range(M): 53 | for j in range(N): 54 | d = np.square(qlocs[b, i, :] - locs[b, j, :]).sum() 55 | if d <= RADIUS*RADIUS: 56 | nc = min(np.where(gt_neighbors[b, i, :] < 0)[0]) 57 | gt_neighbors[b, i, nc] = j 58 | 59 | def use_cuda(x): 60 | if cuda: 61 | return x.cuda() 62 | else: 63 | return x 64 | def undo_cuda(x): 65 | if cuda: 66 | return x.cpu() 67 | else: 68 | return x 69 | 70 | olocs = locs 71 | oqlocs = qlocs 72 | odata = data 73 | locs = torch.autograd.Variable(use_cuda(torch.FloatTensor(locs.copy())), 74 | requires_grad=False) 75 | qlocs = torch.autograd.Variable(use_cuda(torch.FloatTensor(qlocs.copy())), 76 | requires_grad=False) 77 | data = torch.autograd.Variable(use_cuda(torch.FloatTensor(data.copy())), 78 | requires_grad=False) 79 | 80 | coll = spn.ParticleCollision(NDIM, RADIUS, max_collisions=N) 81 | convsp = use_cuda(coll) 82 | 83 | vlocs, vdata, vidxs, vneighbors = coll(locs, data, qlocs) 84 | 85 | idxs = undo_cuda(vidxs).data.numpy().astype(int) 86 | neighbors = undo_cuda(vneighbors).data.numpy().astype(int) 87 | nlocs = undo_cuda(vlocs).data.numpy() 88 | ndata = undo_cuda(vdata).data.numpy() 89 | 90 | # First make sure all the indexes are in idxs. 91 | for b in range(BATCH_SIZE): 92 | for i in range(N): 93 | assert i in idxs[b, :] 94 | 95 | # Next make sure locs and data are in the order idxs says they're in. 96 | for b in range(BATCH_SIZE): 97 | for i, j in enumerate(idxs[b, :]): 98 | assert all(olocs[b, j, :] == nlocs[b, i, :]) 99 | assert all(odata[b, j, :] == ndata[b, i, :]) 100 | 101 | # Make sure the input locs and data weren't altered. 
102 | assert np.all(undo_cuda(locs).data.numpy() == olocs) 103 | assert np.all(undo_cuda(data).data.numpy() == odata) 104 | 105 | # Check the neighbor list. 106 | for b in range(BATCH_SIZE): 107 | for i in range(M): 108 | for j in neighbors[b, i, :]: 109 | if j < 0: 110 | break 111 | assert idxs[b, j] in gt_neighbors[b, i, :] 112 | for j in gt_neighbors[b, i, :]: 113 | if j < 0: 114 | break 115 | jj = np.where(idxs[b, :] == j)[0][0] 116 | assert jj in neighbors[b, i, :] 117 | 118 | # Finally put the locations and data back in their original order. 119 | reorder = use_cuda(spn.ReorderData(reverse=True)) 120 | vlocs, vdata = reorder(vidxs, vlocs, vdata) 121 | assert np.all(undo_cuda(vlocs).data.numpy() == olocs) 122 | assert np.all(undo_cuda(vdata).data.numpy() == odata) 123 | 124 | # Test gradients. 125 | def func(l, d, q): 126 | return coll(l, d, q)[:2] 127 | assert gradcheck(func, (locs, data, qlocs), eps=1e-2, atol=1e-3) 128 | 129 | 130 | 131 | if __name__ == '__main__': 132 | import argparse 133 | parser = argparse.ArgumentParser() 134 | parser.add_argument('--cpu', dest='cpu', action="store_true", default=True) 135 | parser.add_argument('--no-cpu', dest='cpu', action="store_false") 136 | parser.add_argument('--cuda', dest='cuda', action="store_true", default=True) 137 | parser.add_argument('--no-cuda', dest='cuda', action="store_false") 138 | args = parser.parse_args() 139 | test_particlecollision(cpu=args.cpu, cuda=args.cuda) --------------------------------------------------------------------------------
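For reference, the neighbor-search round trip exercised by test_particlecollision.py above can be summarized with the sketch below. The constants mirror the test; it assumes the package has been built and installed via setup.py:

    import torch
    import SmoothParticleNets as spn

    batch_size, n, ndim, nchannels = 2, 100, 2, 2
    locs = torch.rand(batch_size, n, ndim)
    data = torch.rand(batch_size, n, nchannels)

    # Hash-grid neighbor search: returns the particles reordered by grid cell,
    # the permutation that was applied (idxs), and a padded neighbor list.
    coll = spn.ParticleCollision(ndim, 0.2, max_collisions=n)
    plocs, pdata, idxs, neighbors = coll(locs, data, None)

    # ... layers such as ConvSP consume (plocs, pdata, neighbors) here ...

    # Undo the reordering so results line up with the original particle order
    # (the test above asserts exact equality after the reverse reorder).
    reorder = spn.ReorderData(reverse=True)
    rlocs, rdata = reorder(idxs, plocs, pdata)
    assert torch.equal(rlocs, locs) and torch.equal(rdata, data)

As with the other tests, the script can also be run directly, e.g. python tests/test_particlecollision.py --no-cuda, once setup.py has generated tests/pytest_args.py.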